Switching from VC10 to VC11, I observe a performance drop of a factor of 10 when reading a file of double numbers:
#include <iostream>

int main() {
    double sum = 0, x;
    for (int i = 0; i < 1000000; i++) {
        std::cin >> x;
        sum += x;
    }
    std::cerr << sum << std::endl;
    return 0;
}
I built the executable in the Visual Studio IDE, letting the environment choose the default Release-mode options.
Can anybody confirm this?
What could be the problem? Might it be related to locale?
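One way to test the locale hypothesis (a sketch, untested; it assumes the input is redirected from a file of whitespace-separated doubles as in my runs) is to imbue the classic locale explicitly and disable C-stdio synchronization:

#include <iostream>
#include <locale>
#include <ctime>

int main() {
    std::ios_base::sync_with_stdio(false);  // rule out C-stdio synchronization costs
    std::cin.imbue(std::locale::classic()); // force the classic "C" locale for number parsing
    double sum = 0, x;
    clock_t start = clock();
    for (int i = 0; i < 1000000 && (std::cin >> x); i++)
        sum += x;
    std::cerr << sum << " in "
              << double(clock() - start) / CLOCKS_PER_SEC << " s" << std::endl;
    return 0;
}

If the factor-of-10 slowdown survives with the classic locale imbued and synchronization disabled, locale handling is probably not the whole story.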
thanks in advance,
andreas
*For some reason my previous answer was deleted (I do admit that the first sentence was a bit confusing, due to a clumsy edit made when I got better results).
Actually, for me the performance is about the same.
VC11 writing/reading 1M doubles -> 6.600/3.562 seconds
VC10 writing/reading 1M doubles -> 6.266/3.606 seconds
So in my experiment, reading doubles from a file in VC11 has approximately the same performance as in VC10.
Code sample:
#include <fstream>
#include <iostream>
#include <ctime>
#include <tchar.h>

int _tmain(int argc, _TCHAR* argv[])
{
    auto x = 0.0;
    auto numberofdoubles = 1000000;
    auto filename = "C:\\double.txt";
    {
        std::ofstream filestr(filename);
        auto starttime = clock();
        for (int i = 0; i < numberofdoubles; i++)
            filestr << (double)i << " ";
        auto endtime = clock();
        auto elapsed = (double)(endtime - starttime) / CLOCKS_PER_SEC;
        std::cout << "writing: " << elapsed << std::endl;
    }
    {
        std::ifstream filestr(filename);
        auto starttime = clock();
        for (int i = 0; i < numberofdoubles; i++)
            filestr >> x;
        auto endtime = clock();
        auto elapsed = (double)(endtime - starttime) / CLOCKS_PER_SEC;
        std::cout << "reading: " << elapsed << std::endl;
    }
    return 0;
}
I wrote a small test to figure out the fastest mathematical operation for a particular x. I wanted x to be entered by the user, so that I can run the tests for different values of x. With the following code, the compiler tells me that there is an error with std::cin >> val;
error: cannot bind 'std::istream {aka std::basic_istream}' lvalue to 'std::basic_istream&&'
If I declare val as double val instead of const double val, I get more errors. What can I change in order to have a running program?
#include <cmath>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <istream>
#include <ostream>

// for x^1.5
double test_pow_15(double x) { return std::pow(x, 1.5); }
double test_chain_15(double x) { return sqrt(x * x * x); }
double test_tmp_15(double x) { double tmp = x * x * x; return sqrt(tmp); }

volatile double sink;
const double val = 0;
const double ans_15 = std::pow(val, 1.5);

void do_test(const char* name, double(&fn)(double), const double ans) {
    auto start = std::chrono::high_resolution_clock::now();
    for (size_t n = 0; n < 1000 * 1000 * 10; ++n) {
        sink = val;
        sink = fn(sink);
    }
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> dur = end - start;
    std::cout << name << ". Took " << dur.count() << " ms, error: " << sink - ans << '\n';
}

int main()
{
    std::cout << "Speed test" << '\n';
    std::cout << "Please enter value for x." << '\n';
    std::cout << "x = ";
    std::cin >> val;
    std::cout << "Speed test starts for x = " << val << "." << '\n';
    std::cout << " " << '\n';
    std::cout << "For " << val << "^(1.5) the speed is:" << '\n';
    do_test("std::pow(x,1.5) ", test_pow_15, ans_15);
    do_test("sqrt(x*x*x) ", test_chain_15, ans_15);
    do_test("tmp = x*x*x; sqrt(tmp) ", test_tmp_15, ans_15);
    return 0;
}
I think if you remove the "const" keyword, it would probably work fine.
double val = 0;
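A minimal sketch of the corrected flow (my illustration, not the original assignment code): besides dropping const, note that the file-scope initializer of ans_15 runs before main reads any input, so the reference answer has to be recomputed after reading.

#include <cmath>
#include <iostream>

double val = 0;    // mutable, so std::cin >> val compiles
double ans_15 = 0; // filled in after val is read

int main() {
    std::cout << "x = ";
    std::cin >> val;
    ans_15 = std::pow(val, 1.5); // the old file-scope initializer ran before any input arrived
    std::cout << "Speed test starts for x = " << val << ".\n";
    // ... call do_test(...) as before ...
    return 0;
}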
I have encountered an exercise which asks which of the following two pieces of code is faster.
First code:
int sum = 0;
for (int i = 0; i < n; i++) {
    sum += array[i*n + thread_id];
}
Second code:
int sum = 0;
for (int i = 0; i < n; i++) {
    sum += array[n*thread_id + i];
}
I would try the code myself, but I will not have access to an NVIDIA GPU in the coming days.
I think that the first code takes advantage of memory coalescing (see here), while the second one would take advantage of caching.
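To make the two access patterns concrete, here is a small host-side sketch (my own illustration, no GPU needed) that prints which array elements four consecutive threads would touch on each iteration, for n = 4:

#include <iostream>

int main() {
    const int n = 4; // small n just to print the pattern
    for (int i = 0; i < n; i++) {
        std::cout << "i=" << i << "  first code:";
        for (int thread_id = 0; thread_id < 4; thread_id++)
            std::cout << " " << i*n + thread_id;   // neighbouring threads -> adjacent addresses
        std::cout << "   second code:";
        for (int thread_id = 0; thread_id < 4; thread_id++)
            std::cout << " " << n*thread_id + i;   // neighbouring threads -> addresses n apart
        std::cout << "\n";
    }
    return 0;
}

At i = 0 the first pattern touches 0 1 2 3 (contiguous, so a warp's loads can coalesce into one memory transaction), while the second touches 0 4 8 12 (strided). The per-thread contiguous row of the second pattern is what a CPU cache likes, but it is the wrong layout for a GPU warp.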
Many thanks to @RobertCrovella for clarifying the issues regarding memory coalescing. This is my attempt to benchmark the two codes, as asked for. It can be clearly noticed from the output (run on an NVS 5400M laptop GPU) that the first code is about twice as fast as the second one. This is because of the memory coalescing taking place in the first one (kernel1).
#include <cuda.h>
#include <ctime>
#include <iostream>
#include <stdio.h>
using namespace std;

#define BLOCK_SIZE 1024
#define GRID_SIZE 1024

// Error handling
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// launched as kernel1<<<GRID_SIZE,BLOCK_SIZE>>>(d_array, d_sum1, n)
// coalesced: on each iteration, consecutive threads read consecutive elements
__global__ void kernel1(int *array, long *sum, int n) {
    long result = 0;
    int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = 0; i < n; i++) {
        result += array[i*n + thread_id];
    }
    //__syncthreads();
    sum[thread_id] = result;
}

// strided: each thread reads its own contiguous row, n elements apart from its neighbour
__global__ void kernel2(int *array, long *sum, int n) {
    long result = 0;
    int thread_id = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = 0; i < n; i++) {
        result += array[n*thread_id + i];
    }
    __syncthreads();
    sum[thread_id] = result;
}

int main() {
    srand((unsigned)time(0));
    long *h_sum1, *d_sum1;
    long *h_sum2, *d_sum2;
    int n = 10;
    int size1 = n*BLOCK_SIZE*GRID_SIZE + n;
    int *h_array;
    h_array = new int[size1];
    h_sum1 = new long[size1];
    h_sum2 = new long[size1];

    // random number range
    int min = 1, max = 10;
    for (int i = 0; i < size1; i++) {
        h_array[i] = min + (rand() % static_cast<int>(max - min + 1));
        h_sum1[i] = 0;
        h_sum2[i] = 0;
    }

    int *d_array;
    gpuErrchk(cudaMalloc((void**)&d_array, size1*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_sum1, size1*sizeof(long)));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    gpuErrchk(cudaMemcpy(d_array, h_array, size1*sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_sum1, h_sum1, size1*sizeof(long), cudaMemcpyHostToDevice));

    cudaEventRecord(start);
    kernel1<<<GRID_SIZE,BLOCK_SIZE>>>(d_array, d_sum1, n);
    cudaEventRecord(stop);
    gpuErrchk(cudaMemcpy(h_sum1, d_sum1, size1*sizeof(long), cudaMemcpyDeviceToHost));
    float milliSeconds1 = 0;
    cudaEventElapsedTime(&milliSeconds1, start, stop);

    gpuErrchk(cudaMalloc((void**)&d_sum2, size1*sizeof(long)));
    gpuErrchk(cudaMemcpy(d_sum2, h_sum2, size1*sizeof(long), cudaMemcpyHostToDevice));

    cudaEventRecord(start);
    kernel2<<<GRID_SIZE,BLOCK_SIZE>>>(d_array, d_sum2, n);
    cudaEventRecord(stop);
    gpuErrchk(cudaMemcpy(h_sum2, d_sum2, size1*sizeof(long), cudaMemcpyDeviceToHost));
    float milliSeconds2 = 0;
    cudaEventElapsedTime(&milliSeconds2, start, stop);

    // verify the device sums against a host-side recomputation
    long result_device1 = 0, result_host1 = 0;
    long result_device2 = 0, result_host2 = 0;
    for (int i = 0; i < size1; i++) {
        result_device1 += h_sum1[i];
        result_device2 += h_sum2[i];
    }
    for (int thread_id = 0; thread_id < GRID_SIZE*BLOCK_SIZE; thread_id++)
        for (int i = 0; i < n; i++) {
            result_host1 += h_array[i*n + thread_id];
            result_host2 += h_array[n*thread_id + i];
        }

    cout << "Device result1 = " << result_device1 << endl;
    cout << "Host result1 = " << result_host1 << endl;
    cout << "Time1 (ms) = " << milliSeconds1 << endl;
    cout << "Device result2 = " << result_device2 << endl;
    cout << "Host result2 = " << result_host2 << endl;
    cout << "Time2 (ms) = " << milliSeconds2 << endl;

    gpuErrchk(cudaFree(d_array));
    gpuErrchk(cudaFree(d_sum1));
    gpuErrchk(cudaFree(d_sum2));
    return 0;
}
The CUDA event timer output is as follows:
Device result1 = 57659226
Host result1 = 57659226
Time1 (ms) = 5.21952
Device result2 = 57674257
Host result2 = 57674257
Time2 (ms) = 11.8356
I wrote the following for a class, but came across some strange behavior while testing it. arrayProcedure is meant to do things with an array based on the 2 "tweaks" at the top of the function (arrSize, and start). For the assignment, arrSize must be 10,000, and start, 100. Just for kicks, I decided to see what happens if I increase them, and for some reason, if arrSize exceeds around 60,000 (I haven't found the exact limit), the program immediately crashes with a stack overflow when using a debugger:
Unhandled exception at 0x008F6977 in TMA3Question1.exe: 0xC00000FD: Stack overflow (parameters: 0x00000000, 0x00A32000).
If I just run it without a debugger, I don't get any helpful errors; Windows hangs for a fraction of a second, then gives me the error TMA3Question1.exe has stopped working.
I decided to play around with debugging, but that didn't shed any light. I placed breakpoints above and below the call to arrayProcedure, as well as peppered them inside of it. When arrSize doesn't exceed 60,000 it runs fine: it pauses before calling arrayProcedure, properly stops at all the breakpoints inside of it, then pauses on the one below the call.
If I raise arrSize however, the break before the call happens, but it appears as though it never even steps into arrayProcedure; it immediately gives me a stack overflow without pausing at any of the internal breakpoints.
The only thing I can think of is that the resulting arrays exceed my computer's available memory, but that doesn't seem likely for a couple of reasons:
It should only use just under a megabyte:
sizeof(double) = 8 bytes
8 * 60000 = 480000 bytes per array
480000 * 2 = 960000 bytes for both arrays
As far as I know, arrays aren't immediately constructed when a function is entered; they're allocated on definition. I placed several breakpoints before the arrays are even declared, and they are never reached.
Any light that you could shed on this would be appreciated.
The code:
#include <iostream>
#include <ctime>
#include <climits>

// CLOCKS_PER_SEC is a macro supplied by ctime
double msBetween(clock_t startTime, clock_t endTime) {
    // parenthesized: convert ticks to milliseconds as ticks * 1000 / CLOCKS_PER_SEC
    return (endTime - startTime) * 1000.0 / CLOCKS_PER_SEC;
}

void initArr(double arr[], int start, int length, int step) {
    for (int i = 0, j = start; i < length; i++, j += step) {
        arr[i] = j;
    }
}

// The function we're going to inline in the next question
void helper(double a1, double a2) {
    std::cout << a1 << " * " << a2 << " = " << a1 * a2 << std::endl;
}

void arrayProcedure() {
    const int arrSize = 70000;
    const int start = 1000000;

    std::cout << "Checking..." << std::endl;
    if (arrSize > INT_MAX) {
        std::cout << "Given arrSize is too high and exceeds the INT_MAX of: " << INT_MAX << std::endl;
        return;
    }

    double arr1[arrSize];
    double arr2[arrSize];
    initArr(arr1, start, arrSize, 1);
    initArr(arr2, arrSize + start - 1, arrSize, -1);
    for (int i = 0; i < arrSize; i++) {
        helper(arr1[i], arr2[i]);
    }
}

int main(int argc, char* argv[]) {
    using namespace std;
    const clock_t startTime = clock();
    arrayProcedure();
    clock_t endTime = clock();
    cout << endTime << endl;

    double elapsedTime = msBetween(startTime, endTime);
    cout << "\n\n" << elapsedTime << " milliseconds. ("
         << elapsedTime / 60000 << " minutes)\n";
}
The default stack size is 1 MB with Visual Studio.
https://msdn.microsoft.com/en-us/library/tdkhxaks.aspx
Space for all of a function's locals is reserved in the function prologue (for frames this large MSVC emits a stack probe), which is why the overflow hits before any breakpoint inside arrayProcedure is reached. You can increase the stack size or use the new operator.
double *arr1 = new double[arrSize];
double *arr2 = new double[arrSize];
...
delete [] arr1;
delete [] arr2;
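Or, more idiomatically, a std::vector keeps the data on the heap and frees it automatically. A sketch of how arrayProcedure could allocate (assuming the same initArr and helper functions as in the question):

#include <vector>

void arrayProcedure() {
    const int arrSize = 70000;
    const int start = 1000000;
    std::vector<double> arr1(arrSize); // heap storage, freed automatically on scope exit
    std::vector<double> arr2(arrSize);
    initArr(arr1.data(), start, arrSize, 1);
    initArr(arr2.data(), arrSize + start - 1, arrSize, -1);
    for (int i = 0; i < arrSize; i++) {
        helper(arr1[i], arr2[i]);
    }
}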
#include <boost/intrusive_ptr.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include <memory>
#include <string>
#include <ctime>
#include <iostream>
#include <conio.h>
#include <tchar.h>
using namespace std;
using boost::intrusive_ptr;

class X {
public:
    std::string name;
    int age;
    long references;
    X(string n, int a) : references(0), name(n), age(a) {}
};

inline void intrusive_ptr_add_ref(X* x) {
    ++x->references;
}
inline void intrusive_ptr_release(X* x) {
    if (--x->references == 0)
        delete x;
}

int _tmain(int argc, _TCHAR* argv[])
{
    time_t t = clock();
    size_t rounds = 1000000;
    for (size_t i = 0; i < rounds; i++)
    {
        intrusive_ptr<X> myX(new X("Michael", 40));
        myX->age++;
    }
    cout << "Time taken to generate " << rounds << " of intrusive_ptr is "
         << clock() - t << endl;

    t = clock();
    for (size_t i = 0; i < rounds; i++)
    {
        boost::shared_ptr<X> myX(new X("Michael", 40));
        myX->age++;
    }
    cout << "Time taken to generate " << rounds << " of shared_ptr is "
         << clock() - t << endl;

    t = clock();
    for (size_t i = 0; i < rounds; i++)
    {
        std::shared_ptr<X> myX(new X("Michael", 40));
        myX->age++;
    }
    cout << "Time taken to generate " << rounds << " of Microsoft shared_ptr is "
         << clock() - t << endl;

    t = clock();
    for (size_t i = 0; i < rounds; i++)
    {
        boost::shared_ptr<X> myX = boost::make_shared<X>("Michael", 40);
        myX->age++;
    }
    cout << "Time taken to generate " << rounds << " of shared_ptr using make_shared is "
         << clock() - t << endl;

    t = clock();
    for (size_t i = 0; i < rounds; i++)
    {
        std::shared_ptr<X> myX = std::make_shared<X>("Michael", 40);
        myX->age++;
    }
    cout << "Time taken to generate " << rounds << " of Microsoft shared_ptr using make_shared is "
         << clock() - t << endl;

    _getche();
    return 0;
}
I got the results below using VS2010 in release mode.
Time taken to generate 1000000 of intrusive_ptr is 116
Time taken to generate 1000000 of shared_ptr is 175
Time taken to generate 1000000 of Microsoft shared_ptr is 182
Time taken to generate 1000000 of shared_ptr using make_shared is 176
Time taken to generate 1000000 of Microsoft shared_ptr using make_shared is 120
It seems intrusive_ptr is the fastest, but MS also does well with shared_ptr when using the make_shared function. But why is Boost's make_shared not performing as well as the MS version? Has anybody done a similar test? Is anything wrong with my test, or is there something I didn't consider?
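For reference, the allocation difference the test is probing looks like this (a sketch illustrating general shared_ptr/make_shared behavior, not a verified explanation of the VS2010 numbers above): constructing from a raw pointer costs two allocations, while make_shared typically fuses the object and the reference-count control block into one, so the remaining gap presumably comes down to how each library implements that fused allocation.

#include <boost/make_shared.hpp>
#include <boost/shared_ptr.hpp>
#include <memory>

struct Y { int v; explicit Y(int v) : v(v) {} };

int main() {
    boost::shared_ptr<Y> a(new Y(1));                  // two allocations: Y, then the control block
    std::shared_ptr<Y>   b(new Y(2));                  // likewise
    boost::shared_ptr<Y> c = boost::make_shared<Y>(3); // one fused allocation for both
    std::shared_ptr<Y>   d = std::make_shared<Y>(4);   // likewise
    return 0;
}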
I build on Ubuntu 12.04 with the command
g++ -pthread hello.cpp
but the parallel mode always runs slower than the normal one. Here's my code:
#include <iostream>
#include <pthread.h>
#include <math.h>
#include <ctime>
using namespace std;

#define NUM_THREADS 4
#define MAX_NUMBER 10000000

void *doSomething(void *param)
{
    int id = (int) param;
    int sum = 0;
    for (int i = 0; i < MAX_NUMBER; i++)
    {
        sum += sin(i) + cos(i) + tan(i); // sum
    }
    return NULL;
}

void runNormal()
{
    // run in normal mode, calling doSomething NUM_THREADS times sequentially
    for (int i = 0; i < NUM_THREADS; i++)
    {
        doSomething((void *) i);
    }
}

void runParallel()
{
    pthread_t threads[NUM_THREADS];
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

    int rc, i;
    for (i = 0; i < NUM_THREADS; i++)
    {
        rc = pthread_create(&threads[i], &attr, doSomething, (void *) i);
        if (rc)
        {
            cout << "ERROR : can't create thread #" << i;
        }
    }
    pthread_attr_destroy(&attr);

    void *status;
    for (i = 0; i < NUM_THREADS; i++)
    {
        pthread_join(threads[i], &status);
    }
}

int main()
{
    int type;
    cout << "Choose type of run (1 - normal, 2 - parallel) : ";
    cin >> type;

    clock_t init, final;
    init = clock();
    if (type == 1)
    {
        runNormal();
    }
    else if (type == 2)
    {
        runParallel();
    }
    else
    {
        cout << "Your choice is wrong.";
    }
    final = clock();

    double duration = (double) (final - init) / CLOCKS_PER_SEC;
    cout << "Duration : " << duration << " seconds." << endl;

    pthread_exit(NULL);
    return 0;
}
I run with 4 threads because my laptop has 4 cores. In System Monitor I could see that it used all 4 cores concurrently in parallel mode and only 1 core in normal mode, yet the duration of the normal mode is shorter.
Please see the answer https://stackoverflow.com/a/2962914/1689451 for clarification of how clock() works in multithreaded applications: it returns the CPU time consumed by all threads of the process combined, so four threads doing the same work report roughly the same total as one thread doing it four times, even though the wall-clock time drops.
Try measuring wall-clock time instead, like this:
struct timespec start, finish;
double elapsed;

clock_gettime(CLOCK_MONOTONIC, &start);
if (type == 1)
{
    runNormal();
}
else if (type == 2)
{
    runParallel();
}
else
{
    cout << "Your choice is wrong.";
}
clock_gettime(CLOCK_MONOTONIC, &finish);

elapsed = (finish.tv_sec - start.tv_sec);
elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
cout << " Duration : " << elapsed << " seconds." << endl;
And for completeness, I built it like this (filename par.cpp):
make CXXFLAGS="-pthread -O3 -lrt" LDLIBS=-lrt -B par && ./par