I tried to calculate the performance of some low-level assembly code produced by the compiler. However, at some moment I got strange results which I don't understand. So here is my code:
#include <iostream>
#include <chrono>
#include <vector>
//---------------------------------------------------------------------------------------
int main()
{
std::srand(time(NULL));
constexpr unsigned vectorSize = 1000u;
constexpr unsigned loopCount = 1000000u;
std::vector<int> vec1(vectorSize);
std::vector<int> vec2(vectorSize);
for (unsigned i = 0u; i < vectorSize; ++i)
{
vec1[i] = std::rand();
}
for (unsigned i = 0u; i < vectorSize; ++i)
{
vec2[i] = std::rand();
}
std::chrono::time_point<std::chrono::high_resolution_clock> start;
std::chrono::time_point<std::chrono::high_resolution_clock> end;
long long ms;
//---------------------------------------------------------------------------------------
start = std::chrono::high_resolution_clock::now();
for (unsigned j = 0u; j < loopCount; ++j)
{
for (unsigned i = 0u; i < vec2.size(); ++i)
{
vec2[i] = vec1[i];
}
}
end = std::chrono::high_resolution_clock::now();
ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "Evaluation took: " << ms << " ms" << std::endl;
//---------------------------------------------------------------------------------------
start = std::chrono::high_resolution_clock::now();
for (unsigned j = 0u; j < loopCount; ++j)
{
for (unsigned i = 0u; i < vec2.size(); ++i)
{
vec2[i] = vec1[i];
}
}
end = std::chrono::high_resolution_clock::now();
ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "Evaluation took: " << ms << " ms" << std::endl;
//---------------------------------------------------------------------------------------
start = std::chrono::high_resolution_clock::now();
for (unsigned j = 0u; j < loopCount; ++j)
{
for (unsigned i = 0u; i < vec2.size(); ++i)
{
vec2[i] = vec1[i];
}
}
end = std::chrono::high_resolution_clock::now();
ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "Evaluation took: " << ms << " ms" << std::endl;
//---------------------------------------------------------------------------------------
start = std::chrono::high_resolution_clock::now();
for (unsigned j = 0u; j < loopCount; ++j)
{
for (unsigned i = 0u; i < vec2.size(); ++i)
{
vec2[i] = vec1[i];
}
}
end = std::chrono::high_resolution_clock::now();
ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "Evaluation took: " << ms << " ms" << std::endl;
//---------------------------------------------------------------------------------------
start = std::chrono::high_resolution_clock::now();
for (unsigned j = 0u; j < loopCount; ++j)
{
for (unsigned i = 0u; i < vec2.size(); ++i)
{
vec2[i] = vec1[i];
}
}
end = std::chrono::high_resolution_clock::now();
ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "Evaluation took: " << ms << " ms" << std::endl;
//---------------------------------------------------------------------------------------
std::cout << "Press enter to exit..." << std::endl;
std::cin.get();
return 0;
}
Main "work" in this program is done in the loop where elements of vector vec1 are simply assigned to vector vec2. The inner for loop condition is intended to don't allow compiler for SIMD optimization. I repeated the code 5 times by copying it, so I expected to get more or less the same time measurement for every single copy. However, results are different:
Evaluation took: 425 ms
Evaluation took: 694 ms
Evaluation took: 462 ms
Evaluation took: 441 ms
Evaluation took: 710 ms
Press enter to exit...
That's how it looks after building with Visual Studio 2015 in Release mode. Surprisingly, with GCC calculated times are very similar, but with Visual I got this strange pattern every single time and it's always the same.
So I want to ask if my code is somehow corrupted, am I missing something? Or maybe this is just compiler strange behavior?
It's kind of important to get right time measurement in any kind of benchmarking so I don't want to make mistake right from the start.
Related
I usually return an object of std::vector or std::map as an incoming reference paremeter(as funcVec2 and funcMap2 below). But it is a bit inconvenient when writing codes. So I think if I can use return value under c++11(as funcVec1 and funcMap1 below) because it will call move constructor but not copy constructor, so it maybe still spend only one construct time and no deconstruct as the form of incoming reference paremeter.
But I write the codes below to verify it and it turns out that funcVec1 and funcMap1 takes more times then funcVec2 and funcMap2. So I am confused now why funcVec1 and funcMap1 takes so long?
#include <iostream>
#include <vector>
#include <map>
#include <chrono>
using namespace std;
vector<int> funcVec1() {
vector<int >vec;
for (int i = 0; i < 10; ++i) {
vec.push_back(i);
}
return vec;
}
void funcVec2(vector<int>&vec) {
for (int i = 0; i < 10; ++i) {
vec.push_back(i);
}
return;
}
map<int, int> funcMap1() {
map<int, int>tmpMap;
for (int i = 0; i < 10; ++i) {
tmpMap[i] = i;
}
return tmpMap;
}
void funcMap2(map<int, int>&tmpMap) {
for (int i = 0; i < 10; ++i) {
tmpMap[i] = i;
}
}
int main()
{
using namespace std::chrono;
system_clock::time_point t1 = system_clock::now();
for (int i = 0; i < 100000; ++i) {
vector<int> vec1 = funcVec1();
}
auto t2 = std::chrono::system_clock::now();
cout << "return vec takes " << (t2 - t1).count() << " tick count" << endl;
cout << duration_cast<milliseconds>(t2 - t1).count() << " milliseconds" << endl;
cout << " --------------------------------" << endl;
vector<int> vec2;
for (int i = 0; i < 100000; ++i) {
funcVec2(vec2);
}
auto t3 = system_clock::now();
cout << "reference vec takes " << (t3 - t2).count() << " tick count" << endl;
cout << duration_cast<milliseconds>(t3 - t2).count() << " milliseconds" << endl;
cout << " --------------------------------" << endl;
for (int i = 0; i < 100000; ++i) {
map<int, int> tmpMap1 = funcMap1();
}
auto t4 = system_clock::now();
cout << "return map takes " << (t4 - t3).count() << " tick count" << endl;
cout << duration_cast<milliseconds>(t4 - t3).count() << " milliseconds" << endl;
cout << " --------------------------------" << endl;
map<int, int>tmpMap2;
for (int i = 0; i < 100000; ++i) {
funcMap2(tmpMap2);
}
auto t5 = system_clock::now();
cout << "reference map takes " << (t5 - t4).count() << " tick count" << endl;
cout << duration_cast<milliseconds>(t5 - t4).count() << " milliseconds" << endl;
cout << " --------------------------------" << endl;
return 0;
}
you are not only meassuring the time for your operations, you also include the printouts. this is suboptimal.
you should measure performance in release mode. be aware that you are not doing anything usefull with your objects and the optimizer may throw away most of your code you wanted to measure.
the comparisons are not "fair". for example in your map1 case you are constructing an empty map, fill it (memory allocations happen here) and then you throw it away. in the map2 case you are reusing the identical map object over and over again. you are not allocating memory over and over again.
My program can't read all of the data from my MarvelIn.txt file.
It reads about 29 spaces in, and MarvelIn.txt contains only 9 entries, then I get a runtime error.
I think I have all the syntax right, however this is the only error I have. It will not output to the output file "MarvelOut.txt".
Here is the code:
#include <iostream>
#include <fstream>
#include <string>
#include <iomanip>
#include <stdio.h>
using namespace std;
struct sstruct
{
string first, last;
string department;
int salary;
};
void init2(sstruct s[50])
{
int maxarray = 50;
sstruct init = { "Darth", "Vader", "None", 0 };
for (int i = 0; i < maxarray; i++) {
s[i] = init;
cout << "init: " <<s[i].first<<endl;
}
}
void read(sstruct s[50], int &nums)
{
int maxarray = 50;
ifstream inf("MarvelIn.txt");
int i = 0;
while (!inf.eof())
{
inf >> s[i].first >> s[i].last >> s[i].department >> s[i].salary;
cout << "read: "<<s[i].first<<s[i].last << s[i].department <<
s[i].salary << endl;
i++;
}
nums = i;
}
void avg(sstruct s[50], int &nums, double &average2)
{
int maxarray = 50;
int i;
for (i = 0; i < nums; i++)
average2 += s[i].salary;
average2 /= nums;
}
void print(sstruct s[50], int nums, double &average2)
{
int maxarray = 50;
ofstream outf("MarvelOut.txt");
int i = 0;
string temp;
outf << "the number of professors is: " << nums << endl;
cout << "the number of professors is: " << nums << endl;
outf << endl << "The average salary of the professors is: " << average2 << endl;
outf << "Advisor " << "Major " << " Department " << "Salary " << endl;
for (i = 0; i < nums; i++)
{
temp= s[i].last + "," + s[i].first;
cout << "last, first " << temp << endl;
outf << left << setw(20) << temp << right << setw(5)<< s[i].department << setw(5) << s[i].salary << setw(8) << endl;
}
outf << endl << endl;
}
void swap(sstruct &a, sstruct &b)
{
sstruct temp;
temp=a;
a=b;
b=temp;
}
void bubbleSort(sstruct s[50], int &nums)
{
int maxarray = 50;
int i, j;
bool swapped;
for (i = 0; i < nums - 1; i++)
{
swapped = false;
for (j = 0; j < nums - i - 1; j++)
{
if (s[j].department > s[j + 1].department)
{
swap(s[j], s[j+1]);
swapped = true;
}
}
// IF no two elements were swapped by inner loop, then break
if (swapped == false)
break;
}
}
int main() {
int nums=0;
double average3=0.0;
const int maxarray = 50;
sstruct s[maxarray];
init2(s);
print(s, nums, average3);
read(s, nums);
cout << "numsfirst: " << nums << endl;
avg(s, nums, average3);
cout << "nums" << nums << endl;
bubbleSort(s,nums);
print(s, nums, average3);
system("pause");
return 0;
}
I'm trying to implement a timer class which prints the time needed for a given scope. Somehow I can't get it to work properly. My code so far:
Main.cpp:
#include "scopetimer.hpp"
#include <cstdlib>
#include <cmath>
#include <string>
#include <chrono>
#include <iostream>
void work01()
{
double numbers[10000];
for (int i = 0; i < 10000; ++i)
{
numbers[i] = double(std::rand()) / double(RAND_MAX);
}
for (int n = 10000; n > 1; n = n - 1) {
for (int i = 0; i < n - 1; i = i + 1) {
if (numbers[i] > numbers[i + 1]) {
double tmp = numbers[i];
numbers[i] = numbers[i + 1];
numbers[i + 1] = tmp;
}
}
}
}
void work02()
{
int* buf[1024];
for (int i = 2; i < 1024; ++i)
buf[i] = new int[i];
for (int i = 2; i < 1024; ++i)
delete[] buf[i];
}
// counts the number of primes in an interval
int work03(int n0, int n1)
{
int freq = n1 - n0 + 1;
for (int i = n0; i <= n1; ++i)
{
// Have fun: use the alternative iteration direction and see how fast
// it gets!
// for(int j = 2; j < i; ++j)
for (int j = i - 1; j > 1; --j)
{
if (i%j == 0)
{
--freq;
break;
}
}
}
return freq;
}
int main(int, char**)
{
{ ScopeTimer("work01");
work01();
}
{
ScopeTimer("work02");
work02();
}
{
ScopeTimer("work03");
work03(0, 10000);
}
std::cout << std::endl << "Tests" << std::endl << std::endl;
{
clock_t start_(std::clock());
work01();
clock_t end_(std::clock());
std::cout << "Test Timer: " << end_ - start_ << "ns" << std::endl;
}
{
clock_t start_(std::clock());
work02();
clock_t end_(std::clock());
std::cout << "Test Timer: " << end_ - start_ << "ns" << std::endl;
}
{
clock_t start_(std::clock());
work03(0,10000);
clock_t end_(std::clock());
std::cout << "Test Timer: " << end_ - start_ << "ns" << std::endl;
}
system("Pause");
}
scopetimer.cpp
#include "scopetimer.hpp"
#include <cmath>
#include <string>
#include <chrono>
#include <iostream>
ScopeTimer::ScopeTimer(const std::string& name)
:name_(name),
start_(std::clock()) {
}
ScopeTimer::~ScopeTimer() {
double elapsed = (double(std::clock() - start_) / double(CLOCKS_PER_SEC));
std::cout << name_ << ": " << int(elapsed) << "ns" << std::endl;
}
I tested the clock functions outside of ScopeTimer(), which works fine. So the only issues, as far as I can tell, is that I can't get ScopeTimer() to work. It always prints 0ns. I mostly followed the turorial: https://felix.abecassis.me/2011/09/cpp-timer-raii/
Kind regards
In ~ScopeTimer() you print how many complete seconds have passed not how many nanoseconds, while in the second part of main, you print the number of clock ticks, which may or may not be the same as a nanosecond.
I came across the same problem
The solution for me is to define a ScopeTimer instance instead of just call its constructor, I mean:
{
ScopeTimer _scopetimer("work01");
work01();
}
That should work
I guess compiler seems to ignore (optimize) that, 'cause when you just call ScopeTimer("work01").
I am trying to implement Parallel Multi-threaded Matrix multiplication in C++. The method i follow involves dividing Arrays into 4 sub-arrays and carry out parallel Multiplication using 4 threads on these 4 sub arrays.
I have written a C++ code but it is throwing error and terminates explicitly. Error :
"terminate called after throwing an instance of std::system_error
what():invalid Argument"
Here is my complete code. I am relatively new to C++ and multi-threading.
#include <iostream>
#include <thread>
#include <mutex>
#include <vector>
#include <algorithm>
#include <string>
#define N 4
using namespace std;
mutex mu;
void stage_1_multiply(int *a,int *b,int *d){
int *xij;
int *yij;
int *zij;
int COLS = N,ROWS = N;
cout<< " thread "<< this_thread::get_id() << " "<<endl;
for(int i = 0;i<(N/2);++i){
for(int j = 0;j < (N/2); j++){
for(int k = 0; k<(N/2);k++){
mu.lock();
xij = a + ((COLS * i) + k);
yij = b + ((COLS * k) + j);
zij = d + ((COLS * i) + j);
*zij += ( (*xij) * (*yij) );
mu.unlock();
}
}
}
}
int main(){
int A[4][4],B[4][4],C[4][4],D_1[4][4],D_2[4][4];
for(int i = 0;i<4;i++){
for(int j = 0;j<4;j++){
A[i][j] = i + 1;
B[i][j] = i + 1;
C[i][j] = 0;
D_1[i][j] = 0;
D_2[i][j] = 0;
}
}
for(int i = 0;i<4;i++){
for(int j = 0;j< 4;j++){
cout << A[i][j] << " ";
}
cout << endl;
}
for(int i = 0;i<4;i++){
for(int j = 0;j< 4;j++){
cout << B[i][j] << " ";
}
cout << endl;
}
vector< thread> threads(8);
int th = 0;
threads[th++] = thread(stage_1_multiply,&A[0][0],&B[0][0],&D_1[0][0]);
threads[th++] = thread(stage_1_multiply,&A[0][2],&B[2][0],&D_2[0][0]);
threads[th++] = thread(stage_1_multiply,&A[2][0],&B[0][2],&D_1[2][2]);
threads[th++] = thread(stage_1_multiply,&A[2][2],&B[2][2],&D_2[2][2]);
for( auto& t : threads){
t.join();
}
threads[th++] = thread(stage_1_multiply,&A[0][0],&B[0][2],&D_1[0][2]);
threads[th++] = thread(stage_1_multiply,&A[0][2],&B[2][2],&D_2[0][2]);
threads[th++] = thread(stage_1_multiply,&A[2][0],&B[0][0],&D_1[2][0]);
threads[th++] = thread(stage_1_multiply,&A[2][2],&B[2][0],&D_2[2][0]);
for( auto& t : threads){
t.join();
}
// code to add The Matrices D_1 and D_2 goes here.
for(int i = 0;i<4;i++){
for(int j = 0;j< 4;j++){
cout << D_1[i][j] << " ";
}
cout << endl;
}
cout << " Main Close "<<endl;
return 0;
}
What am doing wrong? is it anything related to parallel access of shared memory? If so how can i correct it?
PS: This is a homework Assignment.
I am trying to solve this problem, I think I have come up with a correct answer, but I am keep getting WA (wrong answer) response from the judge.
http://www.spoj.com/problems/FISHER/
The problem distilled, is, given a complete graph with a time and a toll associated with each edge, find a path from the first node to the last node within time constraint and minimize toll.
As with any problems, there are many ways to solve it. My idea is to extend the Floyd-Warshall algorithm to keep track of all non-dominated paths. At the end of the algorithm, we extract the path with minimal cost, and if there are multiple paths with the same cost, choose the one that spent least time.
Complexity aside, the bad thing is, wrong answer. I have no idea what is wrong. I have generated some random graphs and used a brute force solver (one that try all possible paths) and they matches exactly on small (i.e. less than 11 nodes) graphs. Without further ado, here is the code:
#include "stdafx.h"
// http://www.spoj.com/problems/FISHER/
// #define LOG
#include <iostream>
#include <vector>
#include <map>
#include <algorithm>
using namespace std;
int main()
{
while (true)
{
int num_cities;
int time_budget;
vector<vector<int> > distances;
vector<vector<int> > tolls;
cin >> num_cities;
cin >> time_budget;
if (num_cities == 0 && time_budget == 0)
{
break;
}
distances.resize(num_cities);
tolls.resize(num_cities);
for (int i = 0; i < num_cities; i++)
{
distances[i].resize(num_cities);
tolls[i].resize(num_cities);
}
for (int i = 0; i < num_cities; i++)
{
for (int j = 0; j < num_cities; j++)
{
int distance;
cin >> distance;
distances[i][j] = distance;
}
}
for (int i = 0; i < num_cities; i++)
{
for (int j = 0; j < num_cities; j++)
{
int toll;
cin >> toll;
tolls[i][j] = toll;
}
}
// Try Floyd Warshall
// Denote the set of shortest paths from i to j going through {0,1,...k - 1} be shortest_paths[i][j][k],
// It is a set of shortest paths because there can be multiple shortest paths with different time used.
// We should record if using longer time can lead to lower cost, or similarly higher cost but less time
// The first element in the pair is the cost, the second element in the pair is time used
vector<vector<vector<vector<pair<int, int> > > > > shortest_paths;
shortest_paths.resize(num_cities);
for (int i = 0; i < num_cities; i++)
{
shortest_paths[i].resize(num_cities);
for (int j = 0; j < num_cities; j++)
{
shortest_paths[i][j].resize(num_cities + 1);
}
}
// Initialization - there is only one path without going through any node
#ifdef LOG
cout << "k = " << 0 << endl;
cout << "<table border='1'>" << endl;
#endif
for (int i = 0; i < num_cities; i++)
{
#ifdef LOG
cout << "<tr>" << endl;
#endif
for (int j = 0; j < num_cities; j++)
{
#ifdef LOG
cout << "<td>(" << tolls[i][j] << ", " << distances[i][j] << ")</td>";
#endif
shortest_paths[i][j][0].push_back(pair<int, int>(tolls[i][j], distances[i][j]));
}
#ifdef LOG
cout << "</tr>" << endl;
#endif
}
#ifdef LOG
cout << "</table>" << endl;
#endif
// Iteration - the shortest path
for (int k = 1; k <= num_cities; k++)
{
#ifdef LOG
cout << "k = " << k << endl;
cout << "<table border='1'>" << endl;
#endif
for (int i = 0; i < num_cities; i++)
{
#ifdef LOG
cout << "<tr>";
#endif
for (int j = 0; j < num_cities; j++)
{
// Step 1: Generate all candidate shortest paths
map<pair<int, int>, bool> candidates;
for (vector<pair<int, int> >::iterator pi = shortest_paths[i][j][k - 1].begin(); pi != shortest_paths[i][j][k - 1].end(); pi++)
{
candidates.insert(pair<pair<int, int>, bool>(*pi, false));
}
for (vector<pair<int, int> >::iterator fi = shortest_paths[i][k - 1][k - 1].begin(); fi != shortest_paths[i][k - 1][k - 1].end(); fi++)
{
for (vector<pair<int, int> >::iterator si = shortest_paths[k - 1][j][k - 1].begin(); si != shortest_paths[k - 1][j][k - 1].end(); si++)
{
int first_path_cost = fi->first;
int first_path_time_used = fi->second;
int second_path_cost = si->first;
int second_path_time_used = si->second;
int new_path_cost = first_path_cost + second_path_cost;
int new_path_time_used = first_path_time_used + second_path_time_used;
if (new_path_time_used <= time_budget)
{
candidates.insert(pair<pair<int, int>, bool>(pair<int, int>(new_path_cost, new_path_time_used), false));
}
}
}
vector<pair<pair<int, int>, bool> > candidates_list;
for (map<pair<int,int>, bool>::iterator ci = candidates.begin(); ci != candidates.end(); ci++)
{
candidates_list.push_back(*ci);
}
// Eliminate the bad ones
for (unsigned int p = 0; p < candidates_list.size(); p++)
{
for (unsigned int q = 0; q < candidates_list.size(); q++)
{
if (p != q)
{
int first_path_cost = candidates_list[p].first.first;
int first_path_time_used = candidates_list[p].first.second;
int second_path_cost = candidates_list[q].first.first;
int second_path_time_used = candidates_list[q].first.second;
// First take less time and less cost than second, second is eliminated
if (first_path_time_used <= second_path_time_used && first_path_cost <= second_path_cost)
{
candidates_list[q].second = true;
}
}
}
}
#ifdef LOG
cout << "<td>";
#endif
for (unsigned int p = 0; p < candidates_list.size(); p++)
{
if (candidates_list[p].second == false)
{
#ifdef LOG
cout << "(" << candidates_list[p].first.first << ", " << candidates_list[p].first.second << ")<br>";
#endif
shortest_paths[i][j][k].push_back(candidates_list[p].first);
}
}
#ifdef LOG
cout << "</td>";
#endif
}
#ifdef LOG
cout << "</tr>" << endl;;
#endif
}
#ifdef LOG
cout << "</table>" << endl;
#endif
}
bool first = true;
int best_cost = -1;
int best_cost_time = -1;
for (vector<pair<int, int> >::iterator pi = shortest_paths[0][num_cities - 1][num_cities].begin(); pi != shortest_paths[0][num_cities - 1][num_cities].end(); pi++)
{
if (first)
{
best_cost = pi->first;
best_cost_time = pi->second;
first = false;
}
else
{
if (pi->first < best_cost)
{
best_cost = pi->first;
best_cost_time = pi->second;
}
if (pi->first == best_cost && pi->second < best_cost_time)
{
best_cost_time = pi->second;
}
}
}
cout << best_cost << " " << best_cost_time << endl;
}
return 0;
}
/*
4 7
0 5 2 3
5 0 2 3
3 1 0 2
3 3 2 0
0 2 2 7
2 0 1 2
2 2 0 5
7 2 5 0
0 0
*/
Turn on the LOG you will be able to see the Floyd Warshall table for each iteration, each cell has set of a (cost, time) pair. They are supposed to be the cost/time pairs of all non-dominated paths.
I would really appreciate if someone can tell me what's wrong. Thanks a lot in advance!
Try this test:
4 10
0 1 1 1000
1 0 1 1
1 1 0 1
1000 1 1 0
0 1 1 1
1 0 1 1
1 1 0 1
1 1 1 0
Basically you need to ensure distances[i][j] <= time_budget before
shortest_paths[i][j][0].push_back(pair<int, int>(tolls[i][j], distances[i][j]));