I came across this behavior of speed up and I am finding it hard to explain. Following is the background:
Program
Invocation of Gaussian Elimination method to solve linear equation within a loop to parallelize the work load across compute units. We use an augmented matrix of dimension (M by M+1) where one additional column holds the RHS
HPC Setup - Cray XC50 node with Intel Xeon 6148 Gold with the following configuration
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
node 0 size: 95325 MB
node 0 free: 93811 MB
node 1 cpus: 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
node 1 size: 96760 MB
node 1 free: 96374 MB
node distances:
node 0 1
0: 10 21
1: 21 10
Although not the actual HPC, but the block diagram and the related explanation seems to fully apply (https://www.nas.nasa.gov/hecc/support/kb/skylake-processors_550.html). Specifically sub NUMA clustering seems to be disabled.
Jobs are submitted through ALPS (aprun) as follows
time aprun -n 1 -d 20 -j 1 -ss -cc 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 -e N=4000 -e M=200 -e MODE=2 ./gem
time aprun -n 1 -d 20 -j 1 -ss -cc 0,1,2,3,4,5,6,7,8,9,20,21,22,23,24,25,26,27,28,29 -e N=4000 -e M=200 -e MODE=2 ./gem
time aprun -n 1 -d 20 -j 1 -ss -cc 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 -e N=4000 -e M=200 -e MODE=2 ./gem
time aprun -n 1 -d 20 -j 1 -ss -cc 0,1,2,3,4,5,6,7,8,9,30,31,32,33,34,35,36,37,38,39 -e N=4000 -e M=200 -e MODE=2 ./gem
time aprun -n 1 -d 20 -j 1 -ss -cc 40,41,42,43,44,45,46,47,48,49,60,61,62,63,64,65,66,67,68,69 -e N=4000 -e M=200 -e MODE=2 ./gem
In the above, N indicates the number of matrices and M represents the dimension of each matrix. These are passed as environment variables to the program and used internally. MODE can be ignored for this discussion
cc list specifically lists the CPUs to bind with. OMP_NUM_THREADS is set to 20. The intent is to use 20 threads across 20 compute units.
Time to run sequentially and parallel is recorded within the program using omp_get_wtime() and the results are the following
CPU Binding
Objective
Speed Up
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Load work across 20 physical cores on socket 0
13.081944
0,1,2,3,4,5,6,7,8,9,20,21,22,23,24,25,26,27,28,29
Spread across first 10 physical cores on socket 0 & socket 1
18.332559
10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
Spread across 2nd set of 10 physical cores on socket 0 & first 10 of socket 1
18.636265
40,41,42,43,44,45,46,47,48,49,60,61,62,63,64,65,66,67,68,69
Spread across virtual cores across sockets(40-0, 60-21)
15.922209
Why is the speed up less for the first case when all physical nodes on socket 0 are being used ? The understanding here is that when tasks are spread across sockets, UPI comes into effect and it should be slower whereas it seems to be exactly the opposite. Also what can possibly explain the last scenario when virtual cores are being used.
Note: We have tried multiple iterations and the results for the above combinations are pretty consistent.
Edit1:
Edit2: Source code
#define _GNU_SOURCE
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include "sched.h"
#include "omp.h"
/*
 * Return a pseudo-random double obtained by scaling one rand_r() draw from
 * [0, RAND_MAX] onto the interval [low, high].
 *
 * seed is the caller-owned rand_r() state and is advanced by one step.
 */
double drand(double low, double high, unsigned int *seed)
{
    const double span = high - low;
    const double draw = (double)rand_r(seed);

    return draw * span / (double)RAND_MAX + low;
}
/* If environment variable `name` is set, store atoi() of its value in *slot. */
static void override_from_env(const char *name, int *slot)
{
    const char *text = getenv(name);
    if (text != NULL) {
        *slot = atoi(text);
    }
}

/*
 * Override the run parameters from the environment.
 *
 * N, M and MODE are looked up in that order; each output is only written
 * when the corresponding variable exists, otherwise it keeps its value.
 */
void init_vars(int *N, int *M, int *mode)
{
    override_from_env("N", N);
    override_from_env("M", M);
    override_from_env("MODE", mode);
}
/*
 * Dump an augmented matrix to stdout: M rows of M + 1 values each, every
 * value followed by a space, one row per line, then an empty line.
 */
void print_matrix(double *instance, int M)
{
    const int width = M + 1; /* M coefficients plus the right-hand side */

    for (int r = 0; r < M; ++r) {
        const double *cells = instance + r * width;
        for (int c = 0; c < width; ++c) {
            printf("%lf ", cells[c]);
        }
        putchar('\n');
    }
    putchar('\n');
}
/* Exchange the two doubles that a and b point to (a == b is harmless). */
void swap(double *a, double *b)
{
    const double first = *a;
    const double second = *b;

    *a = second;
    *b = first;
}
/*
 * Fill an M x (M + 1) row-major augmented matrix with values drawn by
 * drand(-1.0, 1.0, ...).
 *
 * The generator is seeded from the calling OpenMP thread's number, so the
 * sequence depends only on which thread runs the call.
 */
void init_matrix(double *instance, unsigned int M)
{
    unsigned int seed = 45613 + 19 * omp_get_thread_num();
    unsigned int cell = 0; /* running row-major offset: row * (M + 1) + column */

    for (unsigned int r = 0; r < M; r++) {
        for (unsigned int c = 0; c <= M; c++) {
            instance[cell++] = drand(-1.0, 1.0, &seed);
        }
    }
}
/*
 * Build one random M x (M + 1) augmented matrix, reduce it to diagonal form
 * with Gauss-Jordan elimination, leave the solution in the last column, and
 * free the matrix again. Nothing is returned: the function exists purely as
 * a unit of work for the timing loops.
 *
 * Does nothing when M is not positive, and reports to stderr and returns when
 * the matrix cannot be allocated.
 */
void initialize_and_solve(int M)
{
    /* M comes from argv/environment through atoi(); a value <= 0 would be
     * converted to a huge unsigned dimension inside init_matrix(). */
    if (M <= 0) {
        return;
    }

    /* Row length including the right-hand-side column. Using size_t keeps the
     * index arithmetic out of int range problems for large M. */
    const size_t stride = (size_t)M + 1;

    double *instance = (double *)malloc((size_t)M * stride * sizeof(double));
    if (instance == NULL) {
        fprintf(stderr, "initialize_and_solve: cannot allocate matrix for M=%d\n", M);
        return;
    }

    // Initialise the matrix
    init_matrix(instance, M);

    // Performing elementary operations
    int singular = 0;
    for (int i = 0; i < M; i++) {
        if (instance[i * stride + i] == 0) {
            /* Zero pivot: look further down column i for a row to swap in. */
            int c = 1;
            while ((i + c) < M && instance[(i + c) * stride + i] == 0)
                c++;
            if ((i + c) == M) {
                /* The whole remaining column is zero: no unique solution. */
                singular = 1;
                break;
            }
            for (int k = 0; k <= M; k++) {
                swap(&instance[i * stride + k], &instance[(i + c) * stride + k]);
            }
        }
        for (int j = 0; j < M; j++) {
            // Excluding all i == j
            if (i != j) {
                // Converting Matrix to reduced row
                // echelon form(diagonal matrix)
                double pro = instance[j * stride + i] / instance[i * stride + i];
                for (int k = 0; k <= M; k++)
                    instance[j * stride + k] -= (instance[i * stride + k]) * pro;
            }
        }
    }

    // Get the solution in the last column
    if (!singular) {
        /* Skipped for a singular system, where a diagonal entry is zero. */
        for (int i = 0; i < M; i++) {
            instance[i * stride + M] /= instance[i * stride + i];
        }
    }

    free(instance);
}
/*
 * Run initialize_and_solve(M) N times one after another and return the
 * elapsed wall-clock time as measured by omp_get_wtime().
 */
double solve_serial(int N, int M)
{
    const double started = omp_get_wtime();

    for (int left = N; left > 0; left--) {
        initialize_and_solve(M);
    }

    const double finished = omp_get_wtime();
    return finished - started;
}
/*
 * Run initialize_and_solve(M) N times with the iterations shared among the
 * OpenMP team, and return the elapsed wall-clock time from omp_get_wtime().
 */
double solve_parallel(int N, int M)
{
    const double started = omp_get_wtime();

#pragma omp parallel for
    for (int job = 0; job < N; job++) {
        initialize_and_solve(M);
    }

    const double finished = omp_get_wtime();
    return finished - started;
}
/*
 * Benchmark driver.
 *
 * Parameters default to N=200, M=200, mode=2. Exactly three command-line
 * arguments (N M mode) replace the defaults, and the environment variables
 * N, M and MODE (read by init_vars) take precedence over both.
 *
 * mode 0: time the serial run only; mode 1: time the parallel run only;
 * any other mode: time both and also print serial / parallel.
 *
 * Returns EXIT_FAILURE when N or M is not positive.
 */
int main(int argc, char **argv)
{
    // Default parameters
    int N = 200, M = 200, mode = 2;
    if (argc == 4) {
        N = atoi(argv[1]);
        M = atoi(argv[2]);
        mode = atoi(argv[3]);
    }
    init_vars(&N, &M, &mode);

    /* atoi() yields 0 for garbage and accepts negative numbers; neither
     * describes a workload that can be timed. */
    if (N < 1 || M < 1) {
        fprintf(stderr, "N and M must be positive (got N=%d, M=%d)\n", N, M);
        return EXIT_FAILURE;
    }

    if (mode == 0) {
        // Serial only
        double serial = solve_serial(N, M);
        printf("Time, %d, %d, %lf\n", N, M, serial);
    } else if (mode == 1) {
        // Parallel only
        double parallel = solve_parallel(N, M);
        printf("Time, %d, %d, %lf\n", N, M, parallel);
    } else {
        // Both serial and parallel
        // Solve using GEM (serial)
        double serial = solve_serial(N, M);
        // Solve using GEM (parallel)
        double parallel = solve_parallel(N, M);
        // Last column is the speed-up
        printf("Time, %d, %d, %lf, %lf, %lf\n", N, M, serial, parallel, serial / parallel);
    }
    return 0;
}
Edit3: Rephrased the first point to clarify what is actually being done ( based on feedback in comment )
You say you implement a "Simple implementation of Gaussian Elimination". Sorry, there is no such thing. There are multiple different algorithms and they all come with their own analysis. But let's assume you use the textbook one. Even then, Gaussian Elimination is not simple.
First of all, you haven't stated that you initialized your data in parallel. If you don't do that, all the data will wind up on socket 0 and you will get bad performance, never mind the speedup. But let's assume you did the right thing here. (If not, google "first touch".)
In the GE algorithm, each of the sequential k iterations works on a smaller and smaller subset of the data. This means that no simple mapping of data to cores is possible. If you place your data in such a way that initially each core works on local data, this will quickly no longer be the case.
In fact, after half the number of iterations, half your cores will be pulling data from the other socket, leading to NUMA coherence delays. Maybe a spread binding is better here than your compact binding.
Why is the speed up less for the first case when all physical nodes on socket 0 are being used ?
Results often depend on the application, but some patterns regularly happen. My guess is that your application heavily uses the main RAM, and using 2 sockets results in more DDR4 RAM blocks being used than with only one. Indeed, with local NUMA-node allocations, 1 socket can access the RAM at 128 GB/s while 2 sockets can access the RAM at 256 GB/s. With an unbalanced use of DDR4 RAM blocks, the performance will be far worse and bounded by UPI (I do not expect 2 sockets to be much slower because of the full-duplex data transfer).
The understanding here is that when tasks are spread across sockets, UPI comes into effect and it should be slower whereas it seems to be exactly the opposite.
UPI is only a bottleneck if data are massively transferred between the two sockets, but good NUMA applications should not do that because they should operate on their own NUMA-node memory.
You can check the use of the UPI and RAM throughput using hardware counters.
Also what can possibly explain the last scenario when virtual cores are being used.
I do not have an explanation for this. Note higher IDs are the second hyperthreads of each core so it is certainly related to a low-level behaviour of the hyperthreading (maybe some processes are bound to some PU causing pre-emption the target PUs or simply the second PU have somehow a lower priority). Note also that physical core IDs and logical PU IDs are often not mapped the same way so if you use the wrong one you could end up binding 2 threads to the same core. I advise you to use hwloc to check that.
I work in a logistic department for a company, recently we have been trying to narrow down the amount of different packaging options that we use.
I have all the necessary product data like length, width, height, volume and also sales data.
So I was thinking if it is possible to use an algorithm to cluster the different volumes of the products and maybe also take into account which sizes are selling the most, to determine, which box sizes would be ideal.
(Taking into account how often a product sells is secondary so that is not absolutely necessary)
What I want is that I can give the Algorithm an amount of how many different boxsizes I want and the algorithm should determine where to put the limits, so that there is a solution for every product that we have. With the goal of the optimization being minimum volume wasted while also not using more than the set amount of different boxes.
Also important to note: the orientation of the products and the amount per box are fixed, so there is no need to determine how to pack the products, how many ideally go into one box, or anything like that.
What kind of algorithms could be used for a problem like this and what are my options to program them? I was thinking of using Matlab, but would also be open for other possible options. I want to program it, not simply use an existing program like SPSS.
Thanks in advance and forgive me if my english is not the best, I'm not a native speaker.
The following C++ program will find optimal solutions for small instances. For 10 input box sizes, each having dimensions randomly chosen in the range 1..100, and for any number 1..10 of box sizes to choose, it computes the answer in a couple of seconds on my computer. For 15 input box sizes, it takes around 10s. For 20 input box sizes, I could compute up to 4 chosen box sizes in about 3 minutes, with memory becoming an issue (it used around 3GB). I had to increase the linker's default stack size to avoid stack overflows.
#include <algorithm>
#include <array>
#include <climits>
#include <functional>
#include <iostream>
#include <map>
#include <set>
#include <tuple>
#include <vector>
using namespace std;
// Streams a box size as "(a, b, c)".
ostream& operator<<(ostream& os, array<int, 3> a) {
    os << '(';
    for (int i = 0; i < 3; ++i) {
        if (i != 0) os << ", ";
        os << a[i];
    }
    os << ')';
    return os;
}
// Volume spanned by the first three entries of b, computed in long long so
// that the product of three ints cannot overflow int.
//
// The extent is a size_t parameter because that is the type of std::array's
// own extent; with an int parameter a conforming compiler refuses to deduce N
// from an array argument.
template <size_t N>
long long vol(array<int, N> b) {
    static_assert(N >= 3, "vol() needs at least three dimensions");
    long long const length = b[0];
    return length * b[1] * b[2];
}
// True when a fits into b without rotation: each of a's first three entries
// is no larger than the entry of b at the same position. Any further entries
// (e.g. a request count in slot 3) are ignored.
//
// The extents are size_t parameters to match std::array's own extent type;
// with int parameters a conforming compiler cannot deduce them.
template <size_t N, size_t M>
bool fits(array<int, N> a, array<int, M> b) {
    static_assert(N >= 3 && M >= 3, "fits() compares three dimensions");
    for (size_t axis = 0; axis < 3; ++axis) {
        if (a[axis] > b[axis]) return false;
    }
    return true;
}
// Compares first by volume, then lexicographically.
struct CompareByVolumeDesc {
bool operator()(array<int, 3> a, array<int, 3> b) const {
return vol(a) > vol(b) || vol(a) == vol(b) && a < b;
}
};
// Candidate box sizes shared by main(), solve() and printOptimalSolution().
// main() fills it with every (length, width, height) combination of the
// requested dimensions, each stored with its sides in descending order, then
// sorts it with CompareByVolumeDesc and removes duplicates. solve() refers to
// "the first n candidates" and reads the n-th one as candSizes[n - 1].
vector<array<int, 3>> candSizes;
// One sub-problem of the memoised search in solve().
struct State {
    vector<array<int, 4>> req;  // requests still to be boxed: three sides plus a count in slot 3
    int n;                      // only the first n entries of candSizes may be used
    int k;                      // number of box sizes that may still be chosen
    // Needed for map<>. Orders by n, then k, then req. tie() compares through
    // references, so the request vectors are not copied on every comparison
    // the way make_tuple() would copy them.
    bool operator<(State const& other) const {
        return tie(n, k, req) < tie(other.n, other.k, other.req);
    }
} dummy = { {}, -1, -1 };  // placeholder successor returned by solve()'s base cases

// Cache of solved states: best volume plus the state to continue from.
// The volume must be long long like solve()'s result; with an int here the
// LLONG_MAX / 4 "impossible" marker and any volume above INT_MAX were
// truncated when stored, so a cached answer differed from the computed one.
map<State, pair<long long, State>> memo;
// Minimum total volume needed to box every request in s.req when only the
// first s.n candidate sizes may be used and s.k more sizes may be chosen.
// Because an empty request list costs 0 whatever s.k is, this is in effect
// "at most k" sizes. The second member of the result is the state to
// continue from when reconstructing the choice (see printOptimalSolution);
// the base cases return `dummy` there. LLONG_MAX / 4 marks "not possible".
// Results are cached in `memo`.
pair<long long, State> solve(State const& s) {
    if (s.req.empty()) return { 0, dummy };
    if (s.k == 0 || s.k > s.n) return { LLONG_MAX / 4, dummy };

    if (auto hit = memo.find(s); hit != memo.end()) return hit->second;

    // Split the requests by whether the nth candidate can hold them.
    auto const& box = candSizes[s.n - 1];
    int covered = 0;                      // summed request counts that fit into `box`
    vector<array<int, 4>> leftover;       // requests that do not fit
    for (auto const& request : s.req) {
        if (fits(request, box)) {
            covered += request[3];
        } else {
            leftover.push_back(request);
        }
    }

    // Option 1: do not use the nth candidate at all.
    State const skip = { s.req, s.n - 1, s.k };
    pair<long long, State> best = { solve(skip).first, skip };

    // Option 2: use it for everything it can hold (pointless if that is nothing).
    if (covered > 0) {
        State const take = { leftover, s.n - 1, s.k - 1 };
        long long const takeVolume = covered * vol(box) + solve(take).first;
        if (takeVolume < best.first) {
            best = { takeVolume, take };
        }
    }

    memo[s] = best;
    return best;
}
// Follows the successor states recorded by solve() from s until no requests
// remain, printing the candidate box at every step where k decreased.
void printOptimalSolution(State s) {
    for (;;) {
        if (s.req.empty()) return;
        State const successor = solve(s).second;
        if (successor.k < s.k) {
            cout << candSizes[s.n - 1] << endl;
        }
        s = successor;
    }
}
// Reads "n k" followed by n lines "a b c count" from stdin, builds the
// candidate box sizes, and prints the minimum total volume achievable with k
// box sizes together with the chosen sizes.
//
// Returns 1 with a message on stderr when the input is malformed; without
// that check a failed read left n, k or the dimensions uninitialised.
int main(int argc, char** argv) {
    int n, k;
    if (!(cin >> n >> k) || n < 0 || k < 0 || (n > 0 && k == 0)) {
        cerr << "Invalid header: expected <number of sizes> <number of box sizes to choose (at least 1)>" << endl;
        return 1;
    }

    vector<array<int, 4>> requestedBoxSizes;
    requestedBoxSizes.reserve(n);
    set<int> lengths, widths, heights;
    for (int i = 0; i < n; ++i) {
        array<int, 4> d{}; // d[3] is actually the number of requests for this box size
        if (!(cin >> d[0] >> d[1] >> d[2] >> d[3]) || d[0] <= 0 || d[1] <= 0 || d[2] <= 0 || d[3] < 0) {
            cerr << "Invalid entry for box size #" << (i + 1)
                 << ": expected three positive dimensions and a non-negative count" << endl;
            return 1;
        }
        // Normalise the orientation: longest side first.
        sort(begin(d), begin(d) + 3, std::greater<int>());
        requestedBoxSizes.push_back(d);
        lengths.insert(d[0]);
        widths.insert(d[1]);
        heights.insert(d[2]);
    }

    // Generate all candidate box sizes
    for (int l : lengths) {
        for (int w : widths) {
            for (int h : heights) {
                array<int, 3> cand = { l, w, h };
                sort(begin(cand), end(cand), std::greater<int>());
                candSizes.push_back(cand);
            }
        }
    }

    // Sorting puts equal candidates next to each other, so unique() removes all duplicates.
    sort(begin(candSizes), end(candSizes), CompareByVolumeDesc());
    candSizes.erase(unique(begin(candSizes), end(candSizes)), end(candSizes));
    cout << "Number of candidate box sizes: " << size(candSizes) << endl;

    State startState = { requestedBoxSizes, static_cast<int>(size(candSizes)), k };
    long long minVolume = solve(startState).first;
    cout << "Minimum achievable volume using " << k << " box sizes: " << minVolume << endl;
    cout << "Optimal set of " << k << " box sizes:" << endl;
    printOptimalSolution(startState);
    return 0;
}
Example input:
15 5
100 61 35 27
17 89 96 47
31 69 30 55
37 23 39 9
94 11 48 19
38 17 29 36
63 79 80 36
59 52 37 51
86 63 54 7
32 30 11 26
50 88 51 5
74 70 33 14
67 46 4 79
83 94 89 58
65 42 37 69
Example output:
Number of candidate box sizes: 2310
Minimum achievable volume using 5 box sizes: 124069460
Optimal set of 5 box sizes:
(94, 48, 11)
(69, 52, 37)
(100, 89, 35)
(88, 79, 63)
(94, 89, 83)
I'll explain the algorithm behind this if there's interest. It's better than considering all possible combinations of k candidate box sizes, but not terribly efficient.
I am trying to write the code for the following question:
Insert an element(sum of neighbors) between every pair of consecutive elements?
Example: if input is
12 23 34 45 for n=4
Output should be:
12 35 23 57 34 79 45
The code I wrote is:
/*
 * Insert, between every pair of consecutive nodes, a new node holding the sum
 * of the two neighbours' data (12 23 34 -> 12 35 23 57 34).
 *
 * Returns head, which never changes because nodes are only inserted after
 * existing ones. An empty or single-node list is returned untouched. If
 * malloc fails, insertion stops early and the list built so far stays valid.
 */
struct node *InsBet(node *head) {
    node *cur = head;
    while (cur != NULL && cur->next != NULL) {
        node *after = cur->next;  /* remember the right neighbour before relinking */
        node *mid = (node*)malloc(sizeof(node));
        if (mid == NULL) {
            break;
        }
        mid->data = cur->data + after->data;
        /* Wire the new node in first, then repoint its two neighbours. */
        mid->prev = cur;
        mid->next = after;
        cur->next = mid;
        after->prev = mid;
        /* Continue from the original neighbour, not from the node just
         * inserted, otherwise the loop would keep inserting forever. */
        cur = after;
    }
    return head;
}
Upon printing the array it is crashing my terminal.
My print program is:
/* Print the data of every node from head onwards, each followed by a space. */
void PrintList(node *head) {
    for (node *cur = head; cur != NULL; cur = cur->next) {
        printf("%d ", cur->data);
    }
}
The first problem is that you're overriding i->next before copying it to t->next
Switch the order of
i->next = t;t->prev = i;
t->next = i->next;i->next->prev = t;
into
t->next = i->next; i->next->prev = t;
i->next = t; t->prev = i;
To elaborate, assume you have a chain of 2 elements in your list: A-->B, and you want to add the temporary element between them, so you create t. But since the first thing you do is overwrite the forward pointer of the first element (A in this case), you lose any chance of ever accessing B again. Instead, you assign the address of the temporary element itself to its own forward pointer, creating an infinite loop.
The second problem is that you advance the current pointer (i) by only one link, which means it would now point to the temporary element you've just added, and you would try to add an additional temporary element between t and B. This would cause an infinite loop - instead advance i by -
i = t->next;
The above answer explained it very well but just to give you a working code, here you go:
PS: you don't need to return the head pointer, because the function never changes which node is the head (new nodes are only inserted after existing ones), so there is no use in returning it
/*
 * Insert, between every pair of consecutive nodes, a new node holding the sum
 * of the two neighbours' data (12 23 34 -> 12 35 23 57 34).
 *
 * The head node stays the head, so nothing is returned. An empty or
 * single-node list is left untouched. If malloc fails, insertion stops early
 * and the list built so far stays valid.
 */
void InsBet(node *head) {
    node *cur = head;
    while (cur != NULL && cur->next != NULL) {
        node *after = cur->next;  /* right neighbour of the gap being filled */
        node *mid = (node*)malloc(sizeof(node));
        if (mid == NULL) {
            return;
        }
        mid->data = cur->data + after->data;
        mid->prev = cur;
        mid->next = after;
        /* Both neighbours must point at the new node: cur forwards, and
         * after backwards. */
        cur->next = mid;
        after->prev = mid;
        /* Skip over the inserted node to the next original element. */
        cur = after;
    }
}
I'm trying to work on a sub-problem of a larger algorithm which I am really struggling with!
The Problem
If I had an array of numbers (say A), how can I efficiently list all the numbers that can be made by multiplying the numbers together (each of which can be used as many times as you want) and that are less than another number (say x)?
For example, let's say I had A = [7, 11, 13] and x was 1010, the answers would be:
- 7 = 7
- 11 = 11
- 13 = 13
- 7*7 = 49
- 7*11 = 77
- 7*13 = 91
- 11*11 = 121
- 11*13 = 143
- 13*13 = 169
- 7*7*7 = 343
- 7*7*11 = 539
- 7*7*13 = 637
- 7*11*11 = 847
- 7*11*13 = 1001
I tried my best not to miss any (but feel free to edit if I have)!
I can tell this is probably some type of recursion but am really struggling on this one!
Optional
A naive solution will also be nice (that's how much I'm struggling).
Running time is also optional.
UPDATE
All numbers in A are all the prime numbers (except 1, 2, 3, 5) got from the sieve of eratosthenes.
UPDATE 2
A is also sorted
UPDATE 3
All numbers in A is under the limit
UPDATE 4
The solution does NOT need to be recursion. That was just an idea I had. And Java or Pseudo code more preferable!
I'd go with using a queue. The algorithm I have in mind would be something like the following (in pseudocode):
// Lists every product of elements of A (repetition allowed) that is < X.
// A must be sorted ascending and contain only numbers >= 2.
//
// Each queue entry is a pair (value, first): `first` is the smallest index of
// A that may still be multiplied onto `value`. Only extending with A[first..]
// builds every product with its factors in non-decreasing order, so each
// product is generated exactly once (7*11 is produced, 11*7 is not).
multiplyUntil(A, X)
{
    queue q;
    for(int i = 0; i < A.length; i++)
    {
        // A is sorted so once an element is >= X all following ones are too
        if(A[i] >= X)
        {
            break;
        }
        q.add((A[i], i));
    }
    result;
    while(!q.isEmpty())
    {
        (element, first) = q.pop();
        result.add(element); // everything in the queue is already known to be < X
        for(int i = first; i < A.length; i++)
        {
            product = element * A[i];
            // A is sorted so if this product is >= X the following will also be >= X
            if(product >= X)
            {
                // get out of the inner cycle
                break;
            }
            q.add((product, i));
        }
    }
    return result;
}
Let me know if something is unclear.
P.S: Keep in mind that the result is not guaranteed to be sorted. If you want the result to be sorted you could use a heap instead of a queue or sort the result in the end of the computation.
Here's a solution in Java, along with comments. It's pretty straightforward to translate it to another language.
// numbers: the available factors, e.g. {7, 11, 13}; never modified
// offset:  index of the factor whose exponent is being chosen (0 = first)
// limit:   largest product still allowed for the factors from offset onwards
// current: exponent chosen so far for each factor, e.g. {1, 2, 0} = 7*11*11
private static void getProducts(int[] numbers, int offset, int limit, int[] current) {
    if (offset < numbers.length) {
        final int factor = numbers[offset];
        // Try exponent 0, 1, 2, ... for this factor. Every extra use of the
        // factor shrinks the budget for the remaining factors by integer
        // division, which is what keeps the final product within the limit.
        for (int exponent = 0, budget = limit; budget >= 1; exponent++, budget /= factor) {
            current[offset] = exponent;
            getProducts(numbers, offset + 1, budget, current);
        }
        return;
    }

    // Every factor has an exponent: rebuild the product and its text form.
    StringBuilder text = new StringBuilder();
    int product = 1;
    for (int idx = 0; idx < offset; idx++) {
        for (int used = current[idx]; used > 0; used--) {
            if (text.length() != 0) {
                text.append(" * ");
            }
            text.append(numbers[idx]);
            product *= numbers[idx];
        }
    }

    // A product of 1 is the empty combination, which is not reported.
    if (product == 1) {
        return;
    }
    // instead of printing, the result could be collected
    System.out.println(" - " + text + " = " + product);
}
// Entry point: prints every product of the given numbers that does not exceed limit.
public static void getProducts(int[] numbers, int limit) {
    int[] exponents = new int[numbers.length];
    getProducts(numbers, 0, limit, exponents);
}
Usage:
// Demo run with the factors 7, 11, 13 and the limit 1010.
public static void main(String[] args) {
    int[] factors = {7, 11, 13};
    getProducts(factors, 1010);
}
Output:
- 13 = 13
- 13 * 13 = 169
- 11 = 11
- 11 * 13 = 143
- 11 * 11 = 121
- 7 = 7
- 7 * 13 = 91
- 7 * 11 = 77
- 7 * 11 * 13 = 1001
- 7 * 11 * 11 = 847
- 7 * 7 = 49
- 7 * 7 * 13 = 637
- 7 * 7 * 11 = 539
- 7 * 7 * 7 = 343
The resulting products are sorted in different way, but I guess sorting is not a big problem.
Here is my solution in C++. I use a recursive function. The principle is:
the recursive function is given a limit, a current value which is a composite, and a half-open range of primes [start, end)
it will output all combination of powers of the primes in the given range, multiplied by the current composite
At each step, the function takes the first prime p from the range, and compute all its powers. It then multiplies current by the p as long as the product, cp is under the limit.
We use the fact the array is sorted by leaving as soon as cp is above the limit.
Due to the way we compute the numbers, they won't be sorted. But it is easy to add this as a final step once you have collected the numbers (in which case you would use a back_inserter output iterator instead of an ostream_iterator, and sort the collecting vector).
#include <algorithm>
#include <iostream>
#include <iterator>
using namespace std;
// Writes to `out` every number current * p1^e1 * ... * pm^em that is < limit,
// where p1..pm are the values in [start, end) and each exponent is >= 0.
// The value 1 (nothing multiplied in) is never written.
//
// Expects current >= 1 and factors >= 2; a factor below 2 is skipped.
// The numbers are not produced in sorted order.
template <class It, class Out>
void f(int limit, int current, It start, It end, Out out) {
    // terminal condition: every factor has been assigned an exponent
    if (start == end) {
        if (current != 1)
            *(out++) = current;
        return;
    }

    int const p = *start;
    It const rest = std::next(start);  // works for any forward iterator

    if (p < 2) {
        // Multiplying by such a factor never grows the product, so the loop
        // below would not terminate; use only its zeroth power.
        if (current < limit)
            f(limit, current, rest, end, out);
        return;
    }

    // Output all numbers where the current prime is a factor:
    // current * p^0, current * p^1, ... while the product stays below limit.
    for (int cp = current; cp < limit; cp *= p) {
        f(limit, cp, rest, end, out);
        // Stop before multiplying when cp * p would reach the limit. Testing
        // after the multiplication could overflow int for limits near
        // INT_MAX.
        if (cp > (limit - 1) / p)
            break;
    }
}
// Prints, one per line, every product of 7, 11 and 13 that is below 1010.
int main(int argc, char* argv[]) {
    vector<int> const primes = {7, 11, 13};
    int const upperBound = 1010;
    ostream_iterator<int> sink(cout, "\n");
    f(upperBound, 1, primes.begin(), primes.end(), sink);
}