Sorting multiple arrays using CUDA/Thrust - sorting

I have a large array that I need to sort on the GPU. The array itself is a concatenation of multiple smaller subarrays that satisfy the condition that, given i < j, the elements of subarray i are smaller than the elements of subarray j. An example of such an array would be {5 3 4 2 1 6 9 8 7 10 11},
where the elements of the first subarray of 5 elements are smaller than the elements of the second subarray of 6 elements. The array I need is {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}. I know the position where each subarray starts in the large array.
I know I can simply use thrust::sort on the whole array, but I was wondering if it's possible to launch multiple concurrent sorts, one for each subarray. I'm hoping to get a performance improvement by doing that. My assumption is that it would be faster to sort multiple smaller arrays than one large array with all the elements.
I'd appreciate it if someone could give me a way to do that, or correct my assumption in case it's wrong.

One way to do multiple concurrent sorts (a "vectorized" sort) in Thrust is to mark the sub-arrays with keys and provide a custom functor: an ordinary Thrust sort functor that also orders the sub-arrays by their key.
Another possible method is to use back-to-back thrust::stable_sort_by_key as described here.
As you have pointed out, another method in your case is just to do an ordinary sort, since that is ultimately your objective.
However I think it's unlikely that any of the thrust sort methods will give a significant speed-up over a pure sort, although you can try it. Thrust has a fast-path radix sort which it will use in certain situations, which the pure sort method could probably use in your case. (In other cases, e.g. when you provide a custom functor, thrust will often use a slower merge-sort method.)
If the sizes of the sub-arrays are within certain ranges, I think you're likely to get much better results (performance-wise) with a block radix sort in cub, one block per sub-array.
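For reference, a minimal sketch of the back-to-back thrust::stable_sort_by_key approach mentioned above (the names d_data and d_segs are assumptions: d_segs holds the sub-array index of each element, like the d_keys vector built in the example below):
#include <thrust/device_vector.h>
#include <thrust/sort.h>

// Sorts each segment of d_data independently. d_segs[i] is the segment id of
// element i (0 0 0 ... 1 1 1 ...). Both vectors are reordered in place.
void segmented_sort(thrust::device_vector<float> &d_data,
                    thrust::device_vector<int>   &d_segs)
{
    // pass 1: sort all values globally, carrying the segment ids along
    thrust::stable_sort_by_key(d_data.begin(), d_data.end(), d_segs.begin());
    // pass 2: stable-sort by segment id, carrying the values along; stability
    // preserves the value ordering established in pass 1 within each segment
    thrust::stable_sort_by_key(d_segs.begin(), d_segs.end(), d_data.begin());
}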
Here is an example that uses specific sizes (since you've given no indication of size ranges and other details), comparing a thrust "pure sort" to a thrust segmented sort with functor, to the cub block sort method. For this particular case, the cub sort is fastest:
$ cat t1.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/scan.h>
#include <thrust/equal.h>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const int num_blocks = 2048;
const int items_per = 4;
const int nTPB = 512;
const int block_size = items_per*nTPB; // must be a whole-number multiple of nTPB;
typedef float mt;
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct my_sort_functor
{
template <typename T, typename T2>
__host__ __device__
bool operator()(T t1, T2 t2){
if (thrust::get<1>(t1) < thrust::get<1>(t2)) return true;
if (thrust::get<1>(t1) > thrust::get<1>(t2)) return false;
return thrust::get<0>(t1) < thrust::get<0>(t2);}
};
// from: https://nvlabs.github.io/cub/example_block_radix_sort_8cu-example.html#_a0
#define CUB_STDERR
#include <stdio.h>
#include <iostream>
#include <algorithm>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <cub/block/block_radix_sort.cuh>
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
bool g_uniform_keys;
//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------
template <
typename Key,
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
__launch_bounds__ (BLOCK_THREADS)
__global__ void BlockSortKernel(
Key *d_in, // Tile of input
Key *d_out) // Tile of output
{
enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
// Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
typedef BlockLoad<Key, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
// Specialize BlockRadixSort type for our thread block
typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
// Shared memory
__shared__ union TempStorage
{
typename BlockLoadT::TempStorage load;
typename BlockRadixSortT::TempStorage sort;
} temp_storage;
// Per-thread tile items
Key items[ITEMS_PER_THREAD];
// Our current block's offset
int block_offset = blockIdx.x * TILE_SIZE;
// Load items into a blocked arrangement
BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
// Barrier for smem reuse
__syncthreads();
// Sort keys
BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items);
// Store output in striped fashion
StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
}
int main(){
const int ds = num_blocks*block_size;
thrust::host_vector<mt> data(ds);
thrust::host_vector<int> keys(ds);
for (int i = block_size; i < ds; i+=block_size) keys[i] = 1; // mark beginning of blocks
thrust::device_vector<int> d_keys = keys;
for (int i = 0; i < ds; i++) data[i] = (rand()%block_size) + (i/block_size)*block_size; // populate data
thrust::device_vector<mt> d_data = data;
thrust::inclusive_scan(d_keys.begin(), d_keys.end(), d_keys.begin()); // fill out keys array 000111222...
thrust::device_vector<mt> d1 = d_data; // make a copy of unsorted data
cudaDeviceSynchronize();
unsigned long long os = dtime_usec(0);
thrust::sort(d1.begin(), d1.end()); // ordinary sort
cudaDeviceSynchronize();
os = dtime_usec(os);
thrust::device_vector<mt> d2 = d_data; // make a copy of unsorted data
cudaDeviceSynchronize();
unsigned long long ss = dtime_usec(0);
thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(d2.begin(), d_keys.begin())), thrust::make_zip_iterator(thrust::make_tuple(d2.end(), d_keys.end())), my_sort_functor());
cudaDeviceSynchronize();
ss = dtime_usec(ss);
if (!thrust::equal(d1.begin(), d1.end(), d2.begin())) {std::cout << "oops1" << std::endl; return 0;}
std::cout << "ordinary thrust sort: " << os/(float)USECPSEC << "s " << "segmented sort: " << ss/(float)USECPSEC << "s" << std::endl;
thrust::device_vector<mt> d3(ds);
cudaDeviceSynchronize();
unsigned long long cs = dtime_usec(0);
BlockSortKernel<mt, nTPB, items_per><<<num_blocks, nTPB>>>(thrust::raw_pointer_cast(d_data.data()), thrust::raw_pointer_cast(d3.data()));
cudaDeviceSynchronize();
cs = dtime_usec(cs);
if (!thrust::equal(d1.begin(), d1.end(), d3.begin())) {std::cout << "oops2" << std::endl; return 0;}
std::cout << "cub sort: " << cs/(float)USECPSEC << "s" << std::endl;
}
$ nvcc -o t1 t1.cu
$ ./t1
ordinary thrust sort: 0.001652s segmented sort: 0.00263s
cub sort: 0.000265s
$
(CUDA 10.2.89, Tesla V100, Ubuntu 18.04)
I have no doubt that your sizes and array dimensions don't correspond to mine. The purpose here is to illustrate some possible methods, not a black-box solution that works for your particular case. You probably should do benchmark comparisons of your own. I also acknowledge that the block radix sort method for cub expects equal-sized sub-arrays, which you may not have. It may not be a suitable method for you, or you may wish to explore some kind of padding arrangement. There's no need to ask this question of me; I won't be able to answer it based on the information in your question.
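If you do explore padding, one hedged sketch (the helper name, and describing each segment by a start offset and length, are assumptions for illustration) is to copy each sub-array into a fixed-size slot pre-filled with the type's maximum value, so the padding sorts to the tail of each slot and can be ignored afterwards:
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <limits>
#include <vector>

// Copies each variable-length sub-array into a slot of slot_size elements,
// padded with max-value sentinels that sort to the end of the slot.
// starts[i]/lens[i] describe sub-array i within d_src; lens[i] <= slot_size.
thrust::device_vector<float> pad_segments(const thrust::device_vector<float> &d_src,
                                          const std::vector<int> &starts,
                                          const std::vector<int> &lens,
                                          int slot_size)
{
    thrust::device_vector<float> d_padded(starts.size() * (size_t)slot_size,
                                          std::numeric_limits<float>::max());
    for (size_t i = 0; i < starts.size(); i++)
        thrust::copy(d_src.begin() + starts[i],
                     d_src.begin() + starts[i] + lens[i],
                     d_padded.begin() + i * (size_t)slot_size);
    return d_padded;
}
The per-segment thrust::copy loop is itself a serial bottleneck when there are many small segments; a single scatter or permutation-iterator copy would be preferable, but the sketch keeps it simple.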
I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the questions in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.

Related

Optimizing bit-waste for custom data encoding

I was wondering what's a good solution to make it so that a custom data structure took the least amount of space possible, and I've been searching around without finding anything.
The general idea is I may have a some kind of data structure with a lot of different variables, integers, booleans, etc. With booleans, it's fairly easy to use bitmasks/flags. For integers, perhaps I only need to use 10 of the numbers for one of the integers, and 50 for another. I would like to have some function encode the structure, without wasting any bits. Ideally I would be able to pack them side-by-side in an array, without any padding.
I have a vague idea that I would have to have way of enumerating all the possible permutations of values of all the variables, but I'm unsure where to start with this.
Additionally, though this may be a bit more complicated, what if I have a bunch of restrictions such as not caring about certain variables if other variables meet certain criteria. This reduces the amount of permutations, so there should be a way of saving some bits here as well?
Example: Say I have a server for an online game, containing many players. Each player is represented by a struct that stores a lot of different variables: level, stats, and a bunch of flags for which quests the player has cleared.
struct Player {
int level; // max is 100
int strength; // max is
int intelligence; // max is 500
/* ... */
bool questFlag30;
bool questFlag31;
bool questFlag32;
/* ... */
};
and I want to have a function encodedData encode(std::vector<Player> players) that takes a vector of Players, and a function decodeData which returns a vector of Players from the encoded data.
This is what I came up with; it's not perfect, but it's something:
#include <vector>
#include <iostream>
#include <bitset>
#include <cmath>
#include <cstdint>
#include <assert.h>
/* Data structure for packing multiple variables, without padding */
struct compact_collection {
std::vector<bool> data;
/* Returns a uint32_t since we don't want to store the length of each variable */
uint32_t query_bits(int index, int length) {
std::bitset<32> temp;
for (int i = index; i < index + length; i++) temp[i - index] = data[i];
return temp.to_ulong();
};
/* Inserts the lowest `bits` bits of value at the front of the collection */
void add_bits(int32_t value, int32_t bits) {
assert(std::pow(2, bits) > value);
auto a = std::bitset<32>(value).to_string();
for (int i = 32 - bits; i < 32; i++) data.insert(data.begin(), (a[i] == '1'));
};
};
int main() {
compact_collection myCollection;
myCollection.add_bits(45,6);
std::cout << myCollection.query_bits(0,6);
std::cin.get();
return 0;
}
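To illustrate the permutation-enumeration idea mentioned above, here is a hedged sketch of mixed-radix encoding (the field set and ranges are assumptions for illustration): each bounded field becomes a digit whose base is its number of possible values, so no bit patterns are wasted, and the minimum size in bits is the ceiling of log2 of the product of the ranges.
#include <cassert>
#include <cstdint>
#include <iostream>

// Hypothetical cut-down player: level in [1,100], strength in [0,500],
// 32 quest flags. 100 * 501 * 2^32 fits comfortably in a uint64_t.
struct MiniPlayer {
    int level;
    int strength;
    uint32_t quests;
};

uint64_t encode(const MiniPlayer &p) {
    assert(p.level >= 1 && p.level <= 100);
    assert(p.strength >= 0 && p.strength <= 500);
    uint64_t v = p.quests;                    // most significant "digit"
    v = v * 501 + (uint64_t)p.strength;       // base-501 digit
    v = v * 100 + (uint64_t)(p.level - 1);    // base-100 digit
    return v;
}

MiniPlayer decode(uint64_t v) {
    MiniPlayer p;
    p.level    = (int)(v % 100) + 1;  v /= 100;
    p.strength = (int)(v % 501);      v /= 501;
    p.quests   = (uint32_t)v;
    return p;
}

int main() {
    MiniPlayer p{42, 317, 0xDEADBEEFu};
    MiniPlayer q = decode(encode(p));
    std::cout << q.level << " " << q.strength << " " << std::hex << q.quests << "\n";
}
The "don't care" restrictions mentioned above shrink the product of the ranges, and therefore the encoded size, in exactly the same way.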

Faster way to read/write a std::unordered_map from/to a file

I am working with some very large std::unordered_maps (hundreds of millions of entries) and need to save and load them to and from a file. The way I am currently doing this is by iterating through the map and reading/writing each key and value pair one at a time:
std::unordered_map<unsigned long long int, char> map;
void save(){
std::unordered_map<unsigned long long int, char>::iterator iter;
FILE *f = fopen("map", "wb");
for(iter=map.begin(); iter!=map.end(); iter++){
fwrite(&(iter->first), 8, 1, f);
fwrite(&(iter->second), 1, 1, f);
}
fclose(f);
}
void load(){
FILE *f = fopen("map", "rb");
unsigned long long int key;
char val;
while(fread(&key, 8, 1, f)){
fread(&val, 1, 1, f);
map[key] = val;
}
fclose(f);
}
But with around 624 million entries, reading the map from a file took 9 minutes. Writing to a file was faster but still took several minutes. Is there a faster way to do this?
C++ unordered_map implementations must all use chaining. There are a variety of really good reasons why you might want to do this for a general purpose hash table, which are discussed here.
This has enormous implications for performance. Most importantly, it means that the entries of the hash table are likely to be scattered throughout memory in a way which makes accessing each one an order of magnitude (or so) less efficient than would be the case if they could somehow be accessed serially.
Fortunately, you can build hash tables that, when nearly full, give near-sequential access to adjacent elements. This is done using open addressing.
Since your hash table is not general purpose, you could try this.
Below, I've built a simple hash table container with open addressing and linear probing. It assumes a few things:
Your keys are already somehow randomly distributed. This obviates the need for a hash function (though decent hash functions are fairly simple to build, even if great hash functions are difficult).
You only ever add elements to the hash table, you do not delete them. If this were not the case you'd need to change the used vector into something that could hold three states: USED, UNUSED, and TOMBSTONE, where TOMBSTONE is the state of a deleted element, used to continue a linear search probe or halt a linear insert probe (a sketch of such a three-state marker follows this list).
That you know the size of your hash table ahead of time, so you don't need to resize/rehash it.
That you don't need to traverse your elements in any particular order.
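A hedged sketch of that three-state marker (separate from the table below, which keeps the simpler two-state bit vector): searches probe past TOMBSTONE slots, while deletions leave one behind instead of marking the slot UNUSED.
#include <cstdint>
#include <vector>

enum class SlotState : uint8_t { UNUSED, USED, TOMBSTONE };

struct ProbeSketch {
    std::vector<SlotState> state;
    std::vector<uint64_t>  keys;

    // Returns the slot of key, or -1 if absent. Probing stops only at an
    // UNUSED slot; TOMBSTONE slots are skipped over.
    long find(uint64_t key) const {
        uint64_t loc = key % keys.size();
        while (state[loc] != SlotState::UNUSED) {
            if (state[loc] == SlotState::USED && keys[loc] == key) return (long)loc;
            loc = (loc + 1) % keys.size();
        }
        return -1;
    }

    // Marks the slot as TOMBSTONE (not UNUSED), so later probes for keys that
    // originally hashed before this slot still find them.
    void erase(uint64_t key) {
        long loc = find(key);
        if (loc >= 0) state[(size_t)loc] = SlotState::TOMBSTONE;
    }
};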
Of course, there are probably all kinds of excellent implementations of open addressing hash tables online which solve many of the above issues. However, the simplicity of my table allows me to convey the important point.
The important point is this: my design allows all the hash table's information to be stored in three vectors. That is: the memory is contiguous.
Contiguous memory is fast to allocate, fast to read from, and fast to write to. The effect of this is profound.
Using the same test setup as my previous answer, I get the following times:
Save. Save time = 82.9345 ms
Load. Load time = 115.111 ms
This is a 95% decrease in save time (22x faster) and a 98% decrease in load time (62x faster).
Code:
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <iostream>
#include <random>
#include <vector>
const int TEST_TABLE_SIZE = 10000000;
template<class K, class V>
class SimpleHash {
public:
int usedslots = 0;
std::vector<K> keys;
std::vector<V> vals;
std::vector<uint8_t> used;
//size0 should be a prime and about 30% larger than the maximum number needed
SimpleHash(int size0){
vals.resize(size0);
keys.resize(size0);
used.resize(size0/8+1,0);
}
//If the key values are already uniformly distributed, using a hash gains us
//nothing
uint64_t hash(const K key){
return key;
}
bool isUsed(const uint64_t loc){
const auto used_loc = loc/8;
const auto used_bit = 1<<(loc%8);
return used[used_loc]&used_bit;
}
void setUsed(const uint64_t loc){
const auto used_loc = loc/8;
const auto used_bit = 1<<(loc%8);
used[used_loc] |= used_bit;
}
void insert(const K key, const V val){
uint64_t loc = hash(key)%keys.size();
//Use linear probing. Can create infinite loops if table too full.
while(isUsed(loc)){ loc = (loc+1)%keys.size(); }
setUsed(loc);
keys[loc] = key;
vals[loc] = val;
}
V& get(const K key) {
uint64_t loc = hash(key)%keys.size();
while(true){
if(!isUsed(loc))
throw std::runtime_error("Item not present!");
if(keys[loc]==key)
return vals[loc];
loc = (loc+1)%keys.size();
}
}
uint64_t usedSize() const {
return usedslots;
}
uint64_t size() const {
return keys.size();
}
};
typedef SimpleHash<uint64_t, char> table_t;
void SaveSimpleHash(const table_t &map){
std::cout<<"Save. ";
const auto start = std::chrono::steady_clock::now();
FILE *f = fopen("/z/map", "wb");
uint64_t size = map.size();
fwrite(&size, 8, 1, f);
fwrite(map.keys.data(), 8, size, f);
fwrite(map.vals.data(), 1, size, f);
fwrite(map.used.data(), 1, size/8+1, f);
fclose(f);
const auto end = std::chrono::steady_clock::now();
std::cout<<"Save time = "<< std::chrono::duration<double, std::milli> (end-start).count() << " ms" << std::endl;
}
table_t LoadSimpleHash(){
std::cout<<"Load. ";
const auto start = std::chrono::steady_clock::now();
FILE *f = fopen("/z/map", "rb");
uint64_t size;
fread(&size, 8, 1, f);
table_t map(size);
fread(map.keys.data(), 8, size, f);
fread(map.vals.data(), 1, size, f);
fread(map.used.data(), 1, size/8+1, f);
fclose(f);
const auto end = std::chrono::steady_clock::now();
std::cout<<"Load time = "<< std::chrono::duration<double, std::milli> (end-start).count() << " ms" << std::endl;
return map;
}
int main(){
//Perfectly horrendous way of seeding a PRNG, but we'll do it here for brevity
auto generator = std::mt19937(12345); //Combination of my luggage
//Generate values within the specified closed intervals
auto key_rand = std::bind(std::uniform_int_distribution<uint64_t>(0,std::numeric_limits<uint64_t>::max()), generator);
auto val_rand = std::bind(std::uniform_int_distribution<int>(std::numeric_limits<char>::lowest(),std::numeric_limits<char>::max()), generator);
table_t map(1.3*TEST_TABLE_SIZE);
std::cout<<"Created table of size "<<map.size()<<std::endl;
std::cout<<"Generating test data..."<<std::endl;
for(int i=0;i<TEST_TABLE_SIZE;i++)
map.insert(key_rand(),(char)val_rand()); //Low chance of collisions, so we get quite close to the desired size
map.insert(23,42);
assert(map.get(23)==42);
SaveSimpleHash(map);
auto newmap = LoadSimpleHash();
//Ensure that the load worked
for(int i=0;i<map.keys.size();i++)
assert(map.keys.at(i)==newmap.keys.at(i));
for(int i=0;i<map.vals.size();i++)
assert(map.vals.at(i)==newmap.vals.at(i));
for(int i=0;i<map.used.size();i++)
assert(map.used.at(i)==newmap.used.at(i));
}
(Edit: I've added a new answer to this question which achieves a 95% decrease in wall-times.)
I made a Minimum Working Example that illustrates the problem you are trying to solve. This is something you should always do in your questions.
I then eliminated the unsigned long long int stuff and replaced it with uint64_t from the cstdint library. This ensures that we are operating on the same data size, since unsigned long long int can mean almost anything depending on what computer/compiler you use.
The resulting MWE looks like:
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <deque>
#include <functional>
#include <iostream>
#include <random>
#include <unordered_map>
#include <vector>
typedef std::unordered_map<uint64_t, char> table_t;
const int TEST_TABLE_SIZE = 10000000;
void Save(const table_t &map){
std::cout<<"Save. ";
const auto start = std::chrono::steady_clock::now();
FILE *f = fopen("/z/map", "wb");
for(auto iter=map.begin(); iter!=map.end(); iter++){
fwrite(&(iter->first), 8, 1, f);
fwrite(&(iter->second), 1, 1, f);
}
fclose(f);
const auto end = std::chrono::steady_clock::now();
std::cout<<"Save time = "<< std::chrono::duration<double, std::milli> (end-start).count() << " ms" << std::endl;
}
//Take advantage of the limited range of values to save time
void SaveLookup(const table_t &map){
std::cout<<"SaveLookup. ";
const auto start = std::chrono::steady_clock::now();
//Create a lookup table
std::vector< std::deque<uint64_t> > lookup(256);
for(auto &kv: map)
lookup.at(kv.second+128).emplace_back(kv.first);
//Save lookup table header
FILE *f = fopen("/z/map", "wb");
for(const auto &row: lookup){
const uint32_t rowsize = row.size();
fwrite(&rowsize, 4, 1, f);
}
//Save values
for(const auto &row: lookup)
for(const auto &val: row)
fwrite(&val, 8, 1, f);
fclose(f);
const auto end = std::chrono::steady_clock::now();
std::cout<<"Save time = "<< std::chrono::duration<double, std::milli> (end-start).count() << " ms" << std::endl;
}
//Take advantage of the limited range of values and contiguous memory to
//save time
void SaveLookupVector(const table_t &map){
std::cout<<"SaveLookupVector. ";
const auto start = std::chrono::steady_clock::now();
//Create a lookup table
std::vector< std::vector<uint64_t> > lookup(256);
for(auto &kv: map)
lookup.at(kv.second+128).emplace_back(kv.first);
//Save lookup table header
FILE *f = fopen("/z/map", "wb");
for(const auto &row: lookup){
const uint32_t rowsize = row.size();
fwrite(&rowsize, 4, 1, f);
}
//Save values
for(const auto &row: lookup)
fwrite(row.data(), 8, row.size(), f);
fclose(f);
const auto end = std::chrono::steady_clock::now();
std::cout<<"Save time = "<< std::chrono::duration<double, std::milli> (end-start).count() << " ms" << std::endl;
}
void Load(table_t &map){
std::cout<<"Load. ";
const auto start = std::chrono::steady_clock::now();
FILE *f = fopen("/z/map", "rb");
uint64_t key;
char val;
while(fread(&key, 8, 1, f)){
fread(&val, 1, 1, f);
map[key] = val;
}
fclose(f);
const auto end = std::chrono::steady_clock::now();
std::cout<<"Load time = "<< std::chrono::duration<double, std::milli> (end-start).count() << " ms" << std::endl;
}
void Load2(table_t &map){
std::cout<<"Load with Reserve. ";
map.reserve(TEST_TABLE_SIZE+TEST_TABLE_SIZE/8);
const auto start = std::chrono::steady_clock::now();
FILE *f = fopen("/z/map", "rb");
uint64_t key;
char val;
while(fread(&key, 8, 1, f)){
fread(&val, 1, 1, f);
map[key] = val;
}
fclose(f);
const auto end = std::chrono::steady_clock::now();
std::cout<<"Load time = "<< std::chrono::duration<double, std::milli> (end-start).count() << " ms" << std::endl;
}
//Take advantage of the limited range of values to save time
void LoadLookup(table_t &map){
std::cout<<"LoadLookup. ";
map.reserve(TEST_TABLE_SIZE+TEST_TABLE_SIZE/8);
const auto start = std::chrono::steady_clock::now();
FILE *f = fopen("/z/map", "rb");
//Read the header
std::vector<uint32_t> inpsizes(256);
for(int i=0;i<256;i++)
fread(&inpsizes[i], 4, 1, f);
uint64_t key;
for(int i=0;i<256;i++){
const char val = i-128;
for(int v=0;v<inpsizes.at(i);v++){
fread(&key, 8, 1, f);
map[key] = val;
}
}
fclose(f);
const auto end = std::chrono::steady_clock::now();
std::cout<<"Load time = "<< std::chrono::duration<double, std::milli> (end-start).count() << " ms" << std::endl;
}
//Take advantage of the limited range of values and contiguous memory to save time
void LoadLookupVector(table_t &map){
std::cout<<"LoadLookupVector. ";
map.reserve(TEST_TABLE_SIZE+TEST_TABLE_SIZE/8);
const auto start = std::chrono::steady_clock::now();
FILE *f = fopen("/z/map", "rb");
//Read the header
std::vector<uint32_t> inpsizes(256);
for(int i=0;i<256;i++)
fread(&inpsizes[i], 4, 1, f);
for(int i=0;i<256;i++){
const char val = i-128;
std::vector<uint64_t> keys(inpsizes[i]);
fread(keys.data(), 8, inpsizes[i], f);
for(const auto &key: keys)
map[key] = val;
}
fclose(f);
const auto end = std::chrono::steady_clock::now();
std::cout<<"Load time = "<< std::chrono::duration<double, std::milli> (end-start).count() << " ms" << std::endl;
}
int main(){
//Perfectly horrendous way of seeding a PRNG, but we'll do it here for brevity
auto generator = std::mt19937(12345); //Combination of my luggage
//Generate values within the specified closed intervals
auto key_rand = std::bind(std::uniform_int_distribution<uint64_t>(0,std::numeric_limits<uint64_t>::max()), generator);
auto val_rand = std::bind(std::uniform_int_distribution<int>(std::numeric_limits<char>::lowest(),std::numeric_limits<char>::max()), generator);
std::cout<<"Generating test data..."<<std::endl;
//Generate a test table
table_t map;
for(int i=0;i<TEST_TABLE_SIZE;i++)
map[key_rand()] = (char)val_rand(); //Low chance of collisions, so we get quite close to the desired size
Save(map);
{ table_t map2; Load (map2); }
{ table_t map2; Load2(map2); }
SaveLookup(map);
SaveLookupVector(map);
{ table_t map2; LoadLookup (map2); }
{ table_t map2; LoadLookupVector(map2); }
}
On the test data set I use, this gives me a write time of 1982ms and a read time (using your original code) of 7467ms. It seemed as though the read time is the biggest bottleneck, so I created a new function Load2 which reserves sufficient space for the unordered_map prior to reading. This dropped the read time to 4700ms (a 37% savings).
Edit 1
Now, I note that the values of your unordered_map can only take 255 distinct values. Thus, I can easily convert the unordered_map into a kind of lookup table in RAM. That is, rather than having:
123123 1
234234 0
345345 1
237872 1
I can rearrange the data to look like:
0 234234
1 123123 345345 237872
What's the advantage of this? It means that I no longer have to write the value to disk. That saves 1 byte per table entry. Since each table entry consists of 8 bytes for the key and 1 byte for the value, this should give me an 11% savings in both read and write time minus the cost of rearranging the memory (which I expect to be low, because RAM).
Finally, once I've done the above rearrangement, if I have a lot of spare RAM on the machine, I can pack everything into a vector and read/write the contiguous data to disk.
Doing all this gives the following times:
Save. Save time = 1836.52 ms
Load. Load time = 7114.93 ms
Load with Reserve. Load time = 4277.58 ms
SaveLookup. Save time = 1688.73 ms
SaveLookupVector. Save time = 1394.95 ms
LoadLookup. Load time = 3927.3 ms
LoadLookupVector. Load time = 3739.37 ms
Note that the transition from Save to SaveLookup gives an 8% speed-up and the transition from Load with Reserve to LoadLookup gives an 8% speed-up as well. This is right in line with our theory!
Using contiguous memory as well gives a total of a 24% speed-up over your original save time and a total of a 47% speed-up over your original load time.
Since your data seems to be static and given the amount of items, I would certainly consider using an own structure in a binary file and then use memory mapping on that file.
Opening would be instant (just mmap the file).
If you write the values in sorted order, you can use binary search on the mapped data.
If that is not good enough, you could split your data in buckets and store a list with offsets at the beginning of the file - or maybe even use some hash key.
If your keys are all unique and somewhat contiguous, you could even get a smaller file by only storing the char values in file position [key] (and use a special value for null values). Of course that wouldn't work for the full uint64 range, but depending on the data they could be grouped together in buckets containing an offset.
Using mmap this way would also use a lot less memory.
For faster access you could create your own hash map on disk (still with 'instant load').
For example, say you have 1 million hashes (in your case there would be a lot more), you could write 1 million uint64 filepos values at the beginning of the file (the hash value would be the position of the uint64 containing the filepos). Each location would point to a block with one or more key/value pairs, and each of those blocks would start with a count.
If the blocks are aligned on 2 or 4 bytes, a uint32 filepos could be used instead (multiply pos with 2 or 4).
Since the data is static you don't have to worry about possible insertions or deletions, which makes it rather easy to implement.
This has the advantage that you can still mmap the whole file, and all the key/value pairs with the same hash are close together, which brings them into the L1 cache (as compared to, say, linked lists).
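A minimal sketch of the mmap-plus-binary-search idea, assuming a POSIX system and a file of fixed-size records sorted by key (error handling mostly omitted):
#include <algorithm>
#include <cstdint>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#pragma pack(push, 1)
struct Record {          // 9 bytes on disk, matching the 8+1 byte layout above
    uint64_t key;
    char     val;
};
#pragma pack(pop)

// Maps the file and binary-searches it; returns true and fills *out on a hit.
bool lookup(const char *path, uint64_t key, char *out) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) return false;
    struct stat st;
    fstat(fd, &st);
    size_t n = (size_t)st.st_size / sizeof(Record);
    const Record *recs = (const Record *)mmap(nullptr, (size_t)st.st_size,
                                              PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);                               // the mapping stays valid
    const Record *end = recs + n;
    const Record *it  = std::lower_bound(recs, end, key,
        [](const Record &r, uint64_t k) { return r.key < k; });
    bool found = (it != end && it->key == key);
    if (found) *out = it->val;
    munmap((void *)recs, (size_t)st.st_size);
    return found;
}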
I would assume you need the map so that the values are written to the file in order. It would be better to load the values only once into a container (possibly a std::deque, since the amount of data is large), use std::sort once, and then iterate through the std::deque to write the values. You would gain cache performance, and the run-time complexity of std::sort is N*log(N), which is better than balancing your map ~624 million times or paying for cache misses in an unordered map.
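A hedged sketch of that idea (using a std::vector of pairs rather than a std::deque, for brevity):
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

void save_sorted(const std::unordered_map<uint64_t, char> &map, const char *path) {
    // Copy into contiguous storage once, sort once (N*log(N)), then stream out.
    std::vector<std::pair<uint64_t, char>> items(map.begin(), map.end());
    std::sort(items.begin(), items.end());   // orders by key
    FILE *f = fopen(path, "wb");
    for (const auto &kv : items) {
        fwrite(&kv.first, 8, 1, f);
        fwrite(&kv.second, 1, 1, f);
    }
    fclose(f);
}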
Perhaps a prefix-ordered traversal during save would help to reduce the amount of internal reordering during load?
Of course, you don't have visibility of the internal structure of the STL map containers, so the best you could do would be to simulate that by binary-chopping the iterator as if it was linear. Given that you know the total N nodes, save the node N/2, then N/4, N*3/4, and so-on.
This can be done algorithmically by visiting every odd N/(2^p) node in each pass p: N/2, N*1/4, N*3/4, N*1/8, N*3/8, N*5/8, N*7/8, etc. You need to ensure that the series maintains step sizes such that N*4/8 = N/2 (without resorting to step sizes of 2^(P-p)), and that in the last pass you visit every remaining node. You may find it advantageous to pre-calculate the highest pass number (~log2(N)) and the float value S = N/(2^P) such that 0.5 < S <= 1.0, and then scale that back up for each p.
But as others have said, you need to profile it first to see if this is your issue, and profile again to see if this approach helps.
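A rough sketch of the visiting order described above, purely to illustrate the index sequence; a real implementation would also de-duplicate indices that repeat when N is not a power of two, and visit whatever remains in a final pass:
#include <cstdio>

// Prints the order N/2, N/4, 3N/4, N/8, 3N/8, 5N/8, 7N/8, ...
void print_prefix_order(long n) {
    for (long step = 2; step <= n; step *= 2)   // pass p: step = 2^p
        for (long k = 1; k < step; k += 2)      // odd numerators only
            printf("%ld ", k * n / step);
    printf("\n");
}

int main() { print_prefix_order(16); }   // 8 4 12 2 6 10 14 1 3 5 ...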

Parallel multiplication of many small matrices by fixed vector

The situation is the following: I have a number (1000s) of elements which are given by small matrices of dimensions 4x2, 9x3, ... you get the idea. All matrices have the same dimensions.
I want to multiply each of these matrices with a fixed vector of precalculated values. In short:
for(i = 1...n)
X[i] = M[i] . N;
What is the best approach to do this in parallel using Thrust? How do I lay out my data in memory?
NB: There might be specialized, more suitable libraries to do this on GPUs. I'm interested in Thrust because it allows me to deploy to different backends, not just CUDA.
One possible approach:
flatten the arrays (matrices) into a single data vector. This is an advantageous step for enabling general thrust processing anyway.
use a strided range mechanism to take your scaling vector and extend it to the overall length of your flattened data vector
use thrust::transform with thrust::multiplies to multiply the two vectors together.
If you need to access the matrices later out of your flattened data vector (or result vector), you can do so with pointer arithmetic, or a combination of fancy iterators.
If you need to re-use the extended scaling vector, you may want to use the method outlined in step 2 exactly (i.e. create an actual vector using that method, length = N matrices, repeated). If you are only doing this once, you can achieve the same effect with a counting iterator, followed by a transform iterator (modulo the length of your matrix in elements), followed by a permutation iterator, to index into your original scaling vector (length = 1 matrix).
The following example implements the above, without using the strided range iterator method:
#include <iostream>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/transform.h>
#define N_MAT 1000
#define H_MAT 4
#define W_MAT 3
#define RANGE 1024
struct my_modulo_functor : public thrust::unary_function<int, int>
{
__host__ __device__
int operator() (int idx) {
return idx%(H_MAT*W_MAT);}
};
int main(){
thrust::host_vector<int> data(N_MAT*H_MAT*W_MAT);
thrust::host_vector<int> scale(H_MAT*W_MAT);
// synthetic; instead flatten/copy matrices into data vector
for (int i = 0; i < N_MAT*H_MAT*W_MAT; i++) data[i] = rand()%RANGE;
for (int i = 0; i < H_MAT*W_MAT; i++) scale[i] = rand()%RANGE;
thrust::device_vector<int> d_data = data;
thrust::device_vector<int> d_scale = scale;
thrust::device_vector<int> d_result(N_MAT*H_MAT*W_MAT);
thrust::transform(d_data.begin(), d_data.end(), thrust::make_permutation_iterator(d_scale.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), my_modulo_functor())) ,d_result.begin(), thrust::multiplies<int>());
thrust::host_vector<int> result = d_result;
for (int i = 0; i < N_MAT*H_MAT*W_MAT; i++)
if (result[i] != data[i] * scale[i%(H_MAT*W_MAT)]) {std::cout << "Mismatch at: " << i << " cpu result: " << (data[i] * scale[i%(H_MAT*W_MAT)]) << " gpu result: " << result[i] << std::endl; return 1;}
std::cout << "Success!" << std::endl;
return 0;
}
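If the extended scaling vector will be re-used across several calls, a hedged sketch of materializing it explicitly (assuming the includes, macros, and my_modulo_functor from the example above, plus <thrust/copy.h>):
// Builds a device vector of length N_MAT*H_MAT*W_MAT containing the scale
// pattern repeated once per matrix; it can then be passed straight to
// thrust::transform on every subsequent call.
thrust::device_vector<int> make_extended_scale(const thrust::device_vector<int> &d_scale)
{
    thrust::device_vector<int> d_ext(N_MAT * H_MAT * W_MAT);
    thrust::copy_n(
        thrust::make_permutation_iterator(
            d_scale.begin(),
            thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
                                            my_modulo_functor())),
        d_ext.size(), d_ext.begin());
    return d_ext;
}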
EDIT: Responding to a question below:
The benefit of fancy iterators (i.e. transform(numbers, iterator)) is that they often allow for elimination of extra data copies/data movement, as compared to assembling other numbers (which requires extra steps and data movement) and then passing it to transform(numbers, other numbers). If you're only going to use other numbers once, then the fancy iterators will generally be better. If you're going to use other numbers again, then you may want to assemble it explicitly. This preso is instructive, in particular "Fusion".
For a one-time use of other numbers the overhead of assembling it on the fly using fancy iterators and the functor is generally lower than explicitly creating a new vector, and then passing that new vector to the transform routine.
When looking for a software library which is specifically made for multiplying small matrices, one may have a look at https://github.com/hfp/libxsmm. Below, the code requests a specialized matrix kernel according to the typical GEMM parameters (please note that some limitations apply).
double alpha = 1, beta = 1;
const char transa = 'N', transb = 'N';
int flags = LIBXSMM_GEMM_FLAGS(transa, transb);
int prefetch = LIBXSMM_PREFETCH_AUTO;
libxsmm_blasint m = 23, n = 23, k = 23;
libxsmm_dmmfunction xmm = NULL;
xmm = libxsmm_dmmdispatch(m, n, k,
&m/*lda*/, &k/*ldb*/, &m/*ldc*/,
&alpha, &beta, &flags, &prefetch);
Given the above code, one can proceed and run "xmm" for an entire series of (small) matrices without a particular data structure (below code also uses "prefetch locations").
if (0 < n) { /* check that n is at least 1 */
# pragma omp parallel for private(i)
for (i = 0; i < (n - 1); ++i) {
const double *const ai = a + i * asize;
const double *const bi = b + i * bsize;
double *const ci = c + i * csize;
xmm(ai, bi, ci, ai + asize, bi + bsize, ci + csize);
}
xmm(a + (n - 1) * asize, b + (n - 1) * bsize, c + (n - 1) * csize,
/* pseudo prefetch for last element of batch (avoids page fault) */
a + (n - 1) * asize, b + (n - 1) * bsize, c + (n - 1) * csize);
}
In addition to the manual loop control as shown above, libxsmm_gemm_batch (or libxsmm_gemm_batch_omp) can be used (see ReadTheDocs). The latter is useful if data structures exist that describe the series of operands (A, B, and C matrices).
There are two reasons why this library gives superior performance: (1) on-the-fly code specialization using an in-memory code generation technique, and (2) loading the next matrix operands while calculating the current product.
( Given one is looking for something that blends well with C/C++, this library supports it. However, it does not aim for CUDA/Thrust. )

How to partly sort arrays on CUDA?

Problem
Provided I have two arrays:
const int N = 1000000;
float A[N];
myStruct *B[N];
The numbers in A can be positive or negative (e.g. A[N]={3,2,-1,0,5,-2}), how can I make the array A partly sorted (all positive values first, not need to be sorted, then negative values)(e.g. A[N]={3,2,5,0,-1,-2} or A[N]={5,2,3,0,-2,-1}) on the GPU? The array B should be changed according to A (A is keys, B is values).
Since the scale of A, B can be very large, I think the sort algorithm should be implemented on the GPU (especially on CUDA, because I use this platform). Sure, I know thrust::sort_by_key can do this work, but it does much extra work, since I do not need the arrays A & B to be sorted entirely.
Has anyone come across this kind of problem?
Thrust example
thrust::sort_by_key(thrust::device_ptr<float> (A),
thrust::device_ptr<float> ( A + N ),
thrust::device_ptr<myStruct> ( B ),
thrust::greater<float>() );
Thrust's documentation on Github is not up-to-date. As @JaredHoberock said, thrust::partition is the way to go since it now supports stencils. You may need to get a copy from the Github repository:
git clone git://github.com/thrust/thrust.git
Then run scons doc in the Thrust folder to get an updated documentation, and use these updated Thrust sources when compiling your code (nvcc -I/path/to/thrust ...). With the new stencil partition, you can do:
#include <thrust/partition.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
struct is_positive
{
__host__ __device__
bool operator()(const int &x)
{
return x >= 0;
}
};
thrust::partition(thrust::host, // if you want to test on the host
thrust::make_zip_iterator(thrust::make_tuple(keyVec.begin(), valVec.begin())),
thrust::make_zip_iterator(thrust::make_tuple(keyVec.end(), valVec.end())),
keyVec.begin(),
is_positive());
This returns:
Before:
keyVec = 0 -1 2 -3 4 -5 6 -7 8 -9
valVec = 0 1 2 3 4 5 6 7 8 9
After:
keyVec = 0 2 4 6 8 -5 -3 -7 -1 -9
valVec = 0 2 4 6 8 5 3 7 1 9
Note that the 2 partitions are not necessarily sorted. Also, the order may differ between the original vectors and the partitions. If this is important to you, you can use thrust::stable_partition:
stable_partition differs from partition in that stable_partition is
guaranteed to preserve relative order. That is, if x and y are
elements in [first, last), such that pred(x) == pred(y), and if x
precedes y, then it will still be true after stable_partition that x
precedes y.
If you want a complete example, here it is:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/partition.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
struct is_positive
{
__host__ __device__
bool operator()(const int &x)
{
return x >= 0;
}
};
void print_vec(const thrust::host_vector<int>& v)
{
for(size_t i = 0; i < v.size(); i++)
std::cout << " " << v[i];
std::cout << "\n";
}
int main ()
{
const int N = 10;
thrust::host_vector<int> keyVec(N);
thrust::host_vector<int> valVec(N);
int sign = 1;
for(int i = 0; i < N; ++i)
{
keyVec[i] = sign * i;
valVec[i] = i;
sign *= -1;
}
// Copy host to device
thrust::device_vector<int> d_keyVec = keyVec;
thrust::device_vector<int> d_valVec = valVec;
std::cout << "Before:\n keyVec = ";
print_vec(keyVec);
std::cout << " valVec = ";
print_vec(valVec);
// Partition key-val on device
thrust::partition(thrust::make_zip_iterator(thrust::make_tuple(d_keyVec.begin(), d_valVec.begin())),
thrust::make_zip_iterator(thrust::make_tuple(d_keyVec.end(), d_valVec.end())),
d_keyVec.begin(),
is_positive());
// Copy result back to host
keyVec = d_keyVec;
valVec = d_valVec;
std::cout << "After:\n keyVec = ";
print_vec(keyVec);
std::cout << " valVec = ";
print_vec(valVec);
}
UPDATE
I made a quick comparison with the thrust::sort_by_key version, and the thrust::partition implementation does seem to be faster (which is what we could naturally expect). Here is what I obtain on NVIDIA Visual Profiler, with N = 1024 * 1024, with the sort version on the left, and the partition version on the right. You may want to do the same kind of tests on your own.
How about this?:
Count how many positive numbers to determine the inflexion point
Evenly divide each side of the inflexion point into groups (negative groups are all the same length, but a different length from the positive groups; these groups are the memory chunks for the results)
Use one kernel call (one thread) per chunk pair
Each kernel swaps any out-of-place elements in the input groups into the desired output groups. You will need to flag any chunks that have more swaps than the maximum so that you can fix them during subsequent iterations.
Repeat until done
Memory traffic is swaps only (from original element position, to sorted position). I don't know if this algorithm sounds like anything already defined...
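A hedged sketch of step 1 (counting the non-negative keys to find the inflexion point), assuming the keys live in a thrust::device_vector<float>:
#include <thrust/count.h>
#include <thrust/device_vector.h>

struct is_nonneg {
    __host__ __device__ bool operator()(float x) const { return x >= 0.0f; }
};

// Number of non-negative keys == index of the first slot in the negative half.
int inflexion_point(const thrust::device_vector<float> &d_A) {
    return (int)thrust::count_if(d_A.begin(), d_A.end(), is_nonneg());
}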
You should be able to achieve this in thrust simply with a modification of your comparison operator:
struct my_compare
{
__device__ __host__ bool operator()(const float x, const float y) const
{
return (x >= 0.0f) && (y < 0.0f);
}
};
thrust::sort_by_key(thrust::device_ptr<float> (A),
thrust::device_ptr<float> ( A + N ),
thrust::device_ptr<myStruct> ( B ),
my_compare() );

CUDA Thrust and sort_by_key

I’m looking for a sorting algorithm on CUDA that can sort an array A of elements (double) and return an array of keys B for that array A.
I know the sort_by_key function in the Thrust library but I want my array of elements A to remain unchanged.
What can I do?
My code is:
void sortCUDA(double V[], int P[], int N) {
double *Vcpy = (double*) malloc(N*sizeof(double));
memcpy(Vcpy,V,N*sizeof(double));
thrust::sort_by_key(V, V + N, P);
free(Vcpy);
}
I'm comparing the Thrust algorithm against others that I have on a sequential CPU:
N mergesort sortCUDA
113 0.000008 0.000010
226 0.000018 0.000016
452 0.000036 0.000020
905 0.000061 0.000034
1810 0.000135 0.000071
3621 0.000297 0.000156
7242 0.000917 0.000338
14484 0.001421 0.000853
28968 0.003069 0.001931
57937 0.006666 0.003939
115874 0.014435 0.008025
231749 0.031059 0.016718
463499 0.067407 0.039848
926999 0.148170 0.118003
1853998 0.329005 0.260837
3707996 0.731768 0.544357
7415992 1.638445 1.073755
14831984 3.668039 2.150179
115035495 39.276560 19.812200
230070990 87.750377 39.762915
460141980 200.940501 74.605219
Thrust performance is not bad, but I think if I use OpenMP I can probably easily get a better CPU time.
I think this is because of the memcpy.
SOLUTION:
void thrustSort(double V[], int P[], int N)
{
thrust::device_vector<int> d_P(N);
thrust::device_vector<double> d_V(V, V + N);
thrust::sequence(d_P.begin(), d_P.end());
thrust::sort_by_key(d_V.begin(), d_V.end(), d_P.begin());
thrust::copy(d_P.begin(),d_P.end(),P);
}
where V holds my double values to sort
You can modify the comparison operator to sort keys instead of values. @Robert Crovella correctly pointed out that a raw device pointer cannot be assigned from the host. The modified algorithm is below:
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
struct cmp : public thrust::binary_function<int,int,bool>
{
cmp(const double *ptr) : rawA(ptr) { }
__host__ __device__ bool operator()(const int i, const int j) const
{return rawA[i] > rawA[j];}
const double *rawA; // an array in global mem
};
void sortkeys(double *A, int n) {
// move data to the gpu
thrust::device_vector<double> devA(A, A + n);
double *rawA = thrust::raw_pointer_cast(devA.data());
thrust::device_vector<int> B(n);
// initialize keys
thrust::sequence(B.begin(), B.end());
thrust::sort(B.begin(), B.end(), cmp(rawA));
// B now contains the sorted keys
}
And here is an alternative with ArrayFire, though I am not sure which one is more efficient, since the ArrayFire solution uses two additional arrays:
void sortkeys(double *A, int n) {
af::array devA(n, A, af::afHost);
af::array vals, indices;
// sort and populate vals/indices arrays
af::sort(vals, indices, devA);
std::cout << devA << "\n" << indices << "\n";
}
How large is this array? The most efficient way, in terms of speed, will likely be to just duplicate the original array before sorting, if the memory is available.
Building on the answer provided by @asm (I wasn't able to get it working), this code seemed to work for me, and does sort only the keys. However, I believe it is limited to the case where the keys are in sequence 0, 1, 2, 3, 4 ... corresponding to the (double) values. Since this is an "index-value" sort, it could be extended to the case of an arbitrary sequence of keys, perhaps by doing an indexed copy. However, I'm not sure the process of generating the index sequence and then rearranging the original keys will be any faster than just copying the original value data to a new vector (for the case of arbitrary keys).
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
using namespace std;
__device__ double *rawA; // an array in global mem
struct cmp : public binary_function<int, int, bool>
{
__host__ __device__ bool operator()(const int i, const int j) const
{return ( rawA[i] < rawA[j]);}
};
void sortkeys(double *A, int n) {
// move data to the gpu
thrust::device_vector<double> devA(A, A + n);
// rawA = thrust::raw_pointer_cast(&(devA[0]));
double *test = raw_pointer_cast(devA.data());
cudaMemcpyToSymbol(rawA, &test, sizeof(double *));
thrust::device_vector<int> B(n);
// initialize keys
thrust::sequence(B.begin(), B.end());
thrust::sort(B.begin(), B.end(), cmp());
// B now contains the sorted keys
thrust::host_vector<int> hostB = B;
for (int i=0; i<hostB.size(); i++)
std::cout << hostB[i] << " ";
std::cout<<std::endl;
for (int i=0; i<hostB.size(); i++)
std::cout << A[hostB[i]] << " ";
std::cout<<std::endl;
}
int main(){
double C[] = {0.7, 0.3, 0.4, 0.2, 0.6, 1.2, -0.5, 0.5, 0.0, 10.0};
sortkeys(C, 9);
std::cout << std::endl;
return 0;
}
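For the arbitrary-key case discussed above, a hedged sketch of the "indexed copy": obtain the sorting permutation (here via thrust::sort_by_key on a scratch copy of the values, rather than the global-pointer comparator used above), then rearrange the arbitrary keys with thrust::gather:
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

// d_vals is left untouched; d_keys may be any key per element.
// Returns d_keys reordered to match the values in ascending order.
thrust::device_vector<int> sort_keys_by_value(const thrust::device_vector<double> &d_vals,
                                              const thrust::device_vector<int>    &d_keys)
{
    thrust::device_vector<double> scratch = d_vals;        // sortable copy
    thrust::device_vector<int> idx(d_vals.size());
    thrust::sequence(idx.begin(), idx.end());
    thrust::sort_by_key(scratch.begin(), scratch.end(), idx.begin()); // idx = permutation
    thrust::device_vector<int> out(d_keys.size());
    thrust::gather(idx.begin(), idx.end(), d_keys.begin(), out.begin()); // indexed copy
    return out;
}
Note that this copies the value data once anyway, which is exactly the trade-off the paragraph above points out.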
