Segmented Sort with CUDPP/Thrust - sorting

Is it possible to do segmented sort in with CUDPP in CUDA? By segmented sort, I mean to sort elements of array which are protected by flags like below.
A[10,9,8,7,6,5,4,3,2,1]
Flag array[1,0,1,0,0,1,0,0,0,0]
Sort elements of A which are between consecutive 1.
Expected output
[9,10,6,7,8,1,2,3,4,5]

you can do this in a single sorting pass: the idea is to adjust the elements in your array such that sort will relocate elements only within the "segments"
for your example:
A[10,9,8,7,6,5,4,3,2,1]
flag[0,0,1,0,0,1,0,0,0,0]
(I removed the first 1 since it's not needed)
first scan the flag array:
scanned_flag[0,0,1,1,1,2,2,2,2,2]
then you have many options depending on the number types, e.g. for unsigned integers you can set the highest bits to distinguish the "segments".
The easiest way is just to add the largest element multiplied by scanned_flags:
A + scanned_flag*10 = [10,9,18,17,16,25,24,23,22,21]
the rest is easy: sort the array and reverse the transformation.
Here are the two versions: using Arrayfire and thrust. Check whichever you like more.
Arrayfire:
void af_test() {
int A[] = {10,9,8,7,6,5,4,3,2,1};
int S[] = {0, 0,1,0,0,1,0,0,0,0};
int n = sizeof(A) / sizeof(int);
af::array devA(n, A, af::afHost);
af::array devS(n, S, af::afHost);
// obtain the max element
int maxi = af::max< int >(devS);
// scan the keys
// keys = 0,0,1,1,1,2,2,2,2,2
af::array keys = af::accum(devS);
// compute: A = A + keys * maxi
// A = 10,9,18,17,16,25,24,23,22,21
devA = devA + keys * maxi;
// sort the array
// A = 9,10,16,17,18,21,22,23,24,25
devA = af::sort(devA);
// compute: A = A - keys * maxi
// A = 9,10,6,7,8,1,2,3,4,5
devA = devA - keys * maxi;
// print the results
print(devA);
}
Thrust:
template<typename T>
struct add_mul : public binary_function<T,T,T>
{
add_mul(const T& _factor) : factor(_factor) {
}
__host__ __device__ T operator()(const T& a, const T& b) const
{
return (a + b * factor);
}
const T factor;
};
void thrust_test()
{
int A[] = {10,9,8,7,6,5,4,3,2,1};
int S[] = {0, 0,1,0,0,1,0,0,0,0};
int n = sizeof(A) / sizeof(int);
thrust::host_vector< int > hA(A, A + n), hS(S, S + n);
thrust::device_vector< int > devA = hA, devS = hS, keys(n);
// scan the keys
thrust::inclusive_scan(devS.begin(), devS.end(), keys.begin());
// obtain the maximal element
int maxi = *(thrust::max_element(devA.begin(), devA.end()));
// compute: A = A + keys * maxi
thrust::transform(devA.begin(), devA.end(), keys.begin(), devA.begin(), add_mul< int >(maxi));
// sort the array
thrust::sort(devA.begin(), devA.end());
// compute: A = A - keys * maxi
thrust::transform(devA.begin(), devA.end(), keys.begin(), devA.begin(), add_mul< int >(-maxi));
// copy back to the host
hA = devA;
std::cout << "\nSorted array\n";
thrust::copy(hA.begin(), hA.end(), std::ostream_iterator<int>(std::cout, "\n"));
}

Related

Cannot understand hoow to recursively merge sort

Currently self-learning C++ with Daniel Liang's Introduction to C++.
On the topic of the merge sort, I cannot seem to understand how his code is recursively calling itself.
I understand the general concept of the merge sort, but I am having trouble understanding this code specifically.
In this example, we first pass the list 1, 7, 3, 4, 9, 3, 3, 1, 2, and its size (9) to the mergeSort function.
From there, we divide the list into two until the array size reaches 1. In this case, we would get: 1,7,3,4 -> 1,7 -> 1. We then move onto the merge sorting the second half. The second half array would be 7 in this case. We merge the two arrays [1] and [7] and proceed to delete the two arrays that were dynamically allocated to prevent any memory leak.
The part I don't understand is how does this code run from here? After delete[] firstHalf and delete[] secondHalf. From my understanding, shouldn't there be another mergeSort function call in order to merge sort the new firstHalf and secondHalf?
#include <iostream>
using namespace std;
// Function prototype
void arraycopy(int source[], int sourceStartIndex,
int target[], int targetStartIndex, int length);
void merge(int list1[], int list1Size,
int list2[], int list2Size, int temp[]);
// The function for sorting the numbers
void mergeSort(int list[], int arraySize)
{
if (arraySize > 1)
{
// Merge sort the first half
int* firstHalf = new int[arraySize / 2];
arraycopy(list, 0, firstHalf, 0, arraySize / 2);
mergeSort(firstHalf, arraySize / 2);
// Merge sort the second half
int secondHalfLength = arraySize - arraySize / 2;
int* secondHalf = new int[secondHalfLength];
arraycopy(list, arraySize / 2, secondHalf, 0, secondHalfLength);
mergeSort(secondHalf, secondHalfLength);
// Merge firstHalf with secondHalf
merge(firstHalf, arraySize / 2, secondHalf, secondHalfLength,
list);
delete [] firstHalf;
delete [] secondHalf;
}
}
void merge(int list1[], int list1Size,
int list2[], int list2Size, int temp[])
{
int current1 = 0; // Current index in list1
int current2 = 0; // Current index in list2
int current3 = 0; // Current index in temp
while (current1 < list1Size && current2 < list2Size)
{
if (list1[current1] < list2[current2])
temp[current3++] = list1[current1++];
else
temp[current3++] = list2[current2++];
}
while (current1 < list1Size)
temp[current3++] = list1[current1++];
while (current2 < list2Size)
temp[current3++] = list2[current2++];
}
void arraycopy(int source[], int sourceStartIndex,
int target[], int targetStartIndex, int length)
{
for (int i = 0; i < length; i++)
{
target[i + targetStartIndex] = source[i + sourceStartIndex];
}
}
int main()
{
const int SIZE = 9;
int list[] = {1, 7, 3, 4, 9, 3, 3, 1, 2};
mergeSort(list, SIZE);
for (int i = 0; i < SIZE; i++)
cout << list[i] << " ";
return 0;
}
From my understanding, shouldn't there be another mergeSort function
call in order to merge sort the new firstHalf and secondHalf?
It is happening implicitly during the recursive call. When you reach these two lines:
delete [] firstHalf;
delete [] secondHalf;
It means that one call to mergeSort is completed. If this call belongs to merging a first half, then code starts from the line after, i.e. these lines:
// Merge sort the second half
int secondHalfLength = arraySize - arraySize / 2;
...
But, if this call belongs to merging of the second half, then the control goes back to the line just after that call, i.e. these lines:
// Merge firstHalf with secondHalf
merge(firstHalf, arraySize / 2, secondHalf, secondHalfLength,
list);
And everything if doing well as planned.

Find all anagrams in a string O(n) solution

Here is the problem:
Given a string s and a non-empty string p, find all the start indices of p's anagrams in s.
Input: s: "cbaebabacd" p: "abc"
Output: [0, 6]
Input: s: "abab" p: "ab"
Output: [0, 1, 2]
Here is my solution
vector<int> findAnagrams(string s, string p) {
vector<int> res, s_map(26,0), p_map(26,0);
int s_len = s.size();
int p_len = p.size();
if (s_len < p_len) return res;
for (int i = 0; i < p_len; i++) {
++s_map[s[i] - 'a'];
++p_map[p[i] - 'a'];
}
if (s_map == p_map)
res.push_back(0);
for (int i = p_len; i < s_len; i++) {
++s_map[s[i] - 'a'];
--s_map[s[i - p_len] - 'a'];
if (s_map == p_map)
res.push_back(i - p_len + 1);
}
return res;
}
However, I think it is O(n^2) solution because I have to compare vectors s_map and p_map.
Does a O(n) solution exist for this problem?
lets say p has size n.
lets say you have an array A of size 26 that is filled with the number of a,b,c,... which p contains.
then you create a new array B of size 26 filled with 0.
lets call the given (big) string s.
first of all you initialize B with the number of a,b,c,... in the first n chars of s.
then you iterate through each word of size n in s always updating B to fit this n-sized word.
always B matches A you will have an index where we have an anagram.
to change B from one n-sized word to another, notice you just have to remove in B the first char of the previous word and add the new char of the next word.
Look at the example:
Input
s: "cbaebabacd"
p: "abc" n = 3 (size of p)
A = {1, 1, 1, 0, 0, 0, ... } // p contains just 1a, 1b and 1c.
B = {1, 1, 1, 0, 0, 0, ... } // initially, the first n-sized word contains this.
compare(A,B)
for i = n; i < size of s; i++ {
B[ s[i-n] ]--;
B[ s[ i ] ]++;
compare(A,B)
}
and suppose that compare(A,B) prints the index always A matches B.
the total complexity will be:
first fill of A = O(size of p)
first fill of B = O(size of s)
first comparison = O(26)
for-loop = |s| * (2 + O(26)) = |s| * O(28) = O(28|s|) = O(size of s)
____________________________________________________________________
2 * O(size of s) + O(size of p) + O(26)
which is linear in size of s.
Your solution is the O(n) solution. The size of the s_map and p_map vectors is a constant (26) that doesn't depend on n. So the comparison between s_map and p_map takes a constant amount of time regardless of how big n is.
Your solution takes about 26 * n integer comparisons to complete, which is O(n).
// In papers on string searching algorithms, the alphabet is often
// called Sigma, and it is often not considered a constant. Your
// algorthm works in (Sigma * n) time, where n is the length of the
// longer string. Below is an algorithm that works in O(n) time even
// when Sigma is too large to make an array of size Sigma, as long as
// values from Sigma are a constant number of "machine words".
// This solution works in O(n) time "with high probability", meaning
// that for all c > 2 the probability that the algorithm takes more
// than c*n time is 1-o(n^-c). This is a looser bound than O(n)
// worst-cast because it uses hash tables, which depend on randomness.
#include <functional>
#include <iostream>
#include <type_traits>
#include <vector>
#include <unordered_map>
#include <vector>
using namespace std;
// Finding a needle in a haystack. This works for any iterable type
// whose members can be stored as keys of an unordered_map.
template <typename T>
vector<size_t> AnagramLocations(const T& needle, const T& haystack) {
// Think of a contiguous region of an ordered container as
// representing a function f with the domain being the type of item
// stored in the container and the codomain being the natural
// numbers. We say that f(x) = n when there are n x's in the
// contiguous region.
//
// Then two contiguous regions are anagrams when they have the same
// function. We can track how close they are to being anagrams by
// subtracting one function from the other, pointwise. When that
// difference is uniformly 0, then the regions are anagrams.
unordered_map<remove_const_t<remove_reference_t<decltype(*needle.begin())>>,
intmax_t> difference;
// As we iterate through the haystack, we track the lead (part
// closest to the end) and lag (part closest to the beginning) of a
// contiguous region in the haystack. When we move the region
// forward by one, one part of the function f is increased by +1 and
// one part is decreased by -1, so the same is true of difference.
auto lag = haystack.begin(), lead = haystack.begin();
// To compare difference to the uniformly-zero function in O(1)
// time, we make sure it does not contain any points that map to
// 0. The the property of being uniformly zero is the same as the
// property of having an empty difference.
const auto find = [&](const auto& x) {
difference[x]++;
if (0 == difference[x]) difference.erase(x);
};
const auto lose = [&](const auto& x) {
difference[x]--;
if (0 == difference[x]) difference.erase(x);
};
vector<size_t> result;
// First we initialize the difference with the first needle.size()
// items from both needle and haystack.
for (const auto& x : needle) {
lose(x);
find(*lead);
++lead;
if (lead == haystack.end()) return result;
}
size_t i = 0;
if (difference.empty()) result.push_back(i++);
// Now we iterate through the haystack with lead, lag, and i (the
// position of lag) updating difference in O(1) time at each spot.
for (; lead != haystack.end(); ++lead, ++lag, ++i) {
find(*lead);
lose(*lag);
if (difference.empty()) result.push_back(i);
}
return result;
}
int main() {
string needle, haystack;
cin >> needle >> haystack;
const auto result = AnagramLocations(needle, haystack);
for (auto x : result) cout << x << ' ';
}
import java.util.*;
public class FindAllAnagramsInAString_438{
public static void main(String[] args){
String s="abab";
String p="ab";
// String s="cbaebabacd";
// String p="abc";
System.out.println(findAnagrams(s,p));
}
public static List<Integer> findAnagrams(String s, String p) {
int i=0;
int j=p.length();
List<Integer> list=new ArrayList<>();
while(j<=s.length()){
//System.out.println("Substring >>"+s.substring(i,j));
if(isAnamgram(s.substring(i,j),p)){
list.add(i);
}
i++;
j++;
}
return list;
}
public static boolean isAnamgram(String s,String p){
HashMap<Character,Integer> map=new HashMap<>();
if(s.length()!=p.length()) return false;
for(int i=0;i<s.length();i++){
char chs=s.charAt(i);
char chp=p.charAt(i);
map.put(chs,map.getOrDefault(chs,0)+1);
map.put(chp,map.getOrDefault(chp,0)-1);
}
for(int val:map.values()){
if(val!=0) return false;
}
return true;
}
}

Parallel radix sort with virtual memory and write-combining

I'm attempting to implement the variant of parallel radix sort described in http://arxiv.org/pdf/1008.2849v2.pdf (Algorithm 2), but my C++ implementation (for 4 digits in base 10) contains a bug that I'm unable to locate.
For debugging purposes I'm using no parallelism, but the code should still sort correctly.
For instance the line arr.at(i) = item accesses indices outside its bounds in the following
std::vector<int> v = {4612, 4598};
radix_sort2(v);
My implementation is as follows
#include <set>
#include <array>
#include <vector>
void radix_sort2(std::vector<int>& arr) {
std::array<std::set<int>, 10> buckets3;
for (const int item : arr) {
int d = item / 1000;
buckets3.at(d).insert(item);
}
//Prefix sum
std::array<int, 10> outputIndices;
outputIndices.at(0) = 0;
for (int i = 1; i < 10; ++i) {
outputIndices.at(i) = outputIndices.at(i - 1) +
buckets3.at(i - 1).size();
}
for (const auto& bucket3 : buckets3) {
std::array<std::set<int>, 10> buckets0, buckets1;
std::array<int, 10> histogram2 = {};
for (const int item : bucket3) {
int d = item % 10;
buckets0.at(d).insert(item);
}
for (const auto& bucket0 : buckets0) {
for (const int item : bucket0) {
int d = (item / 10) % 10;
buckets1.at(d).insert(item);
int d2 = (item / 100) % 10;
++histogram2.at(d2);
}
}
for (const auto& bucket1 : buckets1) {
for (const int item : bucket1) {
int d = (item / 100) % 10;
int i = outputIndices.at(d) + histogram2.at(d);
++histogram2.at(d);
arr.at(i) = item;
}
}
}
}
Can anyone spot my mistake?
I took at look at the paper you linked. You haven't made any mistakes, none that I can see. In fact, in my estimation, you corrected a mistake in the algorithm.
I wrote out the algorithm and ended up with the exact same problem as you did. After reviewing Algorithm 2, either I woefully mis-understand how it is supposed to work, or it is flawed. There are at least a couple of problems with the algorithm, specifically revolving around outputIndices, and histogram2.
Looking at the algorithm, the final index of an item is determined by the counting sort stored in outputIndices. (lets ignore the histogram for now).
If you had an inital array of numbers {0100, 0103, 0102, 0101} The prefix sum of that would be 4.
The algorithm makes no indication I can determine to lag the result by 1. That being said, in order for the algorithm to work the way they intend, it does have to be lagged, so, moving on.
Now, the prefix sums are 0, 4, 4.... The algorithm doesn't use the MSD as the index into the outputIndices array, it uses "MSD - 1"; So taking 1 as the index into the array, the starting index for the first item without the histogram is 4! Outside the array on the first try.
The outputIndices is built with the MSD, it makes sense for it to be accessed by MSD.
Further, even if you tweak the algorithm to correctly to use the MSD into the outputIndices, it still won't sort correctly. With your initial inputs (swapped) {4598, 4612}, they will stay in that order. They are sorted (locally) as if they are 2 digit numbers. If you increase it to have other numbers not starting with 4, they will be globally, sorted, but the local sort is never finished.
According to the paper the goal is to use the histogram to do that, but I don't see that happening.
Ultimately, I'm assuming, what you want is an algorithm that works the way described. I've modified the algorithm, keeping with the overall stated goal of the paper of using the MSD to do a global sort, and the rest of the digits by reverse LSD.
I don't think these changes should have any impact on your desire to parallel-ize the function.
void radix_sort2(std::vector<int>& arr)
{
std::array<std::vector<int>, 10> buckets3;
for (const int item : arr)
{
int d = item / 1000;
buckets3.at(d).push_back(item);
}
//Prefix sum
std::array<int, 10> outputIndices;
outputIndices.at(0) = 0;
for (int i = 1; i < 10; ++i)
{
outputIndices.at(i) = outputIndices.at(i - 1) + buckets3.at(i - 1).size();
}
for (const auto& bucket3 : buckets3)
{
if (bucket3.size() <= 0)
continue;
std::array<std::vector<int>, 10> buckets0, buckets1, buckets2;
for (const int item : bucket3)
buckets0.at(item % 10).push_back(item);
for (const auto& bucket0 : buckets0)
for (const int item : bucket0)
buckets1.at((item / 10) % 10).push_back(item);
for (const auto& bucket1 : buckets1)
for (const int item : bucket1)
buckets2.at((item / 100) % 10).push_back(item);
int count = 0;
for (const auto& bucket2 : buckets2)
{
for (const int item : bucket2)
{
int d = (item / 1000) % 10;
int i = outputIndices.at(d) + count;
++count;
arr.at(i) = item;
}
}
}
}
For extensiblility, it would probably make sense to create a helper function that does the local sorting. You should be able to extend it to handle any number of digit numbers that way.

Filter only digit sequences containing a given set of digits

I have a large list of digit strings like this one. The individual strings are relatively short (say less than 50 digits).
data = [
'300303334',
'53210234',
'123456789',
'5374576807063874'
]
I need to find out a efficient data structure (speed first, memory second) and algorithm which returns only those strings that are composed of a given set of digits.
Example results:
filter(data, [0,3,4]) = ['300303334']
filter(data, [0,1,2,3,4,5]) = ['300303334', '53210234']
The data list will usually fit into memory.
For each digit, precompute a postings list that don't contain the digit.
postings = [[] for _ in xrange(10)]
for i, d in enumerate(data):
for j in xrange(10):
digit = str(j)
if digit not in d:
postings[j].append(i)
Now, to find all strings that contain, for example, just the digits [1, 3, 5] you can merge the postings lists for the other digits (ie: 0, 2, 4, 6, 7, 8, 9).
def intersect_postings(p0, p1):
i0, i1 = next(p0), next(p1)
while True:
if i0 == i1:
yield i0
i0, i1 = next(p0), next(p1)
elif i0 < i1: i0 = next(p0)
else: i1 = next(p1)
def find_all(digits):
p = None
for d in xrange(10):
if d not in digits:
if p is None: p = iter(postings[d])
else: p = intersect_postings(p, iter(postings[d]))
return (data[i] for i in p) if p else iter(data)
print list(find_all([0, 3, 4]))
print list(find_all([0, 1, 2, 3, 4, 5]))
A string can be encoded by a 10-bit number. There are 2^10, or 1,024 possible values.
So create a dictionary that uses an integer for a key and a list of strings for the value.
Calculate the value for each string and add that string to the list of strings for that value.
General idea:
Dictionary Lookup;
for each (string in list)
value = 0;
for each character in string
set bit N in value, where N is the character (0-9)
Lookup[value] += string // adds string to list for this value in dictionary
Then, to get a list of the strings that match your criteria, just compute the value and do a direct dictionary lookup.
So if the user asks for strings that contain only 3, 5, and 7:
value = (1 << 3) || (1 << 5) || (1 << 7);
list = Lookup[value];
Note that, as Matt pointed out in comment below, this will only return strings that contain all three digits. So, for example, it wouldn't return 37. That seems like a fatal flaw to me.
Edit
If the number of symbols you have to deal with is very large, then the number of possible combinations becomes too large for this solution to be practical.
With a large number of symbols, I'd recommend an inverted index as suggested in the comments, combined with a secondary filter that removes the strings that contain extraneous digits.
Consider a function f which constructs a bitmask for each string with bit i set if digit i is in the string.
For example,
f('0') = 0b0000000001
f('00') = 0b0000000001
f('1') = 0b0000000010
f('1100') = 0b0000000011
Then I suggest storing a list of strings for each bitmask.
For example,
Bitmask 0b0000000001 -> ['0','00']
Once you have prepared this data structure (which is the same size as your original list), you can then easily access all the strings for a particular filter by accessing all lists where the bitmask is a subset of the digits in your filter.
So for your example of filter [0,3,4] you would return the lists from:
Strings containing just 0
Strings containing just 3
Strings containing just 4
Strings containing 0 and 3
Strings containing 0 and 4
Strings containing 3 and 4
Strings containing 0 and 3 and 4
Example Python Code
from collections import defaultdict
import itertools
raw_data = [
'300303334',
'53210234',
'123456789',
'5374576807063874'
]
def preprocess(raw_data):
data = defaultdict(list)
for s in raw_data:
bitmask = 0
for digit in s:
bitmask |= 1<<int(digit)
data[bitmask].append(s)
return data
def filter(data,mask):
for r in range(len(mask)):
for m in itertools.combinations(mask,r+1):
bitmask = sum(1<<digit for digit in m)
for s in data[bitmask]:
yield s
data = preprocess(raw_data)
for a in filter(data, [0,1,2,3,4,5]):
print a
Just for kicks, I have coded up Jim's lovely algorithm and the Perl is here if anyone wants to play with it. Please do not accept this as an answer or anything, pass all credit to Jim:
#!/usr/bin/perl
use strict;
use warnings;
my $Debug=1;
my $Nwords=1000;
my ($word,$N,$value,$i,$j,$k);
my (#dictionary,%Lookup);
################################################################################
# Generate "words" with random number of characters 5-30
################################################################################
print "DEBUG: Generating $Nwords word dictionary\n" if $Debug;
for($i=0;$i<$Nwords;$i++){
$j = rand(25) + 5; # length of this word
$word="";
for($k=0;$k<$j;$k++){
$word = $word . int(rand(10));
}
$dictionary[$i]=$word;
print "$word\n" if $Debug;
}
# Add some obvious test cases
$dictionary[++$i]="0" x 50;
$dictionary[++$i]="1" x 50;
$dictionary[++$i]="2" x 50;
$dictionary[++$i]="3" x 50;
$dictionary[++$i]="4" x 50;
$dictionary[++$i]="5" x 50;
$dictionary[++$i]="6" x 50;
$dictionary[++$i]="7" x 50;
$dictionary[++$i]="8" x 50;
$dictionary[++$i]="9" x 50;
$dictionary[++$i]="0123456789";
################################################################################
# Encode words
################################################################################
for $word (#dictionary){
$value=0;
for($i=0;$i<length($word);$i++){
$N=substr($word,$i,1);
$value |= 1 << $N;
}
push(#{$Lookup{$value}},$word);
print "DEBUG: $word encoded as $value\n" if $Debug;
}
################################################################################
# Do lookups
################################################################################
while(1){
print "Enter permitted digits, separated with commas: ";
my $line=<STDIN>;
my #digits=split(",",$line);
$value=0;
for my $d (#digits){
$value |= 1<<$d;
}
print "Value: $value\n";
print join(", ",#{$Lookup{$value}}),"\n\n" if defined $Lookup{$value};
}
I like Jim Mischel's approach. It has pretty efficient look up and bounded memory usage. Code in C follows:
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <readline/readline.h>
#include <readline/history.h>
enum {
zero = '0',
nine = '9',
numbers = nine - zero + 1,
masks = 1 << numbers,
};
typedef uint16_t mask;
struct list {
char *s;
struct list *next;
};
typedef struct list list_cell;
typedef struct list *list;
static inline int is_digit(char c) { return c >= zero && c <= nine; }
static inline mask char2mask(char c) { return 1 << (c - zero); }
static inline mask add_char2mask(mask m, char c) {
return m | (is_digit(c) ? char2mask(c) : 0);
}
static inline int is_set(mask m, mask n) { return (m & n) != 0; }
static inline int is_set_char(mask m, char c) { return is_set(m, char2mask(c)); }
static inline int is_submask(mask sub, mask m) { return (sub & m) == sub; }
static inline char *sprint_mask(char buf[11], mask m) {
char *s = buf;
char i;
for(i = zero; i <= nine; i++)
if(is_set_char(m, i)) *s++ = i;
*s = 0;
return buf;
}
static inline mask get_mask(char *s) {
mask m=0;
for(; *s; s++)
m = add_char2mask(m, *s);
return m;
}
static inline int is_empty(list l) { return !l; }
static inline list insert(list *l, char *s) {
list cell = (list)malloc(sizeof(list_cell));
cell->s = s;
cell->next = *l;
return *l = cell;
}
static void *foreach(void *f(char *, void *), list l, void *init) {
for(; !is_empty(l); l = l->next)
init = f(l->s, init);
return init;
}
struct printer_state {
int first;
FILE *f;
};
static void *prin_list_member(char *s, void *data) {
struct printer_state *st = (struct printer_state *)data;
if(st->first) {
fputs(", ", st->f);
} else
st->first = 1;
fputs(s, st->f);
return data;
}
static void print_list(list l) {
struct printer_state st = {.first = 0, .f = stdout};
foreach(prin_list_member, l, (void *)&st);
putchar('\n');
}
static list *init_lu(void) { return (list *)calloc(sizeof(list), masks); }
static list *insert2lu(list lu[masks], char *s) {
mask i, m = get_mask(s);
if(m) // skip string without any number
for(i = m; i < masks; i++)
if(is_submask(m, i))
insert(lu+i, s);
return lu;
}
int usage(const char *name) {
fprintf(stderr, "Usage: %s filename\n", name);
return EXIT_FAILURE;
}
#define handle_error(msg) \
do { perror(msg); exit(EXIT_FAILURE); } while (0)
static inline void chomp(char *s) { if( (s = strchr(s, '\n')) ) *s = '\0'; }
list *load_file(FILE *f) {
char *line = NULL;
size_t len = 0;
ssize_t read;
list *lu = init_lu();
for(; (read = getline(&line, &len, f)) != -1; line = NULL) {
chomp(line);
insert2lu(lu, line);
}
return lu;
}
void read_reqs(list *lu) {
char *line;
char buf[11];
for(; (line = readline("> ")); free(line))
if(*line) {
add_history(line);
mask m = get_mask(line);
printf("mask: %s\nstrings: ", sprint_mask(buf, m));
print_list(lu[m]);
};
putchar('\n');
}
int main(int argc, const char* argv[] ) {
const char *name = argv[0];
FILE *f;
list *lu;
if(argc != 2) return usage(name);
f = fopen(argv[1], "r");
if(!f) handle_error("open");
lu = load_file(f);
fclose(f);
read_reqs(lu);
return EXIT_SUCCESS;
}
To compile use
gcc -lreadline -o digitfilter digitfilter.c
And test run:
$ cat data.txt
300303334
53210234
123456789
5374576807063874
$ ./digitfilter data.txt
> 034
mask: 034
strings: 300303334
> 0,1,2,3,4,5
mask: 012345
strings: 53210234, 300303334
> 0345678
mask: 0345678
strings: 5374576807063874, 300303334
Put each value into a set-- Eg.: '300303334'={3, 0, 4}.
Since the length of your data items are bound by a constant (50),
you can do these at O(1) time for each item using Java HashSet. The overall complexity of this phase adds up to O(n).
For each filter set, use containsAll() of HashSet to see whether
each of these data items is a subset of your filter. Takes O(n).
Takes O(m*n) in the overall where n is the number of data items and m the number of filters.

CUDA Thrust and sort_by_key

I’m looking for a sorting algorithm on CUDA that can sort an array A of elements (double) and returns an array of keys B for that array A.
I know the sort_by_key function in the Thrust library but I want my array of elements A to remain unchanged.
What can I do?
My code is:
void sortCUDA(double V[], int P[], int N) {
real_t *Vcpy = (double*) malloc(N*sizeof(double));
memcpy(Vcpy,V,N*sizeof(double));
thrust::sort_by_key(V, V + N, P);
free(Vcpy);
}
i'm comparing the thrust algorithm against others that i have on sequencial cpu
N mergesort sortCUDA
113 0.000008 0.000010
226 0.000018 0.000016
452 0.000036 0.000020
905 0.000061 0.000034
1810 0.000135 0.000071
3621 0.000297 0.000156
7242 0.000917 0.000338
14484 0.001421 0.000853
28968 0.003069 0.001931
57937 0.006666 0.003939
115874 0.014435 0.008025
231749 0.031059 0.016718
463499 0.067407 0.039848
926999 0.148170 0.118003
1853998 0.329005 0.260837
3707996 0.731768 0.544357
7415992 1.638445 1.073755
14831984 3.668039 2.150179
115035495 39.276560 19.812200
230070990 87.750377 39.762915
460141980 200.940501 74.605219
Thrust performance is not bad, but I think if I use OMP can probably get easily a better CPU time
I think this is because to memcpy
SOLUTION:
void thrustSort(double V[], int P[], int N)
{
thrust::device_vector<int> d_P(N);
thrust::device_vector<double> d_V(V, V + N);
thrust::sequence(d_P.begin(), d_P.end());
thrust::sort_by_key(d_V.begin(), d_V.end(), d_P.begin());
thrust::copy(d_P.begin(),d_P.end(),P);
}
where V is a my double values to sort
You can modify comparison operator to sort keys instead of values. #Robert Crovella correctly pointed that a raw device pointer cannot be assigned from the host. The modified algorithm is below:
struct cmp : public binary_function<int,int,bool>
{
cmp(const double *ptr) : rawA(ptr) { }
__host__ __device__ bool operator()(const int i, const int j) const
{return rawA[i] > rawA[j];}
const double *rawA; // an array in global mem
};
void sortkeys(double *A, int n) {
// move data to the gpu
thrust::device_vector<double> devA(A, A + n);
double *rawA = thrust::raw_pointer_cast(devA.data());
thrust::device_vector<int> B(n);
// initialize keys
thrust::sequence(B.begin(), B.end());
thrust::sort(B.begin(), B.end(), cmp(rawA));
// B now contains the sorted keys
}
And here is alternative with arrayfire. Though I am not sure which one is more efficient since arrayfire solution uses two additional arrays:
void sortkeys(double *A, int n) {
af::array devA(n, A, af::afHost);
af::array vals, indices;
// sort and populate vals/indices arrays
af::sort(vals, indices, devA);
std::cout << devA << "\n" << indices << "\n";
}
How large is this array? The most efficient way, in terms of speed, will likely be to just duplicate the original array before sorting, if the memory is available.
Building on the answer provided by #asm (I wasn't able to get it working), this code seemed to work for me, and does sort only the keys. However, I believe it is limited to the case where the keys are in sequence 0, 1, 2, 3, 4 ... corresponding to the (double) values. Since this is a "index-value" sort, it could be extended to the case of an arbitrary sequence of keys, perhaps by doing an indexed copy. However I'm not sure the process of generating the index sequence and then rearranging the original keys will be any faster than just copying the original value data to a new vector (for the case of arbitrary keys).
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
using namespace std;
__device__ double *rawA; // an array in global mem
struct cmp : public binary_function<int, int, bool>
{
__host__ __device__ bool operator()(const int i, const int j) const
{return ( rawA[i] < rawA[j]);}
};
void sortkeys(double *A, int n) {
// move data to the gpu
thrust::device_vector<double> devA(A, A + n);
// rawA = thrust::raw_pointer_cast(&(devA[0]));
double *test = raw_pointer_cast(devA.data());
cudaMemcpyToSymbol(rawA, &test, sizeof(double *));
thrust::device_vector<int> B(n);
// initialize keys
thrust::sequence(B.begin(), B.end());
thrust::sort(B.begin(), B.end(), cmp());
// B now contains the sorted keys
thrust::host_vector<int> hostB = B;
for (int i=0; i<hostB.size(); i++)
std::cout << hostB[i] << " ";
std::cout<<std::endl;
for (int i=0; i<hostB.size(); i++)
std::cout << A[hostB[i]] << " ";
std::cout<<std::endl;
}
int main(){
double C[] = {0.7, 0.3, 0.4, 0.2, 0.6, 1.2, -0.5, 0.5, 0.0, 10.0};
sortkeys(C, 9);
std::cout << std::endl;
return 0;
}

Resources