Processes read data from the same file

I have a mesh file and I partitioned it using METIS (into 4 parts/processes). METIS provided me with the partitioning file of the mesh (a file giving, for each element of the mesh, the number of the process it belongs to). My job now is to feed this information into my parallel code. I tried to do it by letting each process access the same mesh file and read the data it needs, based on the partitioning file.
#include <iostream>
#include <fstream>
#include <sstream>
#include "mpi.h"
using namespace std;

//each process stores the partitioning
int* PartitionFile(){
    ifstream file("epart.txt");
    const int NE = 14;
    int part, i = 0;
    int *partition = new int[NE];
    while (file >> part){
        partition[i] = part;
        i++;
    }
    file.close();
    return partition;
}
int FindSizeOfLocalElements(int *epart, int rank){
    int size = 0;
    for (int i = 0; i < 14; i++){
        if (epart[i] == rank){
            size += 1;
        }
    }
    return size;
}

//stores the elements of each process
int* LocalPartition(int *epart, int size, int rank){
    int *localPart = new int[size];
    int j = 0;
    for (int i = 0; i < 14; i++){
        if (epart[i] == rank){
            localPart[j] = i + 1; //+1 because elements start from 1 (global numbering)
            j += 1;
        }
    }
    return localPart;
}
int** ElementConnectivityMeshFile(int *localPart, int size){
    ifstream file("mesh.txt");
    int node1, node2, node3;
    int elem = 1;
    int i = 0;
    int **elemConn = new int*[size];
    for (int j = 0; j < size; j++){
        elemConn[j] = new int[3]; //each element has 3 nodes. Here elements have local numbering; global numbering is stored in localPart
    }
    while (file >> node1 >> node2 >> node3){
        if (i < size && elem == localPart[i]){ //the i<size guard prevents reading localPart out of bounds once all local elements are found
            elemConn[i][0] = node1;
            elemConn[i][1] = node2;
            elemConn[i][2] = node3;
            i += 1;
        }
        elem += 1;
        if (elem > 14){ break; }
    }
    file.close();
    return elemConn;
}
int main(){
    MPI_Init(NULL, NULL);
    int numProc;
    MPI_Comm_size(MPI_COMM_WORLD, &numProc);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int *epart = PartitionFile();
    int size = FindSizeOfLocalElements(epart, rank);
    int *elem = LocalPartition(epart, size, rank);
    int **elemConn = ElementConnectivityMeshFile(elem, size);
    //free everything that was allocated with new
    for (int i = 0; i < size; i++){ delete[] elemConn[i]; }
    delete[] elemConn;
    delete[] elem;
    delete[] epart;
    MPI_Finalize();
    return 0;
}
This part of the code gives me the desired results; however, I want to know how efficient it is to let MPI processes read the same file using C++ standard functions, and whether that can affect scalability and performance. For this demonstration I used a mesh of 14 elements and 4 processes.
mesh file
1 3 2
2 3 4
3 5 4
4 5 6
5 7 6
8 7 5
3 8 5
9 7 8
9 8 3
1 9 3
10 9 1
11 10 1
11 1 12
12 1 2
epart file
2
2
0
0
0
1
0
1
1
3
3
3
2
2

I think this program illustrates the basic approach using MPI-IO with binary files:
#include <stdio.h>
#include <mpi.h>

#define NELEM 14
#define NVERT 3

int meshfile[NELEM][NVERT] =
  { { 1,  3,  2},
    { 2,  3,  4},
    { 3,  5,  4},
    { 4,  5,  6},
    { 5,  7,  6},
    { 8,  7,  5},
    { 3,  8,  5},
    { 9,  7,  8},
    { 9,  8,  3},
    { 1,  9,  3},
    {10,  9,  1},
    {11, 10,  1},
    {11,  1, 12},
    {12,  1,  2} };

int partfile[NELEM] = {2, 2, 0, 0, 0, 1, 0, 1, 1, 3, 3, 3, 2, 2};
int main(void)
{
    int i;
    int part[NELEM];
    int mesh[NELEM][NVERT];
    /* Should really malloc smaller mesh based on local size */
    FILE *fp;
    int rank, size;
    MPI_Comm comm;
    MPI_Status status;
    MPI_File fh;
    MPI_Datatype filetype;
    int disp[NELEM];
    int nelemlocal;
    /* Should really malloc smaller disp based on nelemlocal */
    comm = MPI_COMM_WORLD;
    MPI_Init(NULL, NULL);
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);
    if (rank == 0)
    {
        printf("Running on %d processes\n", size);
        // data files should already exist but create them here so we
        // have a self-contained program; "wb" since these are binary files
        fp = fopen("mesh.dat", "wb");
        fwrite(meshfile, sizeof(int), NELEM*NVERT, fp);
        fclose(fp);
        fp = fopen("part.dat", "wb");
        fwrite(partfile, sizeof(int), NELEM, fp);
        fclose(fp);
    }
    // make sure the files exist before every rank tries to open them
    MPI_Barrier(comm);

    // could read on rank 0 and broadcast, but when using MPI-IO the
    // "readall" should take an efficient collective approach;
    // every rank reads the whole partition file
    MPI_File_open(comm, "part.dat", MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
    MPI_File_set_view(fh, 0, MPI_INT, MPI_INT, "native", MPI_INFO_NULL);
    MPI_File_read_all(fh, part, NELEM, MPI_INT, &status);
    MPI_File_close(&fh);

    nelemlocal = 0;
    // pick out local elements and record displacements
    for (i = 0; i < NELEM; i++)
    {
        if (part[i] == rank)
        {
            disp[nelemlocal] = i*NVERT;
            nelemlocal += 1;
        }
    }
    printf("on rank %d, nelemlocal = %d\n", rank, nelemlocal);

    // create the MPI datatype to use as the filetype, which is
    // effectively a mask that selects only the elements for this rank
    MPI_Type_create_indexed_block(nelemlocal, NVERT, disp, MPI_INT, &filetype);
    MPI_Type_commit(&filetype);

    MPI_File_open(comm, "mesh.dat", MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
    // set the file view appropriate to this rank
    MPI_File_set_view(fh, 0, MPI_INT, filetype, "native", MPI_INFO_NULL);
    // each rank only reads its own set of elements based on the file view
    MPI_File_read_all(fh, mesh, nelemlocal*NVERT, MPI_INT, &status);
    MPI_File_close(&fh);

    // check we got the correct data
    for (i = 0; i < nelemlocal; i++)
    {
        printf("on rank %d, mesh[%d] = %d, %d, %d\n",
               rank, i, mesh[i][0], mesh[i][1], mesh[i][2]);
    }

    MPI_Finalize();
}
and it seems to give the right answer:
dsh#laptop$ mpicc -o metisio metisio.c
dsh#laptop$ mpirun -n 4 ./metisio | sort
on rank 0, mesh[0] = 3, 5, 4
on rank 0, mesh[1] = 4, 5, 6
on rank 0, mesh[2] = 5, 7, 6
on rank 0, mesh[3] = 3, 8, 5
on rank 0, nelemlocal = 4
on rank 1, mesh[0] = 8, 7, 5
on rank 1, mesh[1] = 9, 7, 8
on rank 1, mesh[2] = 9, 8, 3
on rank 1, nelemlocal = 3
on rank 2, mesh[0] = 1, 3, 2
on rank 2, mesh[1] = 2, 3, 4
on rank 2, mesh[2] = 11, 1, 12
on rank 2, mesh[3] = 12, 1, 2
on rank 2, nelemlocal = 4
on rank 3, mesh[0] = 1, 9, 3
on rank 3, mesh[1] = 10, 9, 1
on rank 3, mesh[2] = 11, 10, 1
on rank 3, nelemlocal = 3
Running on 4 processes
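For comparison, the read-on-root-and-broadcast alternative mentioned in the comments would look roughly like this (a minimal sketch, assuming the 14-entry epart.txt text file from the question; only rank 0 touches the file system, and the array is then distributed with MPI_Bcast):

#include <fstream>
#include "mpi.h"

int main() {
    MPI_Init(NULL, NULL);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int NE = 14;
    int part[NE];
    if (rank == 0) {
        // only rank 0 reads the file, with standard C++ streams
        std::ifstream file("epart.txt");
        for (int i = 0; i < NE; i++) file >> part[i];
    }
    // every other rank receives the partition array over the network
    MPI_Bcast(part, NE, MPI_INT, 0, MPI_COMM_WORLD);
    // ... continue as in the question: keep elements with part[i] == rank
    MPI_Finalize();
    return 0;
}

This avoids N processes hammering the same file, at the cost of a broadcast; for large meshes the MPI-IO collective read above is generally the more scalable option.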

Related

Boost Graph max-flow algorithm to find out the arcs on the minimal S/T cut

I have an application where, for a given fixed number of vertices, a large number of different max-flow problems must be solved from a given fixed source (S) to a given fixed sink (T). Each max-flow problem differs in that the directed arcs themselves change, along with their capacities. As an example, see below.
The number of vertices remains fixed, but the actual arcs and their capacities differ from one problem to the next.
I have the following code that solves the max-flow problem iteratively for Graph 1 and Graph 2 in the figure above using Boost (apologies for the wall of text, I have tried to make it as minimal as possible; the code below fully compiles with g++ on my Linux box, but I am unable to have it compile correctly on online compilers such as wandbox, etc.):
#include <boost/config.hpp>
#include <iostream>
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/boykov_kolmogorov_max_flow.hpp>

using namespace boost;

typedef adjacency_list_traits<vecS, vecS, directedS> Traits;
typedef adjacency_list<
    vecS, vecS, directedS,
    property<
        vertex_name_t, std::string,
        property<vertex_index_t, int,
        property<vertex_color_t, boost::default_color_type,
        property<vertex_distance_t, double,
        property<vertex_predecessor_t, Traits::edge_descriptor>
    > > > >,
    property<
        edge_index_t, int,
        property<edge_capacity_t, double,
        property<edge_weight_t, double,
        property<edge_residual_capacity_t, double,
        property<edge_reverse_t, Traits::edge_descriptor>
    > > > > >
Graph;

Graph g;
property_map<Graph, edge_index_t>::type e;
property_map<Graph, edge_capacity_t>::type cap;
property_map<Graph, edge_weight_t>::type cost;
property_map<Graph, edge_residual_capacity_t>::type rescap;
property_map<Graph, edge_reverse_t>::type rev;
property_map<Graph, vertex_color_t>::type colors;

void initialize(int nnodes) {
    e = get(edge_index, g);
    cap = get(edge_capacity, g);
    cost = get(edge_weight, g);
    rescap = get(edge_residual_capacity, g);
    rev = get(edge_reverse, g);
    colors = get(vertex_color, g);
    for (int i = 0; i < nnodes; i++)
        add_vertex(g);
}

void clearedges() {
    Graph::vertex_iterator v, vend;
    for (boost::tie(v, vend) = vertices(g); v != vend; ++v)
        boost::clear_out_edges(*v, g);
}

void createedges(std::vector<std::pair<int, int>>& arcs, std::vector<double>& capacity) {
    Traits::edge_descriptor edf, edr; //forward and reverse
    for (int eindex = 0, sz = static_cast<int>(arcs.size()); eindex < sz; eindex++) {
        int fr, to;
        fr = arcs[eindex].first;
        to = arcs[eindex].second;
        edf = add_edge(fr, to, g).first;
        edr = add_edge(to, fr, g).first;
        e[edf] = 2 * eindex;
        e[edr] = e[edf] + 1;
        cap[edf] = capacity[eindex];
        cap[edr] = capacity[eindex];
        rev[edf] = edr;
        rev[edr] = edf;
    }
}

double solve_max_flow(int s, int t) {
    double retval = boykov_kolmogorov_max_flow(g, s, t);
    return retval;
}

bool is_part_of_source(int i) {
    if (colors[i] == boost::black_color)
        return true;
    return false;
}

int main() {
    initialize(6);
    std::vector<std::pair<int, int>> arcs1 = { std::make_pair<int,int>(0,1),
                                               std::make_pair<int,int>(0,2),
                                               std::make_pair<int,int>(1,2),
                                               std::make_pair<int,int>(1,3),
                                               std::make_pair<int,int>(1,4),
                                               std::make_pair<int,int>(2,4),
                                               std::make_pair<int,int>(3,4),
                                               std::make_pair<int,int>(3,5),
                                               std::make_pair<int,int>(4,5)
                                             };
    std::vector<double> capacities1 = { 10, 10, 10, 10, 1, 4, 3, 2, 10 };
    clearedges();
    createedges(arcs1, capacities1);
    double maxflow = solve_max_flow(0, 5);
    printf("max flow is %f\n", maxflow);
    for (int i = 0; i < 6; i++)
        if (is_part_of_source(i))
            printf("Node %d belongs to subset source is in\n", i);
    Graph::edge_iterator e_, eend_;
    int Eindex = 0;
    for (boost::tie(e_, eend_) = edges(g); e_ != eend_; ++e_) {
        int fr = source(*e_, g);
        int to = target(*e_, g);
        printf("(%d) Edge %d: (%d -> %d), capacity %f\n", Eindex, e[*e_], fr, to, cap[*e_]);
        Eindex++;
        if (is_part_of_source(fr) && is_part_of_source(to) == false)
            printf("----is part of ST Cut-----\n");
        else
            printf("x\n");
    }
    std::vector<std::pair<int, int>> arcs2 = { std::make_pair<int,int>(0,1),
                                               std::make_pair<int,int>(0,2),
                                               std::make_pair<int,int>(1,3),
                                               std::make_pair<int,int>(2,4),
                                               std::make_pair<int,int>(3,5),
                                               std::make_pair<int,int>(4,5)
                                             };
    std::vector<double> capacities2 = { 10, 10, 10, 4, 2, 0 };
    clearedges();
    createedges(arcs2, capacities2);
    maxflow = solve_max_flow(0, 5);
    printf("max flow is %f\n", maxflow);
    for (int i = 0; i < 6; i++)
        if (is_part_of_source(i))
            printf("Node %d belongs to subset source is in\n", i);
    Eindex = 0;
    for (boost::tie(e_, eend_) = edges(g); e_ != eend_; ++e_) {
        int fr = source(*e_, g);
        int to = target(*e_, g);
        printf("(%d) Edge %d: (%d -> %d), capacity %f\n", Eindex, e[*e_], fr, to, cap[*e_]);
        Eindex++;
        if (is_part_of_source(fr) && is_part_of_source(to) == false)
            printf("----is part of ST Cut-----\n");
        else
            printf("x\n");
    }
    getchar();
}
I have the following questions.
(a) If the underlying vertices remain fixed, but only the arcs and their capacities change from iteration to iteration, is there anything faster than using clear_out_edges to clear the arcs and then using add_edge to add the new arcs with their new capacities? Also, does clear_out_edges correctly also clear the property map entries that may have the edge descriptor just deleted as key?
(b) Boost max-flow algorithms seem to want the explicit addition of reverse arcs. As of now, in function createedges I explicitly do this via a forward edge descriptor (edf) and a reverse edge descriptor (edr). Is there any performance penalty for this especially when the number of max flow problems that need to be solved is in the 1000s? Is there anything that is more efficient than this?
(c) I am able to correctly enumerate the arcs of the minimal S/T cut via the following portion of the code:
int Eindex = 0;
for (boost::tie(e_, eend_) = edges(g); e_ != eend_; ++e_) {
    int fr = source(*e_, g);
    int to = target(*e_, g);
    printf("(%d) Edge %d: (%d -> %d), capacity %f\n", Eindex, e[*e_], fr, to, cap[*e_]);
    Eindex++;
    if (is_part_of_source(fr) && is_part_of_source(to) == false)
        printf("----is part of ST Cut-----\n");
    else
        printf("x\n");
}
Is there a more efficient way of enumerating the arcs of the S/T cut than the above?
There are many issues. If you use modern C++ and compiler warnings, you can reduce the code and spot the bugs in printing vertex descriptors (printf is just not safe; use the diagnostics!).
Here's my take after review.
Notable changes:
bundled properties instead of separate interior properties
this implies passing named arguments (but see https://stackoverflow.com/a/64744086/85371)
no more global variables, no more loopy initialization if the simple constructor suffices
no more duplicated code (nothing invites error quite like having capacities1 and capacities2 lying around)
using clear_vertex instead of just clear_out_edges - this may not make a difference (?) but seems to express intent a bit better
no more printf (I'll use libfmt, which is also in C++23), so e.g.
fmt::print("Max flow {}\nNodes {} are in source subset\n", maxflow,
           vertices(_g) | filtered(is_source));
prints
Max flow 10
Nodes {0, 1, 2, 3} are in source subset
all in one go
print what you think you are printing. In particular, use the library support for printing edges if you can
Live On Compiler Explorer
#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/boykov_kolmogorov_max_flow.hpp>
#include <boost/range/adaptors.hpp>
#include <fmt/ostream.h>
#include <fmt/ranges.h>

using boost::adaptors::filtered;

using Traits   = boost::adjacency_list_traits<boost::vecS, boost::vecS, boost::directedS>;
using V        = Traits::vertex_descriptor;
using E        = Traits::edge_descriptor;
using Capacity = double;
using Color    = boost::default_color_type;

struct VertexProps {
    // std::string name;
    Color    color;
    Capacity distance;
    E        predecessor;
};

struct EdgeProps {
    int      id;
    Capacity weight, residual;
    E        reverse;
};

using Graph = boost::adjacency_list<
    boost::vecS, boost::vecS, boost::directedS,
    VertexProps,
    // see https://stackoverflow.com/a/64744086/85371 :(
    boost::property<boost::edge_capacity_t, Capacity, EdgeProps>>;

struct MyGraph {
    MyGraph(size_t nnodes) : _g(nnodes) {}

    void runSimulation(auto const& arcs, auto const& capacities)
    {
        reconfigure(arcs, capacities);

        Capacity maxflow = solve_max_flow(0, 5);

        auto cap       = get(boost::edge_capacity, _g);
        auto is_source = [this](V v) { return _g[v].color == Color::black_color; };

        fmt::print("Max flow {}\nNodes {} are in source subset\n", maxflow,
                   vertices(_g) | filtered(is_source));

        for (E e : boost::make_iterator_range(edges(_g))) {
            bool st_cut =
                is_source(source(e, _g)) and
                not is_source(target(e, _g));

            fmt::print("Edge {} (id #{:2}), capacity {:3} {}\n", e, _g[e].id,
                       cap[e], st_cut ? "(ST Cut)" : "");
        }
    }

  private:
    Graph _g;

    void reconfigure(auto const& arcs, auto const& capacities)
    {
        assert(arcs.size() == capacities.size());

        for (auto v : boost::make_iterator_range(vertices(_g))) {
            // boost::clear_out_edges(v, g);
            boost::clear_vertex(v, _g);
        }

        auto cap  = get(boost::edge_capacity, _g);
        auto eidx = get(&EdgeProps::id, _g);
        auto rev  = get(&EdgeProps::reverse, _g);

        auto eindex = 0;

        for (auto [fr, to] : arcs) {
            auto edf = add_edge(fr, to, _g).first;
            auto edr = add_edge(to, fr, _g).first;
            eidx[edf] = 2 * eindex;
            eidx[edr] = eidx[edf] + 1;

            cap[edf] = cap[edr] = capacities[eindex];

            rev[edf] = edr;
            rev[edr] = edf;

            ++eindex;
        }
    }

    Capacity solve_max_flow(V src, V sink)
    {
        return boykov_kolmogorov_max_flow(
            _g, src, sink,
            // named arguments
            boost::reverse_edge_map(get(&EdgeProps::reverse, _g))
                .residual_capacity_map(get(&EdgeProps::residual, _g))
                .vertex_color_map(get(&VertexProps::color, _g))
                .predecessor_map(get(&VertexProps::predecessor, _g))
                .distance_map(get(&VertexProps::distance, _g))
            // end named arguments
        );
    }
};

int main() {
    MyGraph g{6};

    using namespace std;
    for (auto&& [arcs, capacities] : { tuple
             // 1
             {vector{pair{0, 1}, {0, 2}, {1, 2}, {1, 3}, {1, 4},
                     {2, 4}, {3, 4}, {3, 5}, {4, 5}},
              vector{10, 10, 10, 10, 1, 4, 3, 2, 10}},
             // 2
             {vector{pair{0, 1}, {0, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 5}},
              vector{10, 10, 10, 4, 2, 0}},
         })
    {
        g.runSimulation(arcs, capacities);
    }
}
Prints
Max flow 10
Nodes {0, 1, 2, 3} are in source subset
Edge (0,1) (id # 0), capacity 10
Edge (0,2) (id # 2), capacity 10
Edge (1,0) (id # 1), capacity 10
Edge (1,2) (id # 4), capacity 10
Edge (1,3) (id # 6), capacity 10
Edge (1,4) (id # 8), capacity 1 (ST Cut)
Edge (2,0) (id # 3), capacity 10
Edge (2,1) (id # 5), capacity 10
Edge (2,4) (id #10), capacity 4 (ST Cut)
Edge (3,1) (id # 7), capacity 10
Edge (3,4) (id #12), capacity 3 (ST Cut)
Edge (3,5) (id #14), capacity 2 (ST Cut)
Edge (4,1) (id # 9), capacity 1
Edge (4,2) (id #11), capacity 4
Edge (4,3) (id #13), capacity 3
Edge (4,5) (id #16), capacity 10
Edge (5,3) (id #15), capacity 2
Edge (5,4) (id #17), capacity 10
Max flow 2
Nodes {0, 1, 2, 3, 4} are in source subset
Edge (0,1) (id # 0), capacity 10
Edge (0,2) (id # 2), capacity 10
Edge (1,0) (id # 1), capacity 10
Edge (1,3) (id # 4), capacity 10
Edge (2,0) (id # 3), capacity 10
Edge (2,4) (id # 6), capacity 4
Edge (3,1) (id # 5), capacity 10
Edge (3,5) (id # 8), capacity 2 (ST Cut)
Edge (4,2) (id # 7), capacity 4
Edge (4,5) (id #10), capacity 0 (ST Cut)
Edge (5,3) (id # 9), capacity 2
Edge (5,4) (id #11), capacity 0
Side Note
If you think main is overcomplicated, here's another way to write it for just the two invocations:
Live On Compiler Explorer
g.runSimulation({{0, 1}, {0, 2}, {1, 2}, {1, 3}, {1, 4}, {2, 4}, {3, 4}, {3, 5}, {4, 5}},
                {10, 10, 10, 10, 1, 4, 3, 2, 10});
g.runSimulation({{0, 1}, {0, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 5}},
                {10, 10, 10, 4, 2, 0});

Cannot understand how to recursively merge sort

Currently self-learning C++ with Daniel Liang's Introduction to C++.
On the topic of the merge sort, I cannot seem to understand how his code is recursively calling itself.
I understand the general concept of the merge sort, but I am having trouble understanding this code specifically.
In this example, we first pass the list 1, 7, 3, 4, 9, 3, 3, 1, 2, and its size (9) to the mergeSort function.
From there, we divide the list into two halves until the array size reaches 1. In this case, we would get: 1,7,3,4 -> 1,7 -> 1. We then move on to merge sorting the second half. The second-half array would be [7] in this case. We merge the two arrays [1] and [7] and proceed to delete the two arrays that were dynamically allocated, to prevent any memory leak.
The part I don't understand is how this code runs from here, after delete[] firstHalf and delete[] secondHalf. From my understanding, shouldn't there be another mergeSort function call in order to merge sort the new firstHalf and secondHalf?
#include <iostream>
using namespace std;

// Function prototypes
void arraycopy(int source[], int sourceStartIndex,
               int target[], int targetStartIndex, int length);
void merge(int list1[], int list1Size,
           int list2[], int list2Size, int temp[]);

// The function for sorting the numbers
void mergeSort(int list[], int arraySize)
{
    if (arraySize > 1)
    {
        // Merge sort the first half
        int* firstHalf = new int[arraySize / 2];
        arraycopy(list, 0, firstHalf, 0, arraySize / 2);
        mergeSort(firstHalf, arraySize / 2);

        // Merge sort the second half
        int secondHalfLength = arraySize - arraySize / 2;
        int* secondHalf = new int[secondHalfLength];
        arraycopy(list, arraySize / 2, secondHalf, 0, secondHalfLength);
        mergeSort(secondHalf, secondHalfLength);

        // Merge firstHalf with secondHalf
        merge(firstHalf, arraySize / 2, secondHalf, secondHalfLength,
              list);

        delete [] firstHalf;
        delete [] secondHalf;
    }
}

void merge(int list1[], int list1Size,
           int list2[], int list2Size, int temp[])
{
    int current1 = 0; // Current index in list1
    int current2 = 0; // Current index in list2
    int current3 = 0; // Current index in temp

    while (current1 < list1Size && current2 < list2Size)
    {
        if (list1[current1] < list2[current2])
            temp[current3++] = list1[current1++];
        else
            temp[current3++] = list2[current2++];
    }

    while (current1 < list1Size)
        temp[current3++] = list1[current1++];
    while (current2 < list2Size)
        temp[current3++] = list2[current2++];
}

void arraycopy(int source[], int sourceStartIndex,
               int target[], int targetStartIndex, int length)
{
    for (int i = 0; i < length; i++)
    {
        target[i + targetStartIndex] = source[i + sourceStartIndex];
    }
}

int main()
{
    const int SIZE = 9;
    int list[] = {1, 7, 3, 4, 9, 3, 3, 1, 2};
    mergeSort(list, SIZE);
    for (int i = 0; i < SIZE; i++)
        cout << list[i] << " ";
    return 0;
}
From my understanding, shouldn't there be another mergeSort function
call in order to merge sort the new firstHalf and secondHalf?
It is happening implicitly during the recursive call. When you reach these two lines:
delete [] firstHalf;
delete [] secondHalf;
It means that one call to mergeSort is completed. If this call belonged to sorting a first half, then execution continues from the line just after it, i.e. these lines:
// Merge sort the second half
int secondHalfLength = arraySize - arraySize / 2;
...
But if this call belonged to sorting a second half, then control goes back to the line just after that call, i.e. these lines:
// Merge firstHalf with secondHalf
merge(firstHalf, arraySize / 2, secondHalf, secondHalfLength,
list);
And everything goes as planned.
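If it helps to see that unwinding, here is a small tracing sketch (my own illustration, not the book's code): the same splitting logic with a depth parameter added, printing the call tree instead of actually copying and merging.

#include <iostream>
#include <string>
using namespace std;

// prints the shape of the recursion only; the arraycopy/merge work is omitted
void mergeSortTrace(int arraySize, int depth)
{
    cout << string(2 * depth, ' ') << "mergeSort(size " << arraySize << ")\n";
    if (arraySize > 1)
    {
        mergeSortTrace(arraySize / 2, depth + 1);             // first half
        mergeSortTrace(arraySize - arraySize / 2, depth + 1); // second half
        cout << string(2 * depth, ' ') << "merge " << arraySize / 2
             << " + " << arraySize - arraySize / 2 << "\n";
    }
}

int main()
{
    mergeSortTrace(4, 0); // try 4 first, then 9 as in the book's example
    return 0;
}

For size 4 this prints the enter/merge lines in exactly the order the real mergeSort reaches them, which shows where control goes after each pair of delete[] calls.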

How to make a strided chunk iterator in Thrust (CUDA)

I need a class iterator like this
https://github.com/thrust/thrust/blob/master/examples/strided_range.cu
but where the new iterator produces the following sequence:
[k * size_stride, k * size_stride + 1, ..., k * size_stride + size_chunk - 1, ...]
with
k = 0,1,...,N
Example:
size_stride = 8
size_chunk = 3
N = 3
then the sequence is
[0,1,2,8,9,10,16,17,18,24,25,26]
I don't know how to do this efficiently...
The strided range iterator is basically a carefully crafted permutation iterator with a functor that gives the appropriate indices for the permutation.
Here is a modification to the strided range iterator example. The main changes were:
include the chunk size as an iterator parameter
modify the functor that provides the indices for the permutation iterator to spit out the desired sequence
adjust the definition of the .end() iterator to provide the appropriate sequence length.
Worked example:
$ cat t1280.cu
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/sequence.h>
#include <iostream>
#include <iterator>
#include <assert.h>

// this example illustrates how to make strided-chunk access to a range of values
// examples:
//   strided_chunk_range([0, 1, 2, 3, 4, 5, 6], 1, 1) -> [0, 1, 2, 3, 4, 5, 6]
//   strided_chunk_range([0, 1, 2, 3, 4, 5, 6], 2, 1) -> [0, 2, 4, 6]
//   strided_chunk_range([0, 1, 2, 3, 4, 5, 6], 3, 2) -> [0, 1, 3, 4, 6]
//   ...
template <typename Iterator>
class strided_chunk_range
{
public:
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type, difference_type>
    {
        difference_type stride;
        int chunk;

        stride_functor(difference_type stride, int chunk)
            : stride(stride), chunk(chunk) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            int pos = i / chunk;
            return ((pos * stride) + (i - (pos * chunk)));
        }
    };

    typedef typename thrust::counting_iterator<difference_type> CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator, TransformIterator> PermutationIterator;

    // type of the strided_chunk_range iterator
    typedef PermutationIterator iterator;

    // construct strided_chunk_range for the range [first,last)
    strided_chunk_range(Iterator first, Iterator last, difference_type stride, int chunk)
        : first(first), last(last), stride(stride), chunk(chunk) { assert(chunk <= stride); }

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride, chunk)));
    }

    iterator end(void) const
    {
        int lmf = last - first;
        int nfs = lmf / stride;
        int rem = lmf - (nfs * stride);
        return begin() + (nfs * chunk) + ((rem < chunk) ? rem : chunk);
    }

protected:
    Iterator first;
    Iterator last;
    difference_type stride;
    int chunk;
};

int main(void)
{
    thrust::device_vector<int> data(50);
    thrust::sequence(data.begin(), data.end());
    typedef thrust::device_vector<int>::iterator Iterator;

    // create strided_chunk_range
    std::cout << "stride 3, chunk 2, length 7" << std::endl;
    strided_chunk_range<Iterator> scr1(data.begin(), data.begin() + 7, 3, 2);
    thrust::copy(scr1.begin(), scr1.end(), std::ostream_iterator<int>(std::cout, " ")); std::cout << std::endl;

    std::cout << "stride 8, chunk 3, length 50" << std::endl;
    strided_chunk_range<Iterator> scr(data.begin(), data.end(), 8, 3);
    thrust::copy(scr.begin(), scr.end(), std::ostream_iterator<int>(std::cout, " ")); std::cout << std::endl;

    return 0;
}
$ nvcc -arch=sm_35 -o t1280 t1280.cu
$ ./t1280
stride 3, chunk 2, length 7
0 1 3 4 6
stride 8, chunk 3, length 50
0 1 2 8 9 10 16 17 18 24 25 26 32 33 34 40 41 42 48 49
$
This is probably not the most optimal implementation, in particular because we are doing division in the permutation functor, but it should get you started.
I assume (and test for) chunk <= stride, because this seemed reasonable to me and simplified my thought process. I'm sure it could be modified, with an appropriate example of the sequence you would like to see, for the case where chunk > stride.
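Since the functor's index arithmetic is the heart of it, you can sanity-check it on the host with no CUDA at all; a minimal sketch of the same formula:

#include <iostream>

int main()
{
    const int stride = 8, chunk = 3, n = 12; // N = 3 in the question's notation, so 4 chunks of 3
    for (int i = 0; i < n; i++) {
        int pos = i / chunk;                        // which chunk index i falls in
        int idx = pos * stride + (i - pos * chunk); // same expression as the functor
        std::cout << idx << " ";
    }
    std::cout << std::endl; // prints: 0 1 2 8 9 10 16 17 18 24 25 26
    return 0;
}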

Replace pointer to pointer by initializer_list

#include <initializer_list>
#include <iostream>
#include <vector>

// this API is counterintuitive
void original(int const **data)
{
    for (size_t i = 0; i != 3; ++i) {
        int const *ptr = *data;
        //std::cout<<*ptr++<<", "<<*ptr<<std::endl; //this line may cause undefined behavior
        std::cout << ptr[0] << ", " << ptr[1] << std::endl;
        ++data;
    }
}

// my eyes prefer this API to the original-like API
void replace_original(std::initializer_list<std::initializer_list<int>> list)
{
    std::vector<int const*> results(list.size());
    for (auto data : list) {
        results.push_back(std::begin(data)); //#1
    }
    original(&results[0]);
}

int main()
{
    int first[]  = {0, 1};
    int second[] = {2, 3};
    int third[]  = {4, 5};
    int const *array[] = {first, second, third};
    original(array);
    replace_original({ {0, 1}, {2, 3}, {4, 5} });

    return 0;
}
The results are
0, 1
2, 3
4, 5
expected results are
0, 1
2, 3
4, 5
0, 1
2, 3
4, 5
I want to encapsulate the API of original (an old, C-style API) with an API like replace_original,
but I can't figure out why #1 doesn't work.
Ah, stupid mistake, I should change the loop to:
size_t const size = list.size();
std::vector<int const*> results(size);
for (size_t i = 0; i != size; ++i) {
    results[i] = std::begin( *(std::begin(list) + i) );
}
Do you have a better solution for encapsulating this kind of API?
After googling, I found out that in C++14 the size() of initializer_list
becomes constexpr, so we should be able to use std::array to replace std::vector.
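For reference, a slightly tidier version of the fixed loop (a sketch against the same original API from the question): build the pointer vector empty and fill it with push_back, so the pre-sized-vector/push_back mismatch cannot happen in the first place.

#include <initializer_list>
#include <vector>

void original(int const **data); // the C-style API from the question

void replace_original(std::initializer_list<std::initializer_list<int>> list)
{
    std::vector<int const*> results;
    results.reserve(list.size());
    for (auto const &data : list)
        results.push_back(std::begin(data)); // the inner lists outlive the call, so the pointers stay valid
    original(results.data());
}

Note that original hard-codes a row count of 3, so this still only works for exactly three inner lists.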

Usage of _mm_shuffle_epi8 intrinsic

Can someone please explain the _mm_shuffle_epi8 SSSE3 intrinsic?
I know it shuffles 16 8-bit integers in an __m128i, but I am not sure how I could use it.
I basically want to use _mm_shuffle_epi8 to modify the function below to get better performance.
while (not done)
    dest[i+0] = (src+j).a;
    dest[i+1] = (src+j).b;
    dest[i+2] = (src+j).c;
    dest[i+3] = (src+j+1).a;
    dest[i+4] = (src+j+1).b;
    dest[i+5] = (src+j+1).c;
    i += 6;
    j += 2;
_mm_shuffle_epi8 (better known as pshufb) essentially does this:
temp = dst;
for (int i = 0; i < 16; i++)
    dst[i] = (src[i] & 0x80) == 0 ? temp[src[i] & 15] : 0;
As for whether you can use it here, it's impossible to tell without knowing the types involved. It won't be "nice" anyway because the destination is a block of 6 bytes (or words? or dwords?). You could make that work by unrolling and doing a lot of shifting and or-ing.
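For instance, if the fields were single bytes inside a 4-byte struct (a pure assumption on my part; the question doesn't show the types), one unrolled iteration could pack four source structs into twelve destination bytes with a single pshufb, a sketch:

#include <stdio.h>
#include <tmmintrin.h> // SSSE3

struct S { unsigned char a, b, c, pad; }; // hypothetical layout

int main(void)
{
    struct S src[8];
    unsigned char dest[8 * 3 + 4]; // +4 slack: every 16-byte store writes 4 bytes past the packed data
    for (int k = 0; k < 8; k++) {
        src[k].a = (unsigned char)(3 * k);
        src[k].b = (unsigned char)(3 * k + 1);
        src[k].c = (unsigned char)(3 * k + 2);
        src[k].pad = 0xFF;
    }
    // gather bytes {0,1,2, 4,5,6, 8,9,10, 12,13,14}; a mask byte with the top bit set (-128) yields 0
    __m128i mask = _mm_set_epi8(-128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
    for (int i = 0, j = 0; j < 8; i += 12, j += 4) {
        __m128i v = _mm_loadu_si128((__m128i const *)&src[j]);
        _mm_storeu_si128((__m128i *)&dest[i], _mm_shuffle_epi8(v, mask));
    }
    for (int i = 0; i < 24; i++)
        printf("%d ", dest[i]); // prints 0 1 2 ... 23
    printf("\n");
    return 0;
}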
Here's an example of using the intrinsic; you'll have to work out how to apply it to your particular situation. This code endian-swaps four 32-bit integers at a time:
unsigned int *bswap(unsigned int *destination, unsigned int *source, int length) {
    int i;
    __m128i mask = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
    for (i = 0; i < length; i += 4) {
        _mm_storeu_si128((__m128i *)&destination[i],
                         _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&source[i]), mask));
    }
    return destination;
}
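And a minimal self-contained driver for the same idea (my addition; compile with -mssse3 on gcc/clang), showing the per-lane byte reversal on four integers:

#include <stdio.h>
#include <tmmintrin.h> // SSSE3

int main(void)
{
    unsigned int in[4] = {0x11223344u, 0xAABBCCDDu, 0x01020304u, 0xDEADBEEFu};
    unsigned int out[4];
    // same mask as bswap above: reverse the four bytes within each 32-bit lane
    __m128i mask = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
    _mm_storeu_si128((__m128i *)out,
                     _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)in), mask));
    for (int i = 0; i < 4; i++)
        printf("%08x -> %08x\n", in[i], out[i]); // e.g. 11223344 -> 44332211
    return 0;
}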
