Error: Input buffer filter is accessed at 63, which is beyond the max (15) in dimension 2 Aborted (core dumped) - halide

I want to test my algorithm, written in Halide, on the Tiramisu compiler.
Once I run it, I get an error like this one:
Error: Input buffer filter is accessed at 63, which is beyond the max (15) in dimension 2
Aborted (core dumped)
So I decided to test only the call of the method. Even though I pass the same parameters, I get the same error, or a similar one:
Error: Input buffer bias is accessed at 15, which is beyond the max (4) in dimension 0
Aborted (core dumped)
Here is my wrapper_vgg.h:
#ifndef HALIDE__build___wrapper_vgg_o_h
#define HALIDE__build___wrapper_vgg_o_h
#include <tiramisu/utils.h>
#define RADIUS 3
#ifdef __cplusplus
extern "C" {
#endif
int vgg_tiramisu(halide_buffer_t *, halide_buffer_t *_b_input_buffer, halide_buffer_t *filter, halide_buffer_t *bias, halide_buffer_t *conv, halide_buffer_t *filter2, halide_buffer_t *bias2, halide_buffer_t *conv2, halide_buffer_t *_b_output_buffer, halide_buffer_t *_negative_slope);
int vgg_tiramisu_argv(void **args);
int vgg_ref(halide_buffer_t *_b_input_buffer, halide_buffer_t *filter, halide_buffer_t *bias, halide_buffer_t *filter2, halide_buffer_t *bias2, halide_buffer_t *_b_output_buffer);
int vgg_ref_argv(void **args);
// Result is never null and points to constant static data
const struct halide_filter_metadata_t *vgg_tiramisu_metadata();
const struct halide_filter_metadata_t *vgg_ref_metadata();
#ifdef __cplusplus
} // extern "C"
#endif
#endif // HALIDE__build___wrapper_vgg_o_h
And here is my vgg_ref.cpp:
#include "Halide.h"
#include "configure.h"
using namespace Halide;
int main(int argc, char **argv)
{
ImageParam input{Float(32), 4, "input"};
ImageParam filter{Float(32), 4, "filter"};
ImageParam bias{Float(32), 1, "bias"};
ImageParam filter2{Float(32), 4, "filter2"};
ImageParam bias2{Float(32), 1, "bias2"};
/* THE ALGORITHM */
Var x("x"), y("y"), z("z"), n("n");
Func f_conv("conv"), f_conv2("conv2");
Func f_ReLU("ReLU"), f_ReLU2("ReLU2");
//Func f_Maxpool("Maxpool");
Func f_vgg("vgg");
RDom r(0, K, 0, K, 0, FIn);
RDom r2(0, K, 0, K, 0, FOut);
// First conv computations
f_conv(x, y, z, n) = bias(z);
f_conv(x, y, z, n) += filter(r.x, r.y, r.z, z) * input(x + r.x, y + r.y, r.z, n);
//first relu
f_ReLU(x, y, z, n) = max(0, f_conv(x, y, z, n));
.....
.....
/* THE SCHEDULE */
// Provide estimates on the input image
.....
.....
f_vgg.compile_to_object("build/generated_fct_vgg_ref.o", {input, filter, bias, filter2, bias2}, "vgg_ref");
f_vgg.compile_to_lowered_stmt("build/generated_fct_vgg_ref.txt", {input, filter, bias, filter2, bias2}, Text);
return 0;
}
And here is the wrapper where I call the vgg_ref method:
...
#include "configure.h"
#include "wrapper_vgg.h"
#include <tiramisu/utils.h>
using namespace std;
int main(int, char**)
{
Halide::Buffer<float> input(N+K, N+K, FIn, BATCH_SIZE);
Halide::Buffer<float> filter(K+1, K+1, FIn, FOut);
Halide::Buffer<float> bias(FOut);
Halide::Buffer<float> conv(N, N, FOut, BATCH_SIZE);
Halide::Buffer<float> filter2(K+1, K+1, FOut, FOut);
Halide::Buffer<float> bias2(FOut);
Halide::Buffer<float> conv2_tiramisu(N-K, N-K, FOut, BATCH_SIZE);
Halide::Buffer<float> vgg_tiramisu_buff(N-2*K, N-2*K, FOut, BATCH_SIZE);
Halide::Buffer<int> parameters(5);
Halide::Buffer<float> negative_slope(1);
negative_slope(0) = 1;
// Buffer for Halide
Halide::Buffer<float> vgg_halide(N-2*K, N-2*K, FOut, BATCH_SIZE);
std::vector<std::chrono::duration<double,std::milli>> duration_vector_1;
std::vector<std::chrono::duration<double,std::milli>> duration_vector_2;
/****************************************** Initialize Buffers *********************************************/
....
....
....
std::cout << "\t\tBuffers initialized" << std::endl;
/****************************************** Halide Part ********************************************************/
for (int i=0; i<NB_TESTS; i++)
{
auto start1 = std::chrono::high_resolution_clock::now();
vgg_ref(input.raw_buffer(), filter.raw_buffer(), bias.raw_buffer(), filter2.raw_buffer(), bias2.raw_buffer(), vgg_halide.raw_buffer());
auto end1 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double,std::milli> duration = end1 - start1;
duration_vector_2.push_back(duration);
}
std::cout << "\t\tHalide vgg duration" << ": " << median(duration_vector_1)/1000 << "; " << std::endl;
std::cout << "\t\t Result" << ": ";
/****************************************** Tiramisu Part ********************************************************/
/* // Initialize parameters[]
parameters(0) = N;
parameters(1) = K;
parameters(2) = FIn;
parameters(3) = FOut;
parameters(4) = BATCH_SIZE;
for (int i=0; i<NB_TESTS; i++)
{
// srand (1);
auto start1 = std::chrono::high_resolution_clock::now();
vgg_tiramisu(parameters.raw_buffer(), input.raw_buffer(), filter.raw_buffer(), bias.raw_buffer(), conv.raw_buffer(), filter2.raw_buffer(), bias2.raw_buffer(), conv2_tiramisu.raw_buffer(),vgg_tiramisu_buff.raw_buffer(),negative_slope.raw_buffer());
auto end1 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double,std::milli> duration = end1 - start1;
duration_vector_1.push_back(duration);
}
std::cout << "\t\tTiramisu vgg duration" << ": " << median(duration_vector_2)/1000 << "; " << std::endl;
std::cout << "\t\t Result" << ": ";
*/
}
I noticed that once I comment out this line in the Halide part, everything works well:
vgg_ref(input.raw_buffer(), filter.raw_buffer(), bias.raw_buffer(), filter2.raw_buffer(), bias2.raw_buffer(), vgg_halide.raw_buffer());
So the problem is in this call of the Halide function "vgg_ref".
But I do not know what this error relates to. I tried calling it with only one parameter, and I always get the same problem; I do not know how to fix it.
Thank you for any advice, or for drawing my attention to something.
Thank you.

I was able to fix the problem later, Alhamdulillah.
I want to point out here that it is impossible to run the benchmarks without creating the ".o" file, i.e. without this line:
f_vgg.compile_to_object("build/generated_fct_vgg_ref.o", {input, filter, bias, filter2, bias2}, "vgg_ref");
But then how did it run in my case?!
Simply because a ".o" file had been generated somewhere during a previous execution.
Be careful here: checking for a stale ".o" should be a reflex. Many false-result issues are due to the existence of an old copy of that object file.
Even after I paid attention to that, I still had the same error, or a similar one :(.
What does this error refer to? It generally means that somewhere in your code there is an index that does not match its definition in the wrapper.
So here are two (2) things to verify to help fix this issue:
Verify the call of the function and its parameters: for example, if the function requires 5 parameters, make sure you pass exactly 5, no more, no less.
Verify all the indices and their intervals.
My problem was in these 2 lines:
RDom r(0, K, 0, K, 0, FIn);
RDom r2(0, K, 0, K, 0, FOut);
An RDom (a multi-dimensional domain over which to iterate) helps you traverse a small matrix within the input matrix, for example to apply a filter to the input. The RDom above defines the intervals of x, y and z of the filter matrix, and each dimension is given as a (min, extent) pair.
In the wrapper I define the filter buffer like this:
Halide::Buffer<float> filter(K+1, K+1, FIn, FOut);
So in the RDom, too, I have to give x an extent of K+1 (x running from 0 to K inclusive), but I only had K; that is why I got the problem shown in the question.
So it should be done like this:
RDom r(0, K+1, 0, K+1, 0, FIn);
RDom r2(0, K+1, 0, K+1, 0, FOut);
And that fixed my problem.
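For anyone who wants a self-contained illustration, here is a minimal sketch (standalone Halide, with made-up names and a made-up K standing in for configure.h) of the rule that each RDom dimension is a (min, extent) pair whose extent must match the buffer extent declared in the wrapper:
#include "Halide.h"
using namespace Halide;
int main() {
    const int K = 15; // assumption: stands in for the K from configure.h
    ImageParam filter{Float(32), 1, "filter"};
    Var x("x");
    // (min, extent) = (0, K+1): r.x ranges over 0 .. K inclusive,
    // matching a wrapper buffer allocated as Halide::Buffer<float> filter(K+1);
    RDom r(0, K + 1);
    Func total("total");
    total(x) = 0.f;
    total(x) += filter(r); // reads every element of filter; in-bounds for extent K+1
    total.compile_to_object("build/total.o", {filter}, "total");
    return 0;
}
If the wrapper allocated filter with extent K instead, running the generated function would abort with exactly the kind of "accessed at ... beyond the max" error shown above.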
So just pay attention to those small errors that can ruin your day; but that is OK, since they will help you learn more.

Related

Construct SimplicialLLT object from provided L matrix

I performed an LLT factorization of a sparse matrix (SimplicialLLT) and modified the L matrix to Lmod (EDIT: I made a copy of the matrix with some modifications). I would like to construct a new SimplicialLLT object from this modified matrix Lmod so that I can directly use it to obtain a solution of the system (Lmod*Lmod')x = B. Is this somehow possible in Eigen? Here is a short code example:
#include <iostream>
#include <Eigen/Dense>
#include <Eigen/Sparse>
using namespace Eigen;
using namespace std;
int main()
{
// Initial system A0 x0 = B
// Create dense matrix first
MatrixXd A0dense(5, 5);
A0dense << 1,0,0,0,0,0,2,-1,0,0,0,-1,2,-1,0,0,0,-1,1,0,0,0,0,0,1;
// Convert it to sparse
SparseMatrix<double> A0;
A0 = A0dense.sparseView();
// Create LLT decomposition of A0
SimplicialLLT<SparseMatrix<double>, Lower, NaturalOrdering<int>> LLTofA0(A0);
// Extract the L matrix
SparseMatrix<double> L0 = LLTofA0.matrixL();
cout << "L0 =" << endl << L0 << endl;
// RHS vector B
VectorXd B(5);
B << 0, 0, 0, 1, 1;
// Create a copy of L0 with some modifications
vector<Triplet<double>> Triplets; // Vector of triplets
for (int k = 0; k < L0.outerSize(); ++k)
{
for (SparseMatrix<double>::InnerIterator it(L0, k); it; ++it) // Iterate over nonzero entries of L0
{
if (it.col() != 2)
{
// If the column is unequal to 2, store the location and value
Triplet<double> T(it.row(), it.col(), it.value());
Triplets.push_back(T);
}
}
}
// Create modified Lmod matrix from the stored triplets
SparseMatrix<double> Lmod(L0.outerSize(), L0.outerSize());
Lmod.setFromTriplets(Triplets.begin(), Triplets.end());
cout << "Lmod =" << endl << Lmod << endl;
// Now I want to solve different system (Lmod*Lmod') x = B and use the matrix Lmod for it
}
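One direction that seems workable if SimplicialLLT cannot be rebuilt from a supplied factor (a sketch I have not fully verified; it continues from the code above and assumes Lmod is square, lower triangular, and has no zero diagonal entries): skip the factorization object and perform the two triangular solves directly:
// Solve (Lmod * Lmod') x = B as two sparse triangular solves:
// forward substitution for Lmod y = B, then back substitution for Lmod' x = y
VectorXd y = Lmod.triangularView<Lower>().solve(B);
SparseMatrix<double> LmodT = Lmod.transpose();
VectorXd x = LmodT.triangularView<Upper>().solve(y);
cout << "x =" << endl << x << endl;
Note that the Lmod built above drops all of column 2, including its diagonal entry, so that particular Lmod is singular; the triangular solves only make sense once the diagonal is restored.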

Halide: Reduction over a domain for the specific values

I have a Func f(x, y, z) whose values are either 1 or 0, and I need to get the first 100 coordinates whose value equals 1, in order to reduce/update them to 0.
This is very simple to do in C and other languages; however, I've been trying to solve it with Halide for a couple of days. Is there any function or algorithm I can use to solve this in Halide generators?
The question amounts to "How do I implement stream compaction in Halide?" There is much written on parallel stream compaction, and it is somewhat non-trivial to do well. See this Stack Overflow answer on doing it in CUDA for some discussion and references: CUDA stream compaction algorithm
A quick implementation of simple stream compaction in Halide, using a prefix sum, looks like this:
#include "Halide.h"
#include <iostream>
using namespace Halide;
static void print_1d(const Buffer<int32_t> &result) {
std::cout << "{ ";
const char *prefix = "";
for (int i = 0; i < result.dim(0).extent(); i++) {
std::cout << prefix << result(i);
prefix = ", ";
}
std::cout << "}\n";
}
int main(int argc, char **argv) {
uint8_t vals[] = {0, 10, 99, 76, 5, 200, 88, 15};
Buffer<uint8_t> in(vals);
Var x;
Func prefix_sum;
RDom range(1, in.dim(0).extent() - 1);
// prefix_sum(i) counts how many elements before index i pass the predicate (> 42)
prefix_sum(x) = (int32_t)0;
prefix_sum(range) = select(in(range - 1) > 42, prefix_sum(range - 1) + 1, prefix_sum(range - 1));
RDom in_range(0, in.dim(0).extent());
// scatter: each passing element writes its own index at its compacted position
Func compacted_indices;
compacted_indices(x) = -1;
compacted_indices(clamp(prefix_sum(in_range), 0, in.dim(0).extent() - 1)) = select(in(in_range) > 42, in_range, -1);
Buffer<int32_t> sum = prefix_sum.realize(8);
Buffer<int32_t> indices = compacted_indices.realize(8);
print_1d(sum);
print_1d(indices);
return 0;
}
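If I have traced the sequential RDom updates correctly (the -1 written by input index 4 lands at position 2 and is then overwritten by input index 5), the two print_1d calls should produce:
{ 0, 0, 0, 1, 2, 2, 3, 4}
{ 2, 3, 5, 6, -1, -1, -1, -1}
The first buffer counts how many qualifying elements precede each index; the second holds the compacted indices of the elements greater than 42 (99, 76, 200, 88), padded with -1.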

C++11 app that uses dispatch_apply not working under Mac OS Sierra

I had a completely functioning codebase written in C++11 that used Grand Central Dispatch parallel processing, specifically dispatch_apply to do the basic parallel for loop for some trivial game calculations.
Since upgrading to Sierra, this code still runs, but each block is run serially: the cout statement shows that the blocks are being executed in serial order, and the CPU usage graph shows no parallel work going on.
The queue is defined as:
workQueue = dispatch_queue_create("workQueue", DISPATCH_QUEUE_CONCURRENT);
And the relevant program code is:
case Concurrency::Parallel: {
dispatch_apply(stateMap.size(), workQueue, ^(size_t stateIndex) {
string thisCode = stateCodes[stateIndex];
long thisCount = stateCounts[stateIndex];
GameResult sliceResult = playStateOfCode(thisCode, thisCount);
results[stateIndex] = sliceResult;
if ((stateIndex + 1) % updatePeriod == 0) {
cout << stateIndex << endl;
}
});
break;
}
I strongly suspect that this is a bug, but if this is GCD forcing me to use new C++ methods for this, I'm all ears.
I'm not sure whether it is a bug in Sierra or not, but it seems to work if you explicitly associate a global concurrent queue as the target:
dispatch_queue_t target =
dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0);
dispatch_queue_t workQueue =
dispatch_queue_create_with_target("workQueue", DISPATCH_QUEUE_CONCURRENT, target);
// ^~~~~~~~~~~ ^~~~~~
Here is a working example:
#include <iostream>
#include <fstream>
#include <vector>
#include <cmath>
#include <sstream>
#include <dispatch/dispatch.h>
void load_problem(const std::string, std::vector<std::pair<double,double>>&);
int main() {
// n-factor polynomial - test against a given problem provided as a set of space delimited x y values in 2d.txt
std::vector<std::pair<double,double>> problem;
std::vector<double> test = {14.1333177226503,-0.0368874860476915,
0.0909424058436257,2.19080982673558,1.24632025036125,0.0444549880462031,
1.06824631867947,0.551482840616757, 1.04948148731933};
load_problem("weird.txt",problem); //a list of space delimited doubles representing x, y.
size_t a_count = test.size();
dispatch_queue_t queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
__block double diffs = 0.0; //sum of all values..
dispatch_apply(problem.size(), queue, ^(size_t i) {
double g = 0;
for (size_t j=0; j < a_count - 1; j++) {
g += test[j]*pow(problem[i].first,a_count - j - 1);
}
g += test[a_count - 1];
diffs += pow(g - problem[i].second,2);
});
double delta = 1/(1+sqrt(diffs));
std::cout << "test: fit delta: " << delta << std::endl;
}
void load_problem(const std::string file, std::vector<std::pair<double,double>>& repo) {
repo.clear();
std::ifstream ifs(file);
if (ifs.is_open()) {
std::string line;
while(getline(ifs, line)) {
double x= std::nan("");
double y= std::nan("");
std::istringstream istr(line);
istr >> std::skipws >> x >> y;
if (!std::isnan(x) && !std::isnan(y)) {
repo.push_back({x, y});
};
}
ifs.close();
}
}
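One caveat about the example above (my note, not part of the original fix): the unsynchronized diffs += ... inside the block is a data race once the queue really runs blocks in parallel, so the computed delta is only correct by luck. A sketch of one way to keep the parallel loop but make the accumulation safe, writing per-index partial results and reducing serially afterwards:
std::vector<double> partial(problem.size(), 0.0);
double *out = partial.data(); // pointer captured by value; each iteration writes only its own slot
dispatch_apply(problem.size(), queue, ^(size_t i) {
    double g = 0;
    for (size_t j = 0; j < a_count - 1; j++) {
        g += test[j]*pow(problem[i].first, a_count - j - 1);
    }
    g += test[a_count - 1];
    out[i] = pow(g - problem[i].second, 2);
});
// reduce serially once dispatch_apply has returned (it waits for all blocks)
double diffs = 0.0;
for (double d : partial) diffs += d;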

MPI hangs during execution

I'm trying to write a simple program with MPI that finds all numbers less than 514 that are equal to a power of the sum of their digits (for example, 512 = (5+1+2)^3). The problem I have is with the main loop: it works just fine for a few iterations (c=10), but when I try to increase the number of iterations (c=x), mpiexec.exe just hangs, seemingly in the middle of a printf routine.
I'm pretty sure that deadlocks are to blame, but I couldn't find any.
The source code:
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include "mpi.h"
int main(int argc, char* argv[])
{
//our number
int x=514;
//amount of iterations
int c = 10;
//tags for message identification
int tag = 42;
int tagnumber = 43;
int np, me, y1, y2;
MPI_Status status;
/* Initialize MPI */
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &np);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
/* Check that we run on at least two processes */
if (np < 2)
{
printf("You have to use at least 2 processes to run this program\n");
MPI_Finalize();
exit(0);
}
//begin iterations
while(c>0)
{
//if main thread, then send messages to all created threads
if (me == 0)
{
printf("Amount of threads: %d\n", np);
int b = 1;
while(b<np)
{
int q = x-b;
//sends a number to a secondary thread
MPI_Send(&q, 1, MPI_INT, b, tagnumber, MPI_COMM_WORLD);
printf("Process %d sending to process %d, value: %d\n", me, b, q);
//get a number from secondary thread
MPI_Recv(&y2, 1, MPI_INT, b, tag, MPI_COMM_WORLD, &status);
printf ("Process %d received value %d\n", me, y2);
//compare it with the sent one
if (q==y2)
{
//if they're equal, then print the result
printf("\nValue found: %d\n", q);
}
b++;
}
x = x-b+1;
b = 1;
}
else
{
//if not a main thread, then process the message sent and send the result back.
MPI_Recv (&y1, 1, MPI_INT, 0, tagnumber, MPI_COMM_WORLD, &status);
int sum = 0;
int y2 = y1;
while (y1!=0)
{
//find the number's sum of digits
sum += y1%10;
y1 /= 10;
}
int sum2 = sum;
while(sum2<y2)
{
//calculate the exponentiation
sum2 = sum2*sum;
}
MPI_Send (&sum2, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
}
c--;
}
MPI_Finalize();
exit(0);
}
And I run the compiled exe-file as "mpiexec.exe -n 4 lab2.exe". I use HPC Pack 2008 SDK, if that's of any use to you guys.
Is there any way to fix it? Or maybe some way to debug that situation properly?
Thanks a lot in advance!
Not sure if you have already found the problem, but your infinite run happens in this loop:
while(sum2<y2)
{
//calculate the exponentiation
sum2 = sum2*sum;
}
You can confirm this by setting c to about 300 or above and then making a printf call inside this while loop. I haven't completely pinpointed your error of logic, but I marked three comments below at the code locations that look strange to me:
while(c>0)
{
if (me == 0)
{
...
while(b<np)
{
int q = x-b; //<-- you subtract b from x here
...
b++;
}
x = x-b+1; //<-- you subtract b again. sure this is what you want?
b = 1; //<-- this is useless
}
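For what it's worth, a minimal guard for that loop (untested against your full program, but it follows from the arithmetic): when the digit sum is 0 or 1, sum2 = sum2*sum can never grow, so any q whose digits sum to 1 (q = 100, for example, which a larger c eventually reaches) spins forever. Guarding the exponentiation avoids that:
int sum2 = sum;
//a digit sum of 0 or 1 can never be multiplied up to reach y2,
//so only loop when the digit sum is at least 2
if (sum > 1)
{
    while(sum2<y2)
    {
        sum2 = sum2*sum;
    }
}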
Hope this helps.

CUDA Thrust and sort_by_key

I’m looking for a sorting algorithm on CUDA that can sort an array A of elements (double) and return an array of keys B for that array A.
I know the sort_by_key function in the Thrust library but I want my array of elements A to remain unchanged.
What can I do?
My code is:
void sortCUDA(double V[], int P[], int N) {
double *Vcpy = (double*) malloc(N*sizeof(double));
memcpy(Vcpy,V,N*sizeof(double));
thrust::sort_by_key(V, V + N, P);
free(Vcpy);
}
I'm comparing the Thrust algorithm against others that I have on a sequential CPU:
N mergesort sortCUDA
113 0.000008 0.000010
226 0.000018 0.000016
452 0.000036 0.000020
905 0.000061 0.000034
1810 0.000135 0.000071
3621 0.000297 0.000156
7242 0.000917 0.000338
14484 0.001421 0.000853
28968 0.003069 0.001931
57937 0.006666 0.003939
115874 0.014435 0.008025
231749 0.031059 0.016718
463499 0.067407 0.039848
926999 0.148170 0.118003
1853998 0.329005 0.260837
3707996 0.731768 0.544357
7415992 1.638445 1.073755
14831984 3.668039 2.150179
115035495 39.276560 19.812200
230070990 87.750377 39.762915
460141980 200.940501 74.605219
Thrust performance is not bad, but I think that if I used OMP I could probably easily get a better CPU time.
I think this is because of the memcpy.
SOLUTION:
void thrustSort(double V[], int P[], int N)
{
thrust::device_vector<int> d_P(N);
thrust::device_vector<double> d_V(V, V + N);
thrust::sequence(d_P.begin(), d_P.end());
thrust::sort_by_key(d_V.begin(), d_V.end(), d_P.begin());
thrust::copy(d_P.begin(),d_P.end(),P);
}
where V holds my double values to sort.
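A quick sanity check of that solution (my own example values): the host array V is never touched, because only the device copy d_V gets sorted:
double V[] = {3.0, 1.0, 2.0};
int P[3];
thrustSort(V, P, 3);
// V is still {3.0, 1.0, 2.0}
// P is now {1, 2, 0}: the indices of V in ascending order of its values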
You can modify the comparison operator to sort keys instead of values. @Robert Crovella correctly pointed out that a raw device pointer cannot be assigned from the host. The modified algorithm is below:
struct cmp : public thrust::binary_function<int,int,bool>
{
cmp(const double *ptr) : rawA(ptr) { }
__host__ __device__ bool operator()(const int i, const int j) const
{return rawA[i] > rawA[j];}
const double *rawA; // an array in global mem
};
void sortkeys(double *A, int n) {
// move data to the gpu
thrust::device_vector<double> devA(A, A + n);
double *rawA = thrust::raw_pointer_cast(devA.data());
thrust::device_vector<int> B(n);
// initialize keys
thrust::sequence(B.begin(), B.end());
thrust::sort(B.begin(), B.end(), cmp(rawA));
// B now contains the sorted keys
}
And here is an alternative with ArrayFire, though I am not sure which one is more efficient, since the ArrayFire solution uses two additional arrays:
void sortkeys(double *A, int n) {
af::array devA(n, A, af::afHost);
af::array vals, indices;
// sort and populate vals/indices arrays
af::sort(vals, indices, devA);
std::cout << devA << "\n" << indices << "\n";
}
How large is this array? The most efficient way, in terms of speed, will likely be to just duplicate the original array before sorting, if the memory is available.
Building on the answer provided by @asm (I wasn't able to get it working), this code seemed to work for me and sorts only the keys. However, I believe it is limited to the case where the keys are the sequence 0, 1, 2, 3, 4, ... corresponding to the (double) values. Since this is an "index-value" sort, it could be extended to the case of an arbitrary sequence of keys, perhaps by doing an indexed copy. However, I'm not sure the process of generating the index sequence and then rearranging the original keys would be any faster than just copying the original value data to a new vector (for the case of arbitrary keys).
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
using namespace std;
__device__ double *rawA; // an array in global mem
struct cmp : public binary_function<int, int, bool>
{
__host__ __device__ bool operator()(const int i, const int j) const
{return ( rawA[i] < rawA[j]);}
};
void sortkeys(double *A, int n) {
// move data to the gpu
thrust::device_vector<double> devA(A, A + n);
// rawA = thrust::raw_pointer_cast(&(devA[0]));
double *test = raw_pointer_cast(devA.data());
cudaMemcpyToSymbol(rawA, &test, sizeof(double *));
thrust::device_vector<int> B(n);
// initialize keys
thrust::sequence(B.begin(), B.end());
thrust::sort(B.begin(), B.end(), cmp());
// B now contains the sorted keys
thrust::host_vector<int> hostB = B;
for (int i=0; i<hostB.size(); i++)
std::cout << hostB[i] << " ";
std::cout<<std::endl;
for (int i=0; i<hostB.size(); i++)
std::cout << A[hostB[i]] << " ";
std::cout<<std::endl;
}
int main(){
double C[] = {0.7, 0.3, 0.4, 0.2, 0.6, 1.2, -0.5, 0.5, 0.0, 10.0};
sortkeys(C, 9);
std::cout << std::endl;
return 0;
}
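For reference, since the call above passes n = 9, the final 10.0 never reaches the device. If I have read the comparator correctly, the program should print:
6 8 3 1 2 7 4 0 5
-0.5 0 0.2 0.3 0.4 0.5 0.6 0.7 1.2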
