Related
Currently I'm learning how to create games (at a low level) at my degree. I'm programming on Ubuntu mate 16.04, Codeblocks 13.12 and this happen:
-------------- Build: Debug in s04 (compiler: GNU GCC Compiler)---------------
g++ -o bin/Debug/s04 obj/Debug/main.o obj/Debug/Pantalla.o
obj/Debug/main.o: file not recognized: File format not recognized
collect2: error: ld returned 1 exit status
Process terminated with status 1 (0 minute(s), 0 second(s))
0 error(s), 0 warning(s) (0 minute(s), 0 second(s))
I must create a new window where my "game" will run... (I add code)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "Pantalla.h"
//Ej.1
struct BalaRep
{
int x;
int y;
int vx;
int vy;
};
typedef struct BalaRep * Bala;
//Ej.2
Bala crea_bala ( double x, double y, double vx, double vy )
{
Bala b=malloc(sizeof(struct BalaRep));
b->x = x;
b->y = y;
b->vx = vx;
b->vy = vy;
return b;
}
//Ej.3
void libera_bala( Bala b )
{
free(b);
}
//Ej.4
void mueve_bala( Bala b )
{
b->x = (b->x)+ (b->vx);
b->y = (b->y)+ (b->vy);
}
//Ej.5
void dibuja_bala( Bala b )
{
Pantalla_DibujaRectangulo( b->x, b->y, 7, 7);
}
//Ej.6
/*
double get_x_bala( Bala b )
{
return b->x;
}
*/
//Ej.7
/*
double get_y_bala( Bala b )
{
return b->y;
}
*/
int main( int argc, char *argv[] )
{
Pantalla_Crea("Ejemplo 3", 640,480);
Pantalla_ColorTrazo(255,0,0, 255);
int x = 280;
int y = 425;
int x2 = 200;
int y2 = 100;
int vx2 = 5;
Bala b = NULL;
while ( Pantalla_Activa() )
{
//Crear bala
if (Pantalla_TeclaPulsada(SDL_SCANCODE_SPACE))
{
libera_bala(b);
b=NULL;
b=crea_bala(x,y,0,-10);
}
//Movimiento del rectángulo
if (Pantalla_TeclaPulsada(SDL_SCANCODE_RIGHT))
{
x = x + 5;
}
if (Pantalla_TeclaPulsada(SDL_SCANCODE_LEFT))
{
x = x - 5;
}
/*if (Pantalla_TeclaPulsada(SDL_SCANCODE_UP))
{
y = y - 5;
}
if (Pantalla_TeclaPulsada(SDL_SCANCODE_DOWN))
{
y = y + 5;
}*/
//Bordes no-salir
if (x > 640-80)
{
x = 640 - 80;
}
if (x < 0)
{
x = 0;
}
if (y > 480-40)
{
y = 480 - 40;
}
if (y < 0)
{
y = 0;
}
//Mov enemigo
x2 = x2 + vx2;
//Bordes no-salir enemigo
if (x2 > 640-80)
{
x2 = 640 - 80;
vx2 = vx2 * (-1);
}
if (x2 < 0)
{
x2 = 0;
vx2 = vx2 * (-1);
}
//BALA
if (b!=NULL)
{
mueve_bala(b);
}
if ((b!=NULL) && ((b->y) <= 0))
{
libera_bala(b);
b=NULL;
}
Pantalla_DibujaRellenoFondo( 255,255,255, 255 );
Pantalla_DibujaRectangulo( x, y, 80,40 );
Pantalla_DibujaRectangulo( x2, y2, 80,40 );
if (b!=NULL)
{
dibuja_bala(b);
}
Pantalla_Actualiza();
Pantalla_Espera(40);
}
Pantalla_Libera();
return 0;
}
There is a file that teachers give us to run it properly. Furthermore, my classmate run the same code (what I add) on his laptop and it works. Excuse me, I know my English is bad...
Obviously the object files are incompatible either because your build process is broken or if your professor just gives you the compiled object file, because it is not ABI compatible with your implementation i.e. compiler, OS, architecture.
I got this error in C++. I am trying to implement Strassen matrix multiplication with multi_array. I assign one array to another which they same dimension.
Like that A11[i][j][k] = A[i][j][k]. I think reason is that kind of lines.
Assertion failed: (size_type(idx - index_bases[0]) < extents[0]),
function access, file
/usr/local/Cellar/boost/1.65.1/include/boost/multi_array/base.hpp,
line 136. Abort trap: 6
Do you know the reason? What does this error mean?
typedef boost::multi_array<int_type, 3> array_type;
array_type::extent_gen extents;
array_type A(boost::extents[size][size][noc]);
array_type B(boost::extents[size][size][noc]);
array_type C(boost::extents[size][size][noc]);
std::fill( A.origin(), A.origin() + A.num_elements(), 0 );
std::fill( B.origin(), B.origin() + B.num_elements(), 0 );
std::fill( C.origin(), C.origin() + C.num_elements(), 0 );
array_type Strr(int size,int noc,array_type A,array_type B, array_type C) {
if(size == 2) { //2-order
C=Matrix_Multiply(size,noc, A, B, C);
} else {
//
for(int i=0; i<size/2; i++) {
for(int j=0; j<size/2; j++) {
for(int k=0; k<noc; j++) {
A11[i][j][k] = A[i][j][k] ;
A12[i][j][k] = A[i][j+size/2][k] ;
}
}
}
My code is like that: I do not know what the problem is.
Error:Assertion failed: (size_type(idx - index_bases[0]) < extents[0]), function access, file /usr/local/Cellar/boost/1.65.1/include/boost/multi_array/base.hpp, line 136.
In the inner most loop you have:
for (int k = 0; k < noc; j++) {
You must have meant ++k instead of ++j:
for (int k = 0; k < noc; ++k) {
I'd simplify main too:
int dim[] = {size,size,noc};
array_type A(dim), B(dim), C(dim);
Value-initialization is done by default.
The idea of multi_array is that the arrays self-describe, instead of you passing separate parameters (size and noc e.g.):
array_type Strr(array_type A, array_type B) {
static_assert(array_type::dimensionality == 3, "static invariant");
size_t size = A.shape()[0];
size_t noc = A.shape()[2];
assert(A.shape()[0] == A.shape()[1]);
assert(std::equal_range(A.shape(), A.shape()+3, B.shape()));
assert(std::equal_range(A.shape(), A.shape()+3, C.shape()));
I was investigating the delay_ms function of avr-gcc. In delay.h I found its definition:
void _delay_ms(double __ms)
{
double __tmp ;
#if __HAS_DELAY_CYCLES && defined(__OPTIMIZE__) && \
!defined(__DELAY_BACKWARD_COMPATIBLE__) && \
__STDC_HOSTED__
uint32_t __ticks_dc;
extern void __builtin_avr_delay_cycles(unsigned long);
__tmp = ((F_CPU) / 1e3) * __ms;
#if defined(__DELAY_ROUND_DOWN__)
__ticks_dc = (uint32_t)fabs(__tmp);
#elif defined(__DELAY_ROUND_CLOSEST__)
__ticks_dc = (uint32_t)(fabs(__tmp)+0.5);
#else
//round up by default
__ticks_dc = (uint32_t)(ceil(fabs(__tmp)));
#endif
__builtin_avr_delay_cycles(__ticks_dc);
#else
...
}
I am interested in how the __builtin_avr_delay_cycles function looks like internally and where it is defined? Where can I find the source?
As said in my comment to this very question on electronics.SE:
Compiler builtins are kinda funky to find, always, because they are not just C functions, but things that get inserted while parsing/compiling the code (at various levels of abstraction from the textual representation of the code itself. compiler theory stuff). What you're looking for is the function avr_expand_builtin in the GCC source tree. There's a case AVR_BUILTIN_DELAY_CYCLES in there. Look for what happens there.
Which is:
/* Implement `TARGET_EXPAND_BUILTIN'. */
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
SUBTARGET may be used as the target for computing one of EXP's operands.
IGNORE is nonzero if the value is to be ignored. */
static rtx
avr_expand_builtin (tree exp, rtx target,
rtx subtarget ATTRIBUTE_UNUSED,
machine_mode mode ATTRIBUTE_UNUSED,
int ignore)
{
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
const char *bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
unsigned int id = DECL_FUNCTION_CODE (fndecl);
const struct avr_builtin_description *d = &avr_bdesc[id];
tree arg0;
rtx op0;
gcc_assert (id < AVR_BUILTIN_COUNT);
switch (id)
{
case AVR_BUILTIN_NOP:
emit_insn (gen_nopv (GEN_INT (1)));
return 0;
case AVR_BUILTIN_DELAY_CYCLES:
{
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
if (!CONST_INT_P (op0))
error ("%s expects a compile time integer constant", bname);
else
avr_expand_delay_cycles (op0);
return NULL_RTX;
}
…
thus, the function you're looking for is avr_expand_delay_cycles in the same file:
static void
avr_expand_delay_cycles (rtx operands0)
{
unsigned HOST_WIDE_INT cycles = UINTVAL (operands0) & GET_MODE_MASK (SImode);
unsigned HOST_WIDE_INT cycles_used;
unsigned HOST_WIDE_INT loop_count;
if (IN_RANGE (cycles, 83886082, 0xFFFFFFFF))
{
loop_count = ((cycles - 9) / 6) + 1;
cycles_used = ((loop_count - 1) * 6) + 9;
emit_insn (gen_delay_cycles_4 (gen_int_mode (loop_count, SImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
if (IN_RANGE (cycles, 262145, 83886081))
{
loop_count = ((cycles - 7) / 5) + 1;
if (loop_count > 0xFFFFFF)
loop_count = 0xFFFFFF;
cycles_used = ((loop_count - 1) * 5) + 7;
emit_insn (gen_delay_cycles_3 (gen_int_mode (loop_count, SImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
if (IN_RANGE (cycles, 768, 262144))
{
loop_count = ((cycles - 5) / 4) + 1;
if (loop_count > 0xFFFF)
loop_count = 0xFFFF;
cycles_used = ((loop_count - 1) * 4) + 5;
emit_insn (gen_delay_cycles_2 (gen_int_mode (loop_count, HImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
if (IN_RANGE (cycles, 6, 767))
{
loop_count = cycles / 3;
if (loop_count > 255)
loop_count = 255;
cycles_used = loop_count * 3;
emit_insn (gen_delay_cycles_1 (gen_int_mode (loop_count, QImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
while (cycles >= 2)
{
emit_insn (gen_nopv (GEN_INT (2)));
cycles -= 2;
}
if (cycles == 1)
{
emit_insn (gen_nopv (GEN_INT (1)));
cycles--;
}
}
Of biggest interest here is that this modifies a node in the Abstract Syntax Tree, and emits instructions there.
I'm building a CUDA kernel to compute the numerical N*N jacobian of a function, using finite differences; in the example I provided, it is the square function (each entry of the vector is squared). The host coded allocates in linear memory, while I'm using a 2-dimensional indexing in the kernel.
My issue is that I haven't found a way to sum on the diagonal of the matrices cudaMalloc'ed. My attempt has been to use the statement threadIdx.x == blockIdx.x as a condition for the diagonal, but instead it evaluates to true only for them both at 0.
Here is the kernel and EDIT: I posted the whole code as an answer, based on the suggestions in the comments (the main() is basically the same, while the kernel is not)
template <typename T>
__global__ void jacobian_kernel (
T * J,
const T t0,
const T tn,
const T h,
const T * u0,
const T * un,
const T * un_old)
{
T cgamma = 2 - sqrtf(2);
const unsigned int t = threadIdx.x;
const unsigned int b = blockIdx.x;
const unsigned int tid = t + b * blockDim.x;
/*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
/*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
__shared__ T sm_temp_du[BLOCK_SIZE];
T* temp_du = &sm_temp_du[0];
if (tid < N )
{
temp_sx[b][t] = un[t];
temp_dx[b][t] = un[t];
if ( t == b )
{
if ( tn == t0 )
{
temp_du[t] = u0[t]*0.001;
temp_sx[b][t] += temp_du[t]; //(*)
temp_dx[b][t] -= temp_du[t];
temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );
}
else
{
temp_du[t] = MAX( un[t] - un_old[t], 10e-6 );
temp_sx[b][t] += temp_du[t];
temp_dx[b][t] -= temp_du[t];
}
}
__syncthreads();
//J = f(tn, un + du)
d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
__syncthreads();
J[tid] = (temp_sx[b][t] - temp_dx[b][t]) * powf((2 * temp_du[t]), -1);
//J[tid]*= - h*cgamma/2;
//J[tid]+= ( t == b ? 1 : 0);
//J[tid] = temp_J[tid];
}
}
The general procedure for computing the jacobian is
Copy un into every row of temp_sx and temp_dx
Compute du as a 0.01 magnitude from u0
Sum du to the diagonal of temp_sx, subtract du from the diagonal of temp_dx
Compute the square function on each entry of temp_sx and temp_dx
Subtract them and divide every entry by 2*du
This procedure can be summarized with (f(un + du*e_i) - f(un - du*e_i))/2*du.
My problem is to sum du to the diagonal of the matrices of temp_sx and temp_dx like I tried in (*). How can I achieve that?
EDIT: Now calling 1D blocks and threads; in fact, .y axis wasn't used at all in the kernel. I'm calling the kernel with a fixed amount of shared memory
Note that in int main() I'm calling the kernel with
#define REAL sizeof(float)
#define N 32
#define BLOCK_SIZE 16
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)
...
dim3 dimGrid(NUM_BLOCKS,);
dim3 dimBlock(BLOCK_SIZE);
size_t shm_size = N*N*REAL;
jacobian_kernel <<< dimGrid, dimBlock, size_t shm_size >>> (...);
So that I attempt to deal with block-splitting the function calls. In the kernel to sum on the diagonal I used if(threadIdx.x == blockIdx.x){...}. Why isn't this correct? I'm asking it because while debugging and making the code print the statement, It only evaluates true if they both are 0. Thus du[0] is the only numerical value and the matrix becomes nan. Note that this approach worked with the first code I built, where instead I called the kernel with
jacobian_kernel <<< N, N >>> (...)
So that when threadIdx.x == blockIdx.x the element is on the diagonal. This approach doesn't fit anymore though, since now I need to deal with larger N (possibly larger than 1024, which is the maximum number of threads per block).
What statement should I put there that works even if the matrices are split into blocks and threads?
Let me know if I should share some other info.
Here is how I managed to solve my problem, based on the suggestion in the comments on the answer. The example is compilable, provided you put helper_cuda.h and helper_string.h in the same directory or you add -I directive to the CUDA examples include path, installed along with the CUDA toolkit. The relevant changes are only in the kernel; there's a minor change in the main() though, since I was calling double the resources to execute the kernel, but the .y axis of the grid of thread blocks wasn't even used at all, so it didn't generate any error.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include "helper_string.h"
#include <fstream>
#ifndef MAX
#define MAX(a,b) ((a > b) ? a : b)
#endif
#define REAL sizeof(float)
#define N 128
#define BLOCK_SIZE 128
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)
template <typename T>
inline void printmatrix( T mat, int rows, int cols);
template <typename T>
__global__ void jacobian_kernel ( const T * A, T * J, const T t0, const T tn, const T h, const T * u0, const T * un, const T * un_old);
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h = 1);
template<typename T>
int main ()
{
float t0 = 0.; //float tn = 0.;
float h = 0.1;
float* u0 = (float*)malloc(REAL*N); for(int i = 0; i < N; ++i){u0[i] = i+1;}
float* un = (float*)malloc(REAL*N); memcpy(un, u0, REAL*N);
float* un_old = (float*)malloc(REAL*N); memcpy(un_old, u0, REAL*N);
float* J = (float*)malloc(REAL*N*N);
float* A = (float*)malloc(REAL*N*N); host_heat_matrix(A);
float *d_u0;
float *d_un;
float *d_un_old;
float *d_J;
float *d_A;
checkCudaErrors(cudaMalloc((void**)&d_u0, REAL*N)); //printf("1: %p\n", d_u0);
checkCudaErrors(cudaMalloc((void**)&d_un, REAL*N)); //printf("2: %p\n", d_un);
checkCudaErrors(cudaMalloc((void**)&d_un_old, REAL*N)); //printf("3: %p\n", d_un_old);
checkCudaErrors(cudaMalloc((void**)&d_J, REAL*N*N)); //printf("4: %p\n", d_J);
checkCudaErrors(cudaMalloc((void**)&d_A, REAL*N*N)); //printf("4: %p\n", d_J);
checkCudaErrors(cudaMemcpy(d_u0, u0, REAL*N, cudaMemcpyHostToDevice)); assert(d_u0 != NULL);
checkCudaErrors(cudaMemcpy(d_un, un, REAL*N, cudaMemcpyHostToDevice)); assert(d_un != NULL);
checkCudaErrors(cudaMemcpy(d_un_old, un_old, REAL*N, cudaMemcpyHostToDevice)); assert(d_un_old != NULL);
checkCudaErrors(cudaMemcpy(d_J, J, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_J != NULL);
checkCudaErrors(cudaMemcpy(d_A, A, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_A != NULL);
dim3 dimGrid(NUM_BLOCKS); std::cout << "NUM_BLOCKS \t" << dimGrid.x << "\n";
dim3 dimBlock(BLOCK_SIZE); std::cout << "BLOCK_SIZE \t" << dimBlock.x << "\n";
size_t shm_size = N*REAL; //std::cout << shm_size << "\n";
//HERE IS A RELEVANT CHANGE OF THE MAIN, SINCE I WAS CALLING
//THE KERNEL WITH A 2D GRID BUT WITHOUT USING THE .y AXIS,
//WHILE NOW THE GRID IS 1D
jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (d_A, d_J, t0, t0, h, d_u0, d_un, d_un_old);
checkCudaErrors(cudaMemcpy(J, d_J, REAL*N*N, cudaMemcpyDeviceToHost)); //printf("4: %p\n", d_J);
printmatrix( J, N, N);
checkCudaErrors(cudaDeviceReset());
free(u0);
free(un);
free(un_old);
free(J);
}
template <typename T>
__global__ void jacobian_kernel (
const T * A,
T * J,
const T t0,
const T tn,
const T h,
const T * u0,
const T * un,
const T * un_old)
{
T cgamma = 2 - sqrtf(2);
const unsigned int t = threadIdx.x;
const unsigned int b = blockIdx.x;
const unsigned int tid = t + b * blockDim.x;
/*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
/*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
__shared__ T sm_temp_du;
T* temp_du = &sm_temp_du;
//HERE IS A RELEVANT CHANGE (*)
if ( t < BLOCK_SIZE && b < NUM_BLOCKS )
{
temp_sx[b][t] = un[t]; //printf("temp_sx[%d] = %f\n", t,(temp_sx[b][t]));
temp_dx[b][t] = un[t];
//printf("t = %d, b = %d, t + b * blockDim.x = %d \n",t, b, tid);
//HERE IS A NOTE (**)
if ( t == b )
{
//printf("t = %d, b = %d \n",t, b);
if ( tn == t0 )
{
*temp_du = u0[t]*0.001;
temp_sx[b][t] += *temp_du;
temp_dx[b][t] -= *temp_du;
temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );
temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );
}
else
{
*temp_du = MAX( un[t] - un_old[t], 10e-6 );
temp_sx[b][t] += *temp_du;
temp_dx[b][t] -= *temp_du;
}
;
}
//printf("du[%d] %f\n", tid, (*temp_du));
__syncthreads();
//printf("temp_sx[%d][%d] = %f\n", b, t, temp_sx[b][t]);
//printf("temp_dx[%d][%d] = %f\n", b, t, temp_dx[b][t]);
//d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
//d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
matvec_dev( tn, A, (temp_sx[b]), (temp_sx[b]), N, N, 1.f );
matvec_dev( tn, A, (temp_dx[b]), (temp_dx[b]), N, N, 1.f );
__syncthreads();
//printf("temp_sx_later[%d][%d] = %f\n", b, t, (temp_sx[b][t]));
//printf("temp_sx_later[%d][%d] - temp_dx_later[%d][%d] = %f\n", b,t,b,t, (temp_sx[b][t] - temp_dx[b][t]) / 2 * *temp_du);
//if (t == b ) printf( "2du[%d]^-1 = %f\n",t, powf((2 * *temp_du), -1));
J[tid] = (temp_sx[b][t] - temp_dx[b][t]) / (2 * *temp_du);
}
}
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h )
{
__shared__ float temp_u;
temp_u = u[threadIdx.x];
res[threadIdx.x] = h*powf( (temp_u), 2);
}
template <typename T>
inline void printmatrix( T mat, int rows, int cols)
{
std::ofstream matrix_out;
matrix_out.open( "heat_matrix.txt", std::ofstream::out);
for( int i = 0; i < rows; i++)
{
for( int j = 0; j <cols; j++)
{
double next = mat[i + N*j];
matrix_out << ( (next >= 0) ? " " : "") << next << " ";
}
matrix_out << "\n";
}
}
The relevant change is on (*). Before I used if (tid < N) which has two downsides:
First, it is wrong, since it should be tid < N*N, as my data is 2D, while tid is a global index which tracks all the data.
Even if I wrote tid < N*N, since I'm splitting the function calls into blocks, the t < BLOCK_SIZE && b < NUM_BLOCKS seems clearer to me in how the indexing is arranged in the code.
Moreover, the statement t == b in (**) is actually the right one to operate on the diagonal elements of the matrix. The fact that it was evaluated true only on 0 was because of my error right above.
Thanks for the suggestions!
I have a list of N 64-bit integers whose bits represent small sets. Each integer has at most k bits set to 1. Given a bit mask, I would like to find the first element in the list that matches the mask, i.e. element & mask == element.
Example:
If my list is:
index abcdef
0 001100
1 001010
2 001000
3 000100
4 000010
5 000001
6 010000
7 100000
8 000000
and my mask is 111000, the first element matching the mask is at index 2.
Method 1:
Linear search through the entire list. This takes O(N) time and O(1) space.
Method 2:
Precompute a tree of all possible masks, and at each node keep the answer for that mask. This takes O(1) time for the query, but takes O(2^64) space.
Question:
How can I find the first element matching the mask faster than O(N), while still using a reasonable amount of space? I can afford to spend polynomial time in precomputation, because there will be a lot of queries. The key is that k is small. In my application, k <= 5 and N is in the thousands. The mask has many 1s; you can assume that it is drawn uniformly from the space of 64-bit integers.
Update:
Here is an example data set and a simple benchmark program that runs on Linux: http://up.thirld.com/binmask.tar.gz. For large.in, N=3779 and k=3. The first line is N, followed by N unsigned 64-bit ints representing the elements. Compile with make. Run with ./benchmark.e >large.out to create the true output, which you can then diff against. (Masks are generated randomly, but the random seed is fixed.) Then replace the find_first() function with your implementation.
The simple linear search is much faster than I expected. This is because k is small, and so for a random mask, a match is found very quickly on average.
A suffix tree (on bits) will do the trick, with the original priority at the leaf nodes:
000000 -> 8
1 -> 5
10 -> 4
100 -> 3
1000 -> 2
10 -> 1
100 -> 0
10000 -> 6
100000 -> 7
where if the bit is set in the mask, you search both arms, and if not, you search only the 0 arm; your answer is the minimum number you encounter at a leaf node.
You can improve this (marginally) by traversing the bits not in order but by maximum discriminability; in your example, note that 3 elements have bit 2 set, so you would create
2:0 0:0 1:0 3:0 4:0 5:0 -> 8
5:1 -> 5
4:1 5:0 -> 4
3:1 4:0 5:0 -> 3
1:1 3:0 4:0 5:0 -> 6
0:1 1:0 3:0 4:0 5:0 -> 7
2:1 0:0 1:0 3:0 4:0 5:0 -> 2
4:1 5:0 -> 1
3:1 4:0 5:0 -> 0
In your example mask this doesn't help (since you have to traverse both the bit2==0 and bit2==1 sides since your mask is set in bit 2), but on average it will improve the results (but at a cost of setup and more complex data structure). If some bits are much more likely to be set than others, this could be a huge win. If they're pretty close to random within the element list, then this doesn't help at all.
If you're stuck with essentially random bits set, you should get about (1-5/64)^32 benefit from the suffix tree approach on average (13x speedup), which might be better than the difference in efficiency due to using more complex operations (but don't count on it--bit masks are fast). If you have a nonrandom distribution of bits in your list, then you could do almost arbitrarily well.
This is the bitwise Kd-tree. It typically needs less than 64 visits per lookup operation. Currently, the selection of the bit (dimension) to pivot on is random.
#include <limits.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
typedef unsigned long long Thing;
typedef unsigned long Number;
unsigned thing_ffs(Thing mask);
Thing rand_mask(unsigned bitcnt);
#define WANT_RANDOM 31
#define WANT_BITS 3
#define BITSPERTHING (CHAR_BIT*sizeof(Thing))
#define NONUMBER ((Number)-1)
struct node {
Thing value;
Number num;
Number nul;
Number one;
char pivot;
} *nodes = NULL;
unsigned nodecount=0;
unsigned itercount=0;
struct node * nodes_read( unsigned *sizp, char *filename);
Number *find_ptr_to_insert(Number *ptr, Thing value, Thing mask);
unsigned grab_matches(Number *result, Number num, Thing mask);
void initialise_stuff(void);
int main (int argc, char **argv)
{
Thing mask;
Number num;
unsigned idx;
srand (time(NULL));
nodes = nodes_read( &nodecount, argv[1]);
fprintf( stdout, "Nodecount=%u\n", nodecount );
initialise_stuff();
#if WANT_RANDOM
mask = nodes[nodecount/2].value | nodes[nodecount/3].value ;
#else
mask = 0x38;
#endif
fprintf( stdout, "\n#### Search mask=%llx\n", (unsigned long long) mask );
itercount = 0;
num = NONUMBER;
idx = grab_matches(&num,0, mask);
fprintf( stdout, "Itercount=%u\n", itercount );
fprintf(stdout, "KdTree search %16llx\n", (unsigned long long) mask );
fprintf(stdout, "Count=%u Result:\n", idx);
idx = num;
if (idx >= nodecount) idx = nodecount-1;
fprintf( stdout, "num=%4u Value=%16llx\n"
,(unsigned) nodes[idx].num
,(unsigned long long) nodes[idx].value
);
fprintf( stdout, "\nLinear search %16llx\n", (unsigned long long) mask );
for (idx = 0; idx < nodecount; idx++) {
if ((nodes[idx].value & mask) == nodes[idx].value) break;
}
fprintf(stdout, "Cnt=%u\n", idx);
if (idx >= nodecount) idx = nodecount-1;
fprintf(stdout, "Num=%4u Value=%16llx\n"
, (unsigned) nodes[idx].num
, (unsigned long long) nodes[idx].value );
return 0;
}
void initialise_stuff(void)
{
unsigned num;
Number root, *ptr;
root = 0;
for (num=0; num < nodecount; num++) {
nodes[num].num = num;
nodes[num].one = NONUMBER;
nodes[num].nul = NONUMBER;
nodes[num].pivot = -1;
}
nodes[num-1].value = 0; /* last node is guaranteed to match anything */
root = 0;
for (num=1; num < nodecount; num++) {
ptr = find_ptr_to_insert (&root, nodes[num].value, 0ull );
if (*ptr == NONUMBER) *ptr = num;
else fprintf(stderr, "Found %u for %u\n"
, (unsigned)*ptr, (unsigned) num );
}
}
Thing rand_mask(unsigned bitcnt)
{struct node * nodes_read( unsigned *sizp, char *filename)
{
struct node *ptr;
unsigned size,used;
FILE *fp;
if (!filename) {
size = (WANT_RANDOM+0) ? WANT_RANDOM : 9;
ptr = malloc (size * sizeof *ptr);
#if (!WANT_RANDOM)
ptr[0].value = 0x0c;
ptr[1].value = 0x0a;
ptr[2].value = 0x08;
ptr[3].value = 0x04;
ptr[4].value = 0x02;
ptr[5].value = 0x01;
ptr[6].value = 0x10;
ptr[7].value = 0x20;
ptr[8].value = 0x00;
#else
for (used=0; used < size; used++) {
ptr[used].value = rand_mask(WANT_BITS);
}
#endif /* WANT_RANDOM */
*sizp = size;
return ptr;
}
fp = fopen( filename, "r" );
if (!fp) return NULL;
fscanf(fp,"%u\n", &size );
fprintf(stderr, "Size=%u\n", size);
ptr = malloc (size * sizeof *ptr);
for (used = 0; used < size; used++) {
fscanf(fp,"%llu\n", &ptr[used].value );
}
fclose( fp );
*sizp = used;
return ptr;
}
Thing value = 0;
unsigned bit, cnt;
for (cnt=0; cnt < bitcnt; cnt++) {
bit = 54321*rand();
bit %= BITSPERTHING;
value |= 1ull << bit;
}
return value;
}
Number *find_ptr_to_insert(Number *ptr, Thing value, Thing done)
{
Number num=NONUMBER;
while ( *ptr != NONUMBER) {
Thing wrong;
num = *ptr;
wrong = (nodes[num].value ^ value) & ~done;
if (nodes[num].pivot < 0) { /* This node is terminal */
/* choose one of the wrong bits for a pivot .
** For this bit (nodevalue==1 && searchmask==0 )
*/
if (!wrong) wrong = ~done ;
nodes[num].pivot = thing_ffs( wrong );
}
ptr = (wrong & 1ull << nodes[num].pivot) ? &nodes[num].nul : &nodes[num].one;
/* Once this bit has been tested, it can be masked off. */
done |= 1ull << nodes[num].pivot ;
}
return ptr;
}
unsigned grab_matches(Number *result, Number num, Thing mask)
{
Thing wrong;
unsigned count;
for (count=0; num < *result; ) {
itercount++;
wrong = nodes[num].value & ~mask;
if (!wrong) { /* we have a match */
if (num < *result) { *result = num; count++; }
/* This is cheap pruning: the break will omit both subtrees from the results.
** But because we already have a result, and the subtrees have higher numbers
** than our current num, we can ignore them. */
break;
}
if (nodes[num].pivot < 0) { /* This node is terminal */
break;
}
if (mask & 1ull << nodes[num].pivot) {
/* avoid recursion if there is only one non-empty subtree */
if (nodes[num].nul >= *result) { num = nodes[num].one; continue; }
if (nodes[num].one >= *result) { num = nodes[num].nul; continue; }
count += grab_matches(result, nodes[num].nul, mask);
count += grab_matches(result, nodes[num].one, mask);
break;
}
mask |= 1ull << nodes[num].pivot;
num = (wrong & 1ull << nodes[num].pivot) ? nodes[num].nul : nodes[num].one;
}
return count;
}
unsigned thing_ffs(Thing mask)
{
unsigned bit;
#if 1
if (!mask) return (unsigned)-1;
for ( bit=random() % BITSPERTHING; 1 ; bit += 5, bit %= BITSPERTHING) {
if (mask & 1ull << bit ) return bit;
}
#elif 0
for (bit =0; bit < BITSPERTHING; bit++ ) {
if (mask & 1ull <<bit) return bit;
}
#else
mask &= (mask-1); // Kernighan-trick
for (bit =0; bit < BITSPERTHING; bit++ ) {
mask >>=1;
if (!mask) return bit;
}
#endif
return 0xffffffff;
}
struct node * nodes_read( unsigned *sizp, char *filename)
{
struct node *ptr;
unsigned size,used;
FILE *fp;
if (!filename) {
size = (WANT_RANDOM+0) ? WANT_RANDOM : 9;
ptr = malloc (size * sizeof *ptr);
#if (!WANT_RANDOM)
ptr[0].value = 0x0c;
ptr[1].value = 0x0a;
ptr[2].value = 0x08;
ptr[3].value = 0x04;
ptr[4].value = 0x02;
ptr[5].value = 0x01;
ptr[6].value = 0x10;
ptr[7].value = 0x20;
ptr[8].value = 0x00;
#else
for (used=0; used < size; used++) {
ptr[used].value = rand_mask(WANT_BITS);
}
#endif /* WANT_RANDOM */
*sizp = size;
return ptr;
}
fp = fopen( filename, "r" );
if (!fp) return NULL;
fscanf(fp,"%u\n", &size );
fprintf(stderr, "Size=%u\n", size);
ptr = malloc (size * sizeof *ptr);
for (used = 0; used < size; used++) {
fscanf(fp,"%llu\n", &ptr[used].value );
}
fclose( fp );
*sizp = used;
return ptr;
}
UPDATE:
I experimented a bit with the pivot-selection, favouring bits with the highest discriminatory value ("information content"). This involves:
making a histogram of the usage of bits (can be done while initialising)
while building the tree: choosing the one with frequency closest to 1/2 in the remaining subtrees.
The result: the random pivot selection performed better.
Construct a a binary tree as follows:
Every level corresponds to a bit
It corresponding bit is on go right, otherwise left
This way insert every number in the database.
Now, for searching: if the corresponding bit in the mask is 1, traverse both children. If it is 0, traverse only the left node. Essentially keep traversing the tree until you hit the leaf node (BTW, 0 is a hit for every mask!).
This tree will have O(N) space requirements.
Eg of tree for 1 (001), 2(010) and 5 (101)
root
/ \
0 1
/ \ |
0 1 0
| | |
1 0 1
(1) (2) (5)
With precomputed bitmasks. Formally is is still O(N), since the and-mask operations are O(N). The final pass is also O(N), because it needs to find the lowest bit set, but that could be sped up, too.
#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* For demonstration purposes.
** In reality, this should be an unsigned long long */
typedef unsigned char Thing;
#define BITSPERTHING (CHAR_BIT*sizeof (Thing))
#define COUNTOF(a) (sizeof a / sizeof a[0])
Thing data[] =
/****** index abcdef */
{ 0x0c /* 0 001100 */
, 0x0a /* 1 001010 */
, 0x08 /* 2 001000 */
, 0x04 /* 3 000100 */
, 0x02 /* 4 000010 */
, 0x01 /* 5 000001 */
, 0x10 /* 6 010000 */
, 0x20 /* 7 100000 */
, 0x00 /* 8 000000 */
};
/* Note: this is for demonstration purposes.
** Normally, one should choose a machine wide unsigned int
** for bitmask arrays.
*/
struct bitmap {
char data[ 1+COUNTOF (data)/ CHAR_BIT ];
} nulmaps [ BITSPERTHING ];
#define BITSET(a,i) (a)[(i) / CHAR_BIT ] |= (1u << ((i)%CHAR_BIT) )
#define BITTEST(a,i) ((a)[(i) / CHAR_BIT ] & (1u << ((i)%CHAR_BIT) ))
void init_tabs(void);
void map_empty(struct bitmap *dst);
void map_full(struct bitmap *dst);
void map_and2(struct bitmap *dst, struct bitmap *src);
int main (void)
{
Thing mask;
struct bitmap result;
unsigned ibit;
mask = 0x38;
init_tabs();
map_full(&result);
for (ibit = 0; ibit < BITSPERTHING; ibit++) {
/* bit in mask is 1, so bit at this position is in fact a don't care */
if (mask & (1u <<ibit)) continue;
/* bit in mask is 0, so we can only select items with a 0 at this bitpos */
map_and2(&result, &nulmaps[ibit] );
}
/* This is not the fastest way to find the lowest 1 bit */
for (ibit = 0; ibit < COUNTOF (data); ibit++) {
if (!BITTEST(result.data, ibit) ) continue;
fprintf(stdout, " %u", ibit);
}
fprintf( stdout, "\n" );
return 0;
}
void init_tabs(void)
{
unsigned ibit, ithing;
/* 1 bits in data that dont overlap with 1 bits in the searchmask are showstoppers.
** So, for each bitpos, we precompute a bitmask of all *entrynumbers* from data[], that contain 0 in bitpos.
*/
memset(nulmaps, 0 , sizeof nulmaps);
for (ithing=0; ithing < COUNTOF(data); ithing++) {
for (ibit=0; ibit < BITSPERTHING; ibit++) {
if ( data[ithing] & (1u << ibit) ) continue;
BITSET(nulmaps[ibit].data, ithing);
}
}
}
/* Logical And of two bitmask arrays; simular to dst &= src */
void map_and2(struct bitmap *dst, struct bitmap *src)
{
unsigned idx;
for (idx = 0; idx < COUNTOF(dst->data); idx++) {
dst->data[idx] &= src->data[idx] ;
}
}
void map_empty(struct bitmap *dst)
{
memset(dst->data, 0 , sizeof dst->data);
}
void map_full(struct bitmap *dst)
{
unsigned idx;
/* NOTE this loop sets too many bits to the left of COUNTOF(data) */
for (idx = 0; idx < COUNTOF(dst->data); idx++) {
dst->data[idx] = ~0;
}
}