Closed. This question is opinion-based. It is not currently accepting answers.
Want to improve this question? Update the question so it can be answered with facts and citations by editing this post.
Closed last month.
Improve this question
Is there a performant way to generate an unbiased 64b random integer without 3 set bits in a row, assuming a fast-and-unbiased input PRNG? I don't care about 'wasting bits' of the input source.
That is, something better than the naive rejection-sampling approach:
uint64_t r;
do {
r = get_rand_64();
} while (r & (r >> 1) & (r >> 2));
...which "works", but is very slow. It looks like it's iterating ~187x on average or so.
One possibility I've explored is roughly:
bool p2 = get_rand_bit();
bool p1 = get_rand_bit();
uint64_t r = (p1 << 1) | p2;
for (int i = 2; i < 64; i++) {
bool p0 = (p1 && p2) ? false : get_rand_bit();
r |= p0 << i;
p2 = p1;
p1 = p0;
}
...however, this is still slow. Mainly because using this approach the entire calculation is bit-serial. EDIT: and it's also biased. Easiest to see with a 3-bit integer - 0b011 occurs 1/8th of the time, which is wrong (should be 1/7th).
I've tried doing various parallel fixups, but haven't been able to come up with anything unbiased. It's useful to play around with 4-bit integers first - e.g. setting all bits involved in a conflict to random values ends up biased, and drawing out the Markov chain for 4 bits makes that obvious
Is there a better way to do this?
I optimized the lexicographic decoder, resulting in a four-fold speedup relative to my previous answer. There are two new ideas:
Use the one-to-one correspondence implied by the recurrence T(n) = T(k−1) T(n−k) + T(k−2) T(n−k−1) + T(k−2) T(n−k−2) + T(k−3) T(n−k−1) to avoid working one bit at a time;
Cache the small words without 111 in addition to the recurrence values, incurring an L1 cache hit to save a number of arithmetic operations.
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
enum { kTribonacci14 = 5768 };
static uint64_t g_tribonacci[65];
static void InitTribonacci(void) {
for (unsigned i = 0; i < 65; i++) {
g_tribonacci[i] =
i < 3 ? 1 << i
: g_tribonacci[i - 1] + g_tribonacci[i - 2] + g_tribonacci[i - 3];
}
assert(g_tribonacci[14] == kTribonacci14);
}
static uint16_t g_words_no_111[kTribonacci14];
static void InitCachedWordsNo111(void) {
unsigned i = 0;
for (unsigned word = 0; word < ((unsigned)1 << 14); word++) {
if ((word & (word >> 1) & (word >> 2)) == 0) {
assert(i < kTribonacci14);
g_words_no_111[i++] = (uint16_t)word;
}
}
assert(i == kTribonacci14);
}
static bool CaseNo111(uint64_t *restrict result, unsigned *restrict n,
uint64_t *restrict index, unsigned left_n,
unsigned right_n) {
uint64_t left_count = g_tribonacci[left_n];
uint64_t right_count = g_tribonacci[right_n];
uint64_t product = left_count * right_count;
if (*index >= product) {
*index -= product;
return false;
}
*result = (*result << left_n) + g_words_no_111[*index / right_count];
*n = right_n;
*index %= right_count;
return true;
}
static void Append(uint64_t *result, uint64_t bit) {
*result = (*result << 1) + bit;
}
static uint64_t DecodeNo111(unsigned n, uint64_t index) {
assert(0 <= n && n <= 64);
assert(index < g_tribonacci[n]);
uint64_t result = 0;
while (n > 14) {
assert(g_tribonacci[n] == g_tribonacci[12] * g_tribonacci[n - 13] +
g_tribonacci[11] * g_tribonacci[n - 14] +
g_tribonacci[11] * g_tribonacci[n - 15] +
g_tribonacci[10] * g_tribonacci[n - 14]);
if (CaseNo111(&result, &n, &index, 12, n - 13)) {
Append(&result, 0);
} else if (CaseNo111(&result, &n, &index, 11, n - 14)) {
Append(&result, 0);
Append(&result, 1);
Append(&result, 0);
} else if (CaseNo111(&result, &n, &index, 11, n - 15)) {
Append(&result, 0);
Append(&result, 1);
Append(&result, 1);
Append(&result, 0);
} else if (CaseNo111(&result, &n, &index, 10, n - 14)) {
Append(&result, 0);
Append(&result, 1);
Append(&result, 1);
Append(&result, 0);
} else {
assert(false);
}
}
return (result << n) + g_words_no_111[index];
}
static void PrintWord(unsigned n, uint64_t word) {
assert(0 <= n && n <= 64);
while (n-- > 0) {
putchar('0' + ((word >> n) & 1));
}
putchar('\n');
}
int main(void) {
InitTribonacci();
InitCachedWordsNo111();
if ((false)) {
enum { kN = 20 };
for (uint64_t i = 0; i < g_tribonacci[kN]; i++) {
PrintWord(kN, DecodeNo111(kN, i));
}
}
uint64_t sum = 0;
uint64_t index = 0;
for (uint32_t i = 0; i < 10000000; i++) {
sum += DecodeNo111(64, index % g_tribonacci[64]);
index = (index * 2862933555777941757) + 3037000493;
}
return sum & 127;
}
From #John Coleman's comment, here's the start of an approach based on Tribonacci numbers. Basic idea:
Generate an unbiased number in the range [0..T(bits)), where T(0) = 1, T(1) = 2, T(2) = 4, T(n) = T(n-1) + T(n-2) + T(n-3).
Convert to Tribonacci representation.
You're done.
A minimal example is as follows:
// 1, 2, 4, TRIBO[n-3]+TRIBO[n-2]+TRIBO[n-1]
// possible minor perf optimization: reverse TRIBO
static const uint64_t TRIBO[65] = {1, 2, 4, 7, 13, 24, 44, 81, 149, 274, 504, 927, 1705, 3136, 5768, 10609, 19513, 35890, 66012, 121415, 223317, 410744, 755476, 1389537, 2555757, 4700770, 8646064, 15902591, 29249425, 53798080, 98950096, 181997601, 334745777, 615693474, 1132436852, 2082876103, 3831006429, 7046319384, 12960201916, 23837527729, 43844049029, 80641778674, 148323355432, 272809183135, 501774317241, 922906855808, 1697490356184, 3122171529233, 5742568741225, 10562230626642, 19426970897100, 35731770264967, 65720971788709, 120879712950776, 222332455004452, 408933139743937, 752145307699165, 1383410902447554, 2544489349890656, 4680045560037375, 8607945812375585, 15832480722303616, 29120472094716576, 53560898629395777, 98513851446415969];
// exclusive of max
extern uint64_t get_rand_64_range(uint64_t max);
uint64_t get_rand_no111(void) {
uint64_t idx = get_rand_64_range(TRIBO[64]);
uint64_t ret = 0;
for (int i = 63; i >= 0; i--) {
if (idx >= TRIBO[i]) {
ret |= ((uint64_t) 1) << i;
idx -= TRIBO[i];
}
// optional: if (idx == 0) {break;}
}
return ret;
}
(Warning: retyped from Python code. I suggest testing.)
This satisfies the 'unbiased' portion, and is indeed faster than the naive rejection-sampling approach, but unfortunately is still pretty slow, because it's looping ~64 times.
The idea behind the code below is to generate the upper 32 bits with the proper (non-uniform!) distribution, then generate the lower 32 conditional on the upper. On my laptop, it’s significantly faster than the baseline, and slightly faster than lexicographic decoding.
You can see the logic behind the non-uniform upper distribution with 4-bit outputs: 00 and 10 have four 2-bit lowers, 01 has three lowers, and 11 has two lowers.
#include <cstdint>
#include <random>
namespace {
using Generator = std::mt19937_64;
template <int bits> std::uint64_t GenerateUniform(Generator &gen) {
static_assert(0 <= bits && bits <= 63);
return gen() & ((std::uint64_t{1} << bits) - 1);
}
template <> std::uint64_t GenerateUniform<64>(Generator &gen) { return gen(); }
template <int bits> std::uint64_t GenerateNo111Baseline(Generator &gen) {
std::uint64_t r;
do {
r = GenerateUniform<bits>(gen);
} while (r & (r >> 1) & (r >> 2));
return r;
}
template <int bits> struct Tribonacci {
static constexpr std::uint64_t value = Tribonacci<bits - 1>::value +
Tribonacci<bits - 2>::value +
Tribonacci<bits - 3>::value;
};
template <> struct Tribonacci<0> { static constexpr std::uint64_t value = 1; };
template <> struct Tribonacci<-1> { static constexpr std::uint64_t value = 1; };
template <> struct Tribonacci<-2> { static constexpr std::uint64_t value = 0; };
template <int bits> std::uint64_t GenerateNo111(Generator &gen) {
constexpr int upper_bits = 16;
constexpr int lower_bits = bits - upper_bits;
const std::uint64_t upper = GenerateNo111Baseline<upper_bits>(gen);
for (;;) {
if ((upper & 1) == 0) {
return (upper << lower_bits) + GenerateNo111<lower_bits>(gen);
}
std::uint64_t outcome = std::uniform_int_distribution<std::uint64_t>{
0, Tribonacci<upper_bits>::value - 1}(gen);
if ((upper & 2) == 0) {
if (outcome < Tribonacci<upper_bits - 2>::value) {
return (upper << lower_bits) + (std::uint64_t{1} << (lower_bits - 1)) +
GenerateNo111<lower_bits - 2>(gen);
}
outcome -= Tribonacci<upper_bits - 2>::value;
}
if (outcome < Tribonacci<lower_bits - 1>::value) {
return (upper << lower_bits) + GenerateNo111<lower_bits - 1>(gen);
}
}
}
#define BASELINE(bits) \
template <> std::uint64_t GenerateNo111<bits>(Generator & gen) { \
return GenerateNo111Baseline<bits>(gen); \
}
BASELINE(0)
BASELINE(1)
BASELINE(2)
BASELINE(3)
BASELINE(4)
BASELINE(5)
BASELINE(6)
BASELINE(7)
BASELINE(8)
BASELINE(9)
BASELINE(10)
BASELINE(11)
BASELINE(12)
BASELINE(13)
BASELINE(14)
BASELINE(15)
BASELINE(16)
#undef BASELINE
static const std::uint64_t TRIBO[65] = {1,
2,
4,
7,
13,
24,
44,
81,
149,
274,
504,
927,
1705,
3136,
5768,
10609,
19513,
35890,
66012,
121415,
223317,
410744,
755476,
1389537,
2555757,
4700770,
8646064,
15902591,
29249425,
53798080,
98950096,
181997601,
334745777,
615693474,
1132436852,
2082876103,
3831006429,
7046319384,
12960201916,
23837527729,
43844049029,
80641778674,
148323355432,
272809183135,
501774317241,
922906855808,
1697490356184,
3122171529233,
5742568741225,
10562230626642,
19426970897100,
35731770264967,
65720971788709,
120879712950776,
222332455004452,
408933139743937,
752145307699165,
1383410902447554,
2544489349890656,
4680045560037375,
8607945812375585,
15832480722303616,
29120472094716576,
53560898629395777,
98513851446415969};
std::uint64_t get_rand_no111(Generator &gen) {
std::uint64_t idx =
std::uniform_int_distribution<std::uint64_t>{0, TRIBO[64] - 1}(gen);
std::uint64_t ret = 0;
for (int i = 63; i >= 0; --i) {
if (idx >= TRIBO[i]) {
ret |= std::uint64_t{1} << i;
idx -= TRIBO[i];
}
}
return ret;
}
} // namespace
int main() {
Generator gen{std::random_device{}()};
std::uint64_t sum = 0;
for (std::int32_t i = 0; i < 10000000; i++) {
if constexpr (true) {
sum += GenerateNo111<64>(gen);
} else {
sum += get_rand_no111(gen);
}
}
return sum & 127;
}
What about following simple idea:
Generate random r.
Find within this r window(s)-mask, contains 3 or more sequenced 1s.
If mask is 0 (no 3 or more sequenced bits) - return the r.
Substitute "incorrect" bits under that mask to new random ones.
Goto 2
Code sample (did not tested, compiled only):
uint64_t rand_no3() {
uint64_t r, mask;
for(r = get_rand_64() ; ; ) {
mask = r & (r >> 1) & (r >> 2);
mask |= (mask << 1) | (mask << 2);
if(mask == 0)
return r;
r ^= mask & get_rand_64();
}
}
Another variant of same code, with just single call get_rand_64():
uint64_t rand_no3() {
uint64_t r, mask = ~0ULL;
do {
r ^= mask & get_rand_64();
mask = r & (r >> 1) & (r >> 2);
mask |= (mask << 1) | (mask << 2);
} while(mask != 0);
return r;
}
I know, the last code does not init the r, but it is not matter, because of this variable will be overwritten in 1st loop iteration.
You could generate the number one bit at a time, keeping track of the number of consecutive set bits. Whenever you have two consecutive set bits, you insert an unset bit and set the count back to 0.
Suppose I have n circles of radius r. I want to place them randomly inside a rectangle of size AxA.
It is guaranteed that they fit. One can suppose that the sum of the area of all circles is about 60% of the area of the rectangle.
I can try it by doing a backtracking, trying to place, going back, etc., but there should be a better way to do it.
One possibility is to generate random points inside the rectangle without further constraints, and then move the points/centres iteratively (by little steps) such that avoiding overlapping. If two points are too near one from each other, each point can bring pressure to the other, to make it going away a little bit. The higher the pressure, the higher the move.
This process was implemented in C++. In the following simple code, to facilitate implementation, points and vectors are represented par std::complex type.
Note that I used srandand rand for test purpose. You may used better random algorithms, depending on your constraints.
According to the tests that I have performed, convergence seems guaranteed for a density of 60%. I also made some tests with a density of 70%: sometimes convergence, sometimes not.
Complexity is O(n^2 n_iter), where nis the number of circles and n_iterthe number of iterations.
n_iteris generally between 100 and 300, for a density of 60%. It could be decreased with relaxing the convergence criteria.
It could be seems high complexity, compared to other proposals in comments. In practice, for n = 15, the work is performed in less than 30ms on my PC. Huge time or fast enough, depending on the context. I have included a figure to illustrate the algorithm.
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <vector>
#include <ctime>
#include <complex>
#include <cmath>
#include <tuple>
#include <ios>
#include <iomanip>
using dcomplex = std::complex<double>;
void print (const std::vector<dcomplex>& centers) {
std::cout << std::setprecision (9);
std::cout << "\ncenters:\n";
for (auto& z: centers) {
std::cout << real(z) << ", " << imag(z) << "\n";
}
}
std::tuple<bool, int, double> process (double A, double R, std::vector<dcomplex>& centers, int n_iter_max = 100) {
bool check = true;
int n = centers.size();
std::vector<dcomplex> moves (n, 0.0);
double acceleration = 1.0001; // to accelerate the convergence, if density not too large
// could be made dependent of the iteration index
double dmin;
auto limit = [&] (dcomplex& z) {
double zx = real(z);
double zi = imag(z);
if (zx < R) zx = R;
if (zx > A-R) zx = A-R;
if (zi < R) zi = R;
if (zi > A-R) zi = A-R;
return dcomplex(zx, zi);
};
int iter;
for (iter = 0; iter < n_iter_max; ++iter) {
for (int i = 0; i < n; ++i) moves[i] = 0.0;
dmin = A;
for (int i = 0; i < n; ++i) {
for (int j = i+1; j < n; ++j) {
auto vect = centers[i] - centers[j];
double dist = std::abs(vect);
if (dist < dmin) dmin = dist;
double x = std::max (0.0, 2*R*acceleration - dist) / 2.0;
double coef = x / (dist + R/10000);
moves[i] += coef * vect;
moves[j] -= coef * vect;
}
}
std::cout << "iteration " << iter << " dmin = " << dmin << "\n";
if (dmin/R >= 2.0 - 1.0e-6) break;
for (int i = 0; i < n; ++i) {
centers[i] += moves[i];
centers[i] = limit (centers[i]);
}
}
dmin = A;
for (int i = 0; i < n; ++i) {
for (int j = i+1; j < n; ++j) {
auto vect = centers[i] - centers[j];
double dist = std::abs(vect);
if (dist < dmin) dmin = dist;
}
}
std::cout << "Final: dmin/R = " << dmin/R << "\n";
check = dmin/R >= 2.0 - 1.0e-6;
return {check, iter, dmin};
}
int main() {
int n = 15; // number of circles
double R = 1.0; // ray of each circle
double density = 0.6; // area of all circles over total area A*A
double A; // side of the square
int n_iter = 1000;
A = sqrt (n*M_PI*R*R/density);
std::cout << "number of circles = " << n << "\n";
std::cout << "density = " << density << "\n";
std::cout << "A = " << A << std::endl;
std::vector<dcomplex> centers (n);
std::srand(std::time(0));
for (int i = 0; i < n; ++i) {
double x = R + (A - 2*R) * (double) std::rand()/RAND_MAX;
double y = R + (A - 2*R) * (double) std::rand()/RAND_MAX;
centers[i] = {x, y};
}
auto [check, n_iter_eff, dmin] = process (A, R, centers, n_iter);
std::cout << "check = " << check << "\n";
std::cout << "Relative min distance = " << std::setprecision (9) << dmin/R << "\n";
std::cout << "nb iterations = " << n_iter_eff << "\n";
print (centers);
return 0;
}
I used Eigen to calculate inner product of two matrix, the first one is A=(BC).eval() and second one is D=(EF).eval(). Here B,C,E,F are the same size (1500 * 1500) but with different values. I find the first one cost about 200 ms while the second one cost about 6000 ms, I have no idea why this happened.
#include <iostream>
#include <time.h>
#include "Eigen/Dense"
int main() {
clock_t start, stop;
Eigen::MatrixXf mat_a(1200, 1500);
Eigen::MatrixXf mat_b(1500, 1500);
Eigen::MatrixXf mat_r(1000, 1300);
int i, j;
float c = 0;
for (i = 0; i < 1200; i++) {
for (j = 0; j < 1500; j++) {
mat_a(i, j) = (float)(c/3 * 1.0e-40);
//if (i % 2 == 0 && j % 2 == 0) mat_a(i, j);
c++;
}
}
//std::cout << mat_a.row(0) << std::endl;
c = 100;
for (i = 0; i < 1500; i++) {
for (j = 0; j < 1500; j++) {
mat_b(i, j) = (float)(c/3 * 0.5e-10);
c++;
}
}
//std::cout << mat_b.row(0) << std::endl;
start = clock();
mat_r = mat_a * mat_b;
stop = clock();
std::cout << stop - start << std::endl;
getchar();
return 0;
}
as show in above example code. I find this is caused by the value of the matrix, when mat_a has value about e-40 and mat_b has value about e-10, this problem occurs stably.
Is there anyone who can explain it?
This is because your matrix contains denormal numbers that are slow to deal with for the CPU. You should make sure that you are using reasonable units so that those can be considered as zeros, and then enable the flush-to-zero (FTZ) and denormals-as-zero flags (DAZ), for instance using the fast-math mode of your compiler or at runtime, see this SO question.
I'm trying to use the repeated squaring algorithm (using recursion) to perform matrix exponentiation. I've included header files from the NEWMAT library instead of using arrays. The original matrix has elements in the range (-5,5), all numbers being of type float.
# include "C:\User\newmat10\newmat.h"
# include "C:\User\newmat10\newmatio.h"
# include "C:\User\newmat10\newmatap.h"
# include <iostream>
# include <time.h>
# include <ctime>
# include <cstdlib>
# include <iomanip>
using namespace std;
Matrix repeated_squaring(Matrix A, int exponent, int n) //Recursive function
{
A(n,n);
IdentityMatrix I(n);
if (exponent == 0) //Matrix raised to zero returns an Identity Matrix
return I;
else
{
if ( exponent%2 == 1 ) // if exponent is odd
return (A * repeated_squaring (A*A, (exponent-1)/2, n));
else //if exponent is even
return (A * repeated_squaring( A*A, exponent/2, n));
}
}
Matrix direct_squaring(Matrix B, int k, int no) //Brute Force Multiplication
{
B(no,no);
Matrix C = B;
for (int i = 1; i <= k; i++)
C = B*C;
return C;
}
//----Creating a matrix with elements b/w (-5,5)----
float unifRandom()
{
int a = -5;
int b = 5;
float temp = (float)((b-a)*( rand()/RAND_MAX) + a);
return temp;
}
Matrix initialize_mat(Matrix H, int ord)
{
H(ord,ord);
for (int y = 1; y <= ord; y++)
for(int z = 1; z<= ord; z++)
H(y,z) = unifRandom();
return(H);
}
//---------------------------------------------------
void main()
{
int exponent, dimension;
cout<<"Insert exponent:"<<endl;
cin>>exponent;
cout<< "Insert dimension:"<<endl;
cin>>dimension;
cout<<"The number of rows/columns in the square matrix is: "<<dimension<<endl;
cout<<"The exponent is: "<<exponent<<endl;
Matrix A(dimension,dimension),B(dimension,dimension);
Matrix C(dimension,dimension),D(dimension,dimension);
B= initialize_mat(A,dimension);
cout<<"Initial Matrix: "<<endl;
cout<<setw(5)<<setprecision(2)<<B<<endl;
//-----------------------------------------------------------------------------
cout<<"Repeated Squaring Result: "<<endl;
clock_t time_before1 = clock();
C = repeated_squaring (B, exponent , dimension);
cout<< setw(5) <<setprecision(2) <<C;
clock_t time_after1 = clock();
float diff1 = ((float) time_after1 - (float) time_before1);
cout << "It took " << diff1/CLOCKS_PER_SEC << " seconds to complete" << endl<<endl;
//---------------------------------------------------------------------------------
cout<<"Direct Squaring Result:"<<endl;
clock_t time_before2 = clock();
D = direct_squaring (B, exponent , dimension);
cout<<setw(5)<<setprecision(2)<<D;
clock_t time_after2 = clock();
float diff2 = ((float) time_after2 - (float) time_before2);
cout << "It took " << diff2/CLOCKS_PER_SEC << " seconds to complete" << endl<<endl;
}
I face the following problems:
The random number generator returns only "-5" as each element in the output.
The Matrix multiplication yield different results with brute force multiplication and using the repeated squaring algorithm.
I'm timing the execution time of my code to compare the times taken by brute force multiplication and by repeated squaring.
Could someone please find out what's wrong with the recursion and with the matrix initialization?
NOTE: While compiling this program, make sure you've imported the NEWMAT library.
Thanks in advance!
rand() returns an int so rand()/RAND_MAX will truncate to an integer = 0. Try your
repeated square algorithm by hand with n = 1, 2 and 3 and you'll find a surplus A *
and a gross inefficiency.
Final Working code has the following improvements:
Matrix repeated_squaring(Matrix A, int exponent, int n) //Recursive function
{
A(n,n);
IdentityMatrix I(n);
if (exponent == 0) //Matrix raised to zero returns an Identity Matrix
return I;
if (exponent == 1)
return A;
{
if (exponent % 2 == 1) // if exponent is odd
return (A*repeated_squaring (A*A, (exponent-1)/2, n));
else //if exponent is even
return (repeated_squaring(A*A, exponent/2, n));
}
}
Matrix direct_squaring(Matrix B, int k, int no) //Brute Force Multiplication
{
B(no,no);
Matrix C(no,no);
C=B;
for (int i = 0; i < k-1; i++)
C = B*C;
return C;
}
//----Creating a matrix with elements b/w (-5,5)----
float unifRandom()
{
int a = -5;
int b = 5;
float temp = (float) ((b-a)*((float) rand()/RAND_MAX) + a);
return temp;
}