I have a time series which essentially amounts to an instrument recording the current time whenever it makes a "detection". The sampling rate is therefore not constant in time; however, we can treat it as such by "re-sampling", relying on the fact that the detections are timestamped reliably, so we can simply insert 0's to "fill in" the gaps. This will be important later...
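For concreteness, the re-sampling I have in mind is just binning (a rough sketch; the function name and the bin width dt are arbitrary):
#include <cmath>
#include <vector>
// Turn sorted, non-negative detection timestamps into a regular 0/1 series:
// bin k covers [k*dt, (k+1)*dt) and is 1 if it contains at least one detection.
std::vector<int> Resample(const std::vector<double> &timestamps, const double dt) {
  if (timestamps.empty()) return {};
  const std::size_t bins =
      static_cast<std::size_t>(std::floor(timestamps.back() / dt)) + 1;
  std::vector<int> series(bins, 0);
  for (const double t : timestamps) {
    series[static_cast<std::size_t>(std::floor(t / dt))] = 1;
  }
  return series;
}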
The instrument should detect the "signals" sent by another, nearby instrument. This second instrument emits a signal at some unknown period, T (e.g. 1 signal per second), with a "jitter" likely on the order of a few tenths of a percent of the period.
My goal is to determine this period (or frequency, if you like) using only the timestamps recorded by the "detecting" instrument. Unfortunately, the detector is flooded with noise, and a significant fraction (I estimate 97-98%) of the "detections" (and therefore "points" in the time series) are due to noise. Extracting the period will therefore require more careful analysis.
My first thought was to simply feed the time series into an FFT algorithm (I'm using FFTW/DHT); however, this wasn't particularly enlightening. I've also tried my own (admittedly rather crude) algorithm, which simply computed a cross-correlation of the series with "clean" series of increasing period. I didn't get very far with this either, and there are quite a few details to consider (phase, etc.).
It occurs to me that something like this must've been done before, and surely there's a "nice" way to accomplish it.
Here's my approach. Given a candidate period, we can score it using a dynamic program: find the subsequence of detection times that includes the first and last detection and maximizes the sum of gap log-likelihoods, where the gap log-likelihood is defined as minus the square of the difference between the gap and the period (a Gaussian jitter model).
If we have approximately the right period, then we can get a very good gap sequence (some weirdness at the beginning and end and wherever there is a missed detection, but this is OK).
If we have the wrong period, then we end up with basically exponential jitter, which has low log-likelihood.
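In symbols (this is just the recurrence the code below implements): with detection times t_1 < t_2 < ... < t_n and a candidate period T,
best(1) = 0
best(i) = min over j < i of [ best(j) + (t_i - t_j - T)^2 ]
and the reported score for T is best(n) after normalization.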
The C++ below generates fake detection times with a planted period and then searches over candidate periods. Scores are normalized by a (crude) estimate of the score for Poisson noise, so wrong periods score about 0.4, while the planted period shows up as a sharp dip in the score-versus-period output.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
namespace {
static constexpr double kFalseNegativeRate = 0.01;
static constexpr double kCoefficientOfVariation = 0.003;
static constexpr int kSignals = 6000;
static constexpr int kNoiseToSignalRatio = 50;
template <class URNG>
std::vector<double> FakeTimes(URNG &g, const double period) {
std::vector<double> times;
std::bernoulli_distribution false_negative(kFalseNegativeRate);
std::uniform_real_distribution<double> phase(0, period);
double signal = phase(g);
std::normal_distribution<double> interval(period,
kCoefficientOfVariation * period);
std::uniform_real_distribution<double> noise(0, kSignals * period);
for (int i = 0; i < kSignals; i++) {
if (!false_negative(g)) {
times.push_back(signal);
}
signal += interval(g);
for (double j = 0; j < kNoiseToSignalRatio; j++) {
times.push_back(noise(g));
}
}
std::sort(times.begin(), times.end());
return times;
}
constexpr double Square(const double x) { return x * x; }
struct Subsequence {
double score;
int previous;
};
struct Result {
double score = std::numeric_limits<double>::quiet_NaN();
double median_interval = std::numeric_limits<double>::quiet_NaN();
};
Result Score(const std::vector<double> &times, const double period) {
if (times.empty() || !std::is_sorted(times.begin(), times.end())) {
return {};
}
std::vector<Subsequence> bests;
bests.reserve(times.size());
bests.push_back({0, -1});
for (int i = 1; i < times.size(); i++) {
Subsequence best = {std::numeric_limits<double>::infinity(), -1};
for (int j = i - 1; j > -1; j--) {
const double difference = times[i] - times[j];
const double penalty = Square(difference - period);
if (difference >= period && penalty >= best.score) {
break;
}
const Subsequence candidate = {bests[j].score + penalty, j};
if (candidate.score < best.score) {
best = candidate;
}
}
bests.push_back(best);
}
std::vector<double> intervals;
int i = bests.size() - 1;
while (true) {
int previous_i = bests[i].previous;
if (previous_i < 0) {
break;
}
intervals.push_back(times[i] - times[previous_i]);
i = previous_i;
}
if (intervals.empty()) {
return {};
}
const double duration = times.back() - times.front();
// The rate is doubled because we can look for a time in either direction.
const double rate = 2 * (times.size() - 1) / duration;
// Mean of the square of an exponential distribution with the given rate.
const double mean_square = 2 / Square(rate);
const double score = bests.back().score / (intervals.size() * mean_square);
const auto median_interval = intervals.begin() + intervals.size() / 2;
std::nth_element(intervals.begin(), median_interval, intervals.end());
return {score, *median_interval};
}
} // namespace
int main() {
std::default_random_engine g;
const auto times = FakeTimes(g, std::sqrt(2));
for (int i = 0; i < 2000; i++) {
const double period = std::pow(1.001, i) / 3;
const Result result = Score(times, period);
std::cout << period << ' ' << result.score << ' ' << result.median_interval
<< std::endl;
}
}
Related
My C program is running too slowly (right now it takes around 40 seconds without parallelization). I have tried using OpenMP, which brought the timing down significantly, but I am looking for simple and natural ways to make my code run faster other than using parallel for loops. The basic structure of the code is that it takes some command line arguments as inputs and saves those inputs as variables. Then it recursively computes a variable called Rplus1 using the math.h and complex.h libraries. The part of the code where it spends most of its time is at the bottom, in the nested for loops. My goal is to get the whole code running in under 5 seconds, but as of now it runs in about 40 seconds without parallel for loops. Please help!
#include "time.h"
#include "stdio.h"
#include "stdlib.h"
#include "complex.h"
#include "math.h"
#include "string.h"
#include "unistd.h"
#include "omp.h"
#define PI 3.14159265
int main (int argc, char *argv[]){
if(argc >= 8){
double start1 = omp_get_wtime();
// command line arguments are aligned in the following order: [theta] [number of layers in superlattice] [material_1] [lat const_1] [number of unit cells_1] [material_2] [lat const_2] [number of unit cells_2] .... [material_N] [lat const_N] [number of unit cells_N] [Log/Linear] [number of repeating superlattice layers] [yes/no]
int N;
sscanf(argv[2],"%d",&N); // Number of layers in superlattice specified by second input argument
if(strcmp(argv[argc-1],"yes") == 0) //If the substrate is included then add one more layer to the N variable
{
N = N+1;
}
int total;
sscanf(argv[argc-2],"%d",&total); // Number of repeating superlattice layers specified by second to last argument
double layers[N][6], horizangle[1001], vertangle[1001];
double complex (*F_hkl)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)), (*F_0)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)), (*g)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)), (*g_0)[1001][1001] = malloc(N*1001*1001*sizeof(complex double)),SF_table[10];// this array will hold the unit cell structure factors for all of the materials selected for each wavevector in the beam spectrum
double real, real2, lam, c_light = 299792458, h_pl = 4.135667516e-15,E = 10e3, r_0 = 2.818e-15, Lccd = 1.013;// just a few variables to hold values through calculations and constants, speed of light, plancks const, photon energy, and detector distance from sample
double angle;
double complex z;// just a variable to hold complex numbers throughout calculations
int i,j,m,n,t; // integers to index through arrays
lam = (h_pl*c_light)/E;
sscanf(argv[1],"%lf",&angle); //first argument is the angle of incidence, read it
angle = angle*(PI/180.0);
double angle2 = -angle;
double (*table)[10] = malloc(10*9*sizeof(double)); // this array holds all the coefficients to calculate the atomic scattering factor below
double (*table2)[10] = malloc(10*2*sizeof(double));
FILE*datfile1 = fopen("/home/vhosts/xraydev.engr.wisc.edu/data/coef_table.bin","rb"); // read the binary file containg all the coefficients
fread(table,sizeof(double),90,datfile1);
fclose(datfile1);
FILE*datfile2 = fopen("/home/vhosts/xraydev.engr.wisc.edu/data/dispersioncs.bin","rb");
fread(table2,sizeof(double),20,datfile2);
fclose(datfile2);
// Calculate scattering factors for all elements
double a,b;
double k_z = (sin(angle)/lam)*1e-10; // incorporate angular dependence of SF but neglect 0.24 degree divergence because of approximation
for(i = 0;i<10;i++) // for each element...
{
SF_table[i] = 0;
for(j = 0;j<4;j++) // summation
{
a = table[2*j][i];
b = table[2*j+1][i];
SF_table[i] = SF_table[i] + a * exp(-b*k_z*k_z);
}
SF_table[i] = SF_table[i] + table[8][i] + table2[0][i] + table2[1][i]*I;
}
free(table);
double mm = 4.0, (*phi)[1001][1001] = malloc(N*1001*1001*sizeof(double));
for(i = 1; i < N+1; i++) // for each layer of material...
{
sscanf(argv[i*3+1],"%lf",&layers[i-1][1]); // get out of plane lattice constant
sscanf(argv[i*3+2],"%lf",&layers[i-1][2]); // get the number of unit cells in the layer
layers[i-1][1] = layers[i-1][1]*1e-10; // convert lat const input to meters
// Define reciprocal space positions at the incident angle h, k, l
layers[i-1][3] = 0; // h
layers[i-1][4] = 0; // k
double l; // l calculated for each wavevector in the spectrum because l changes with angle of incidence
for (m = 0; m < 1001; m++)
{
for (n = 0; n <1001; n++)
{
l = 4;
phi[i-1][m][n] = 2*PI*layers[i-1][1]*sin(angle)/lam; // Caculate phi for each layer
if(strcmp(argv[i*3],"GaAs") == 0)
{
F_hkl[i-1][m][n] = (2+2*cexp(I*PI*l))*(SF_table[2]+SF_table[3]*cexp(I*PI*l/2));
F_0[i-1][m][n] = 0.5*8.0*(31 + table2[0][2] + table2[1][2]*I) + 0.5*8.0*(33 + table2[0][3] + table2[1][3]*I);
g[i-1][m][n] = 2*r_0*F_hkl[i-1][m][n]/mm/layers[i-1][1]*cos(2*angle);
g_0[i-1][m][n] = 2*r_0*F_0[i-1][m][n]/mm/layers[i-1][1];
}
if(strcmp(argv[i*3],"AlGaAs") == 0)
{
F_hkl[i-1][m][n] = (2+2*cexp(I*PI*l))*((0.76*SF_table[2]+ 0.24*SF_table[4])+SF_table[3]*cexp(I*PI*l/2));
F_0[i-1][m][n] = 0.24*4.0*(13 + table2[0][4] + table2[1][4]*I) + 0.76*4.0*(31 + table2[0][2] + table2[1][2]*I) + 4.0*(33 + table2[0][3] + table2[1][3]*I);
g[i-1][m][n] = 2*r_0*F_hkl[i-1][m][n]/mm/layers[i-1][1]*cos(2*angle);
g_0[i-1][m][n] = 2*r_0*F_0[i-1][m][n]/mm/layers[i-1][1];
}
}
}
}
double complex (*Rplus1)[1001] = malloc(1001*1001*sizeof(double complex));
for (m = 0; m < 1001; m++)
{
for (n = 0; n <1001; n++)
{
Rplus1[m][n] = 0.0;
}
}
double stop1 = omp_get_wtime();
for(i=1;i<N;i++) // For each layer of the film
{
for(j=0;j<layers[i][2];j++) // For each unit cell
{
for (m = 0; m < 1001; m++) // For each row of the diffraction pattern
{
for (n = 0; n <1001; n++) // For each column of the diffraction pattern
{
Rplus1[m][n] = -I*g[i][m][n] + ((1-I*g_0[i][m][n])*(1-I*g_0[i][m][n]))/(I*g[i][m][n] + (cos(-2*phi[i][m][n])+I*sin(-2*phi[i][m][n]))/Rplus1[m][n]);
}
}
}
}
double stop2 = omp_get_wtime();
double elapsed1 = (double)(stop1 - start1);
double elapsed2 = (double)(stop2 - start1);
printf("main() through before diffraction function took %f seconds to run\n\n",elapsed1);
printf("main() through after diffraction function took %f seconds to run\n\n",elapsed2);
}
}
Summary:
Any ideas about how to further improve upon the basic scatter operation in CUDA? Especially if one knows it will only be used to compact a larger array into a smaller one? Or why the methods below of vectorizing memory ops and using shared memory didn't work? I feel like there may be something fundamental I am missing, and any help would be appreciated.
EDIT 03/09/15: So I found this Parallel For All Blog post "Optimized Filtering with Warp-Aggregated Atomics". I had assumed atomics would be intrinsically slower for this purpose, however I was wrong - especially since I don't think I care about maintaining element order in the array during my simulation. I'll have to think about it some more and then implement it to see what happens!
EDIT 01/04/16: I realized I never wrote about my results. Unfortunately, in that Parallel for All blog post they compared the global-atomic method for compaction to the Thrust prefix-sum compaction, which is actually quite slow. CUB's Device::IF is much faster than Thrust's - as is the prefix-sum version I wrote using CUB's Device::Scan + custom code. The warp-aggregated global-atomic method is still faster, by about 5-10%, but nowhere near the 3-4x speedup I had been hoping for based on the results in the blog. I'm still using the prefix-sum method: while maintaining element order is not necessary, I prefer the consistency of the prefix-sum results, and the advantage from the atomics is not very big. I still try various methods to improve the compaction, but so far I've only gotten marginal improvements (2% at best) for dramatically increased code complexity.
Details:
I am writing a simulation in CUDA where I compact out elements I am no longer interested in simulating every 40-60 time steps. From profiling, it seems that the scatter op takes the most time when compacting - more so than the filter kernel or the prefix sum. Right now I use a pretty basic scatter function:
__global__ void scatter_arrays(float * new_freq, const float * const freq, const int * const flag, const int * const scan_Index, const int freq_Index){
int myID = blockIdx.x*blockDim.x + threadIdx.x;
for(int id = myID; id < freq_Index; id+= blockDim.x*gridDim.x){
if(flag[id]){
new_freq[scan_Index[id]] = freq[id];
}
}
}
freq_Index is the number of elements in the old array. The flag array is the result from the filter. scan_Index is the result of the prefix sum over the flag array.
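For reference, the scatter is just the last step of the usual flag + scan compaction; a plain CPU version of the whole pipeline, to fix the semantics (a sketch, assuming scan_Index is an exclusive scan - an inclusive scan would simply shift the index by one):
#include <vector>
// flag[i] is 1 if freq[i] survives the filter, 0 otherwise; scan_Index[i] is the
// output slot of element i if it is kept (exclusive prefix sum of flag).
// freq and flag are assumed to have the same length.
std::vector<float> compact_cpu(const std::vector<float>& freq,
                               const std::vector<int>& flag) {
  std::vector<int> scan_Index(flag.size(), 0);
  for (std::size_t i = 1; i < flag.size(); ++i)
    scan_Index[i] = scan_Index[i - 1] + flag[i - 1];   // exclusive scan
  const std::size_t kept = flag.empty() ? 0 : scan_Index.back() + flag.back();
  std::vector<float> new_freq(kept);
  for (std::size_t i = 0; i < freq.size(); ++i)
    if (flag[i]) new_freq[scan_Index[i]] = freq[i];    // the scatter step
  return new_freq;
}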
Attempts I've made to improve it are to read the flagged frequencies into shared memory first and then write from shared memory to global memory - the idea being that the writes to global memory would be more coalesced amongst the warps (e.g. instead of thread 0 writing to position 0 and thread 128 writing to position 1, thread 0 would write to 0 and thread 1 would write to 1). I also tried vectorizing the reads and the writes - instead of reading and writing floats/ints I read/wrote float4/int4 from the global arrays when possible, so four numbers at a time. This I thought might speed up the scatter by having fewer memory ops transferring larger amounts of memory. The "kitchen sink" code with both vectorized memory loads/stores and shared memory is below:
const int compact_threads = 256;
__global__ void scatter_arrays2(float * new_freq, const float * const freq, const int * const flag, const int * const scan_Index, const int freq_Index){
int gID = blockIdx.x*blockDim.x + threadIdx.x; //global ID
int tID = threadIdx.x; //thread ID within block
__shared__ float row[4*compact_threads];
__shared__ int start_index[1];
__shared__ int end_index[1];
float4 myResult;
int st_index;
int4 myFlag;
int4 index;
for(int id = gID; id < freq_Index/4; id+= blockDim.x*gridDim.x){
if(tID == 0){
index = reinterpret_cast<const int4*>(scan_Index)[id];
myFlag = reinterpret_cast<const int4*>(flag)[id];
start_index[0] = index.x;
st_index = index.x;
myResult = reinterpret_cast<const float4*>(freq)[id];
if(myFlag.x){ row[0] = myResult.x; }
if(myFlag.y){ row[index.y-st_index] = myResult.y; }
if(myFlag.z){ row[index.z-st_index] = myResult.z; }
if(myFlag.w){ row[index.w-st_index] = myResult.w; }
}
__syncthreads();
if(tID > 0){
myFlag = reinterpret_cast<const int4*>(flag)[id];
st_index = start_index[0];
index = reinterpret_cast<const int4*>(scan_Index)[id];
myResult = reinterpret_cast<const float4*>(freq)[id];
if(myFlag.x){ row[index.x-st_index] = myResult.x; }
if(myFlag.y){ row[index.y-st_index] = myResult.y; }
if(myFlag.z){ row[index.z-st_index] = myResult.z; }
if(myFlag.w){ row[index.w-st_index] = myResult.w; }
if(tID == blockDim.x -1 || gID == freq_Index/4 - 1){ end_index[0] = index.w + myFlag.w; }
}
__syncthreads();
int count = end_index[0] - st_index;
int rem = st_index & 0x3; //equivalent to modulo 4
int offset = 0;
if(rem){ offset = 4 - rem; }
if(tID < offset && tID < count){
new_freq[st_index+tID] = row[tID];
}
int tempID = 4*tID+offset;
if((tempID+3) < count){
reinterpret_cast<float4*>(new_freq + st_index + offset)[tID] = make_float4(row[tempID],row[tempID+1],row[tempID+2],row[tempID+3]);
}
tempID = tID + offset + (count-offset)/4*4;
if(tempID < count){ new_freq[st_index+tempID] = row[tempID]; }
}
int id = gID + freq_Index/4 * 4;
if(id < freq_Index){
if(flag[id]){
new_freq[scan_Index[id]] = freq[id];
}
}
}
Obviously it gets a bit more complicated. :) While the above kernel seems stable when there are hundreds of thousands of elements in the array, I've noticed a race condition when the array runs into the tens of millions of elements. I'm still trying to track the bug down.
But regardless, neither method (shared memory or vectorization), together or alone, improved performance. I was especially surprised by the lack of benefit from vectorizing the memory ops. It had helped in other functions I had written, though now I am wondering if it helped there because it increased instruction-level parallelism in the calculation steps of those functions rather than because of the fewer memory ops.
I found that the algorithm mentioned in this poster (a similar algorithm is also discussed in this paper) works pretty well, especially for compacting large arrays. It uses less memory and is slightly faster than my previous method (5-10%). I made a few tweaks to the poster's algorithm: 1) eliminating the final warp-shuffle reduction in phase 1, since the elements can simply be summed as they are calculated; 2) giving the function the ability to work on arrays that are not a multiple of 1024 in size, and adding grid-strided loops; and 3) allowing each thread to load its registers simultaneously in phase 3 instead of one at a time. I also use CUB instead of Thrust for the inclusive sum, for faster scans. There may be more tweaks I can make, but for now this is good.
//kernel phase 1
int myID = blockIdx.x*blockDim.x + threadIdx.x;
//padded_length is nearest multiple of 1024 > true_length
for(int id = myID; id < (padded_length >> 5); id+= blockDim.x*gridDim.x){
int lnID = threadIdx.x % warp_size;
int warpID = id >> 5;
unsigned int mask;
unsigned int cnt=0;//;//
for(int j = 0; j < 32; j++){
int index = (warpID<<10)+(j<<5)+lnID;
bool pred;
if(index >= true_length) pred = false; // valid input indices are 0 .. true_length-1
else pred = predicate(input[index]);
mask = __ballot(pred);
if(lnID == 0) {
flag[(warpID<<5)+j] = mask;
cnt += __popc(mask);
}
}
if(lnID == 0) counter[warpID] = cnt; //store sum
}
//kernel phase 2 -> CUB Inclusive sum transforms counter array to scan_Index array
//kernel phase 3
int myID = blockIdx.x*blockDim.x + threadIdx.x;
for(int id = myID; id < (padded_length >> 5); id+= blockDim.x*gridDim.x){
int lnID = threadIdx.x % warp_size;
int warpID = id >> 5;
unsigned int predmask;
unsigned int cnt;
predmask = flag[(warpID<<5)+lnID];
cnt = __popc(predmask);
//parallel prefix sum
#pragma unroll
for(int offset = 1; offset < 32; offset<<=1){
unsigned int n = __shfl_up(cnt, offset);
if(lnID >= offset) cnt += n;
}
unsigned int global_index = 0;
if(warpID > 0) global_index = scan_Index[warpID - 1];
for(int i = 0; i < 32; i++){
unsigned int mask = __shfl(predmask, i); //broadcast from thread i
unsigned int sub_group_index = 0;
if(i > 0) sub_group_index = __shfl(cnt, i-1);
if(mask & (1 << lnID)){
compacted_array[global_index + sub_group_index + __popc(mask & ((1 << lnID) - 1))] = input[(warpID<<10)+(i<<5)+lnID];
}
}
}
}
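For completeness, the phase-2 scan is the standard two-call CUB pattern on the per-block counts (a sketch; error checking omitted, and num_warp_blocks is just my shorthand for the length of the counter array):
#include <cub/cub.cuh>
// kernel phase 2, host side: inclusive prefix sum of the per-block counts, so
// that scan_Index[w] is the number of surviving elements in blocks 0..w.
const int num_warp_blocks = padded_length >> 10; // one counter entry per 1024 inputs
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
// first call only queries the required temporary storage size
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes,
                              counter, scan_Index, num_warp_blocks);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// second call performs the scan
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes,
                              counter, scan_Index, num_warp_blocks);
cudaFree(d_temp_storage);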
EDIT: There is a newer article by a subset of the poster's authors in which they examine a faster variation of compact than what is written above. However, their new version is not order-preserving, so it is not useful for me, and I haven't implemented it to test it out. That said, if your project doesn't rely on object order, their newer compact version can probably speed up your algorithm.
I'm attempting to implement the variant of parallel radix sort described in http://arxiv.org/pdf/1008.2849v2.pdf (Algorithm 2), but my C++ implementation (for 4 digits in base 10) contains a bug that I'm unable to locate.
For debugging purposes I'm using no parallelism, but the code should still sort correctly.
For instance, the line arr.at(i) = item accesses indices outside the vector's bounds for the following input:
std::vector<int> v = {4612, 4598};
radix_sort2(v);
My implementation is as follows
#include <set>
#include <array>
#include <vector>
void radix_sort2(std::vector<int>& arr) {
std::array<std::set<int>, 10> buckets3;
for (const int item : arr) {
int d = item / 1000;
buckets3.at(d).insert(item);
}
//Prefix sum
std::array<int, 10> outputIndices;
outputIndices.at(0) = 0;
for (int i = 1; i < 10; ++i) {
outputIndices.at(i) = outputIndices.at(i - 1) +
buckets3.at(i - 1).size();
}
for (const auto& bucket3 : buckets3) {
std::array<std::set<int>, 10> buckets0, buckets1;
std::array<int, 10> histogram2 = {};
for (const int item : bucket3) {
int d = item % 10;
buckets0.at(d).insert(item);
}
for (const auto& bucket0 : buckets0) {
for (const int item : bucket0) {
int d = (item / 10) % 10;
buckets1.at(d).insert(item);
int d2 = (item / 100) % 10;
++histogram2.at(d2);
}
}
for (const auto& bucket1 : buckets1) {
for (const int item : bucket1) {
int d = (item / 100) % 10;
int i = outputIndices.at(d) + histogram2.at(d);
++histogram2.at(d);
arr.at(i) = item;
}
}
}
}
Can anyone spot my mistake?
I took a look at the paper you linked. You haven't made any mistakes, none that I can see. In fact, in my estimation, you corrected a mistake in the algorithm.
I wrote out the algorithm and ended up with the exact same problem you did. After reviewing Algorithm 2, either I woefully misunderstand how it is supposed to work, or it is flawed. There are at least a couple of problems with the algorithm, specifically revolving around outputIndices and histogram2.
Looking at the algorithm, the final index of an item is determined by the counting sort stored in outputIndices (let's ignore the histogram for now).
If you had an initial array of numbers {0100, 0103, 0102, 0101}, all four land in the same most-significant-digit bucket, so the count for that bucket - and hence its prefix sum - is 4.
The algorithm gives no indication that I can determine to lag the result by 1. That being said, in order for the algorithm to work the way they intend, it does have to be lagged, so, moving on.
Now the prefix sums are 0, 4, 4, ... . The algorithm doesn't use the MSD as the index into the outputIndices array, it uses "MSD - 1"; so taking 1 as the index into the array, the starting index for the first item (without the histogram) is 4! Outside the array on the first try.
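To make that concrete, here is just the counting-sort bookkeeping for that example (a small standalone snippet, names mirroring the code above):
#include <array>
#include <cstdio>
#include <vector>
int main() {
  std::vector<int> input = {100, 103, 102, 101};   // i.e. 0100, 0103, 0102, 0101
  std::array<int, 10> counts = {};                 // bucket sizes keyed by MSD
  for (const int item : input) ++counts.at(item / 1000);  // all four land in bucket 0
  std::array<int, 10> outputIndices = {};          // "lagged" (exclusive) prefix sum
  for (int d = 1; d < 10; ++d)
    outputIndices.at(d) = outputIndices.at(d - 1) + counts.at(d - 1);
  // outputIndices is 0, 4, 4, ...: with the lag, bucket 0 starts writing at 0.
  // Without the lag (an inclusive sum), the relevant entry is already 4, which
  // is one past the end of the 4-element array.
  for (const int v : outputIndices) std::printf("%d ", v);
  std::printf("\n");
}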
outputIndices is built with the MSD, so it makes sense for it to be accessed by the MSD.
Further, even if you tweak the algorithm to correctly use the MSD as the index into outputIndices, it still won't sort correctly. With your initial inputs (swapped) {4598, 4612}, they will stay in that order; they are sorted (locally) as if they were 2-digit numbers. If you add other numbers that don't start with 4, those will be globally sorted, but the local sort is never finished.
According to the paper, the goal is to use the histogram to do that, but I don't see that happening.
Ultimately, I'm assuming what you want is an algorithm that works the way described. I've modified the algorithm, keeping with the paper's overall stated goal of using the MSD for a global sort and the remaining digits for a reverse-LSD local sort.
I don't think these changes should have any impact on your desire to parallelize the function.
void radix_sort2(std::vector<int>& arr)
{
std::array<std::vector<int>, 10> buckets3;
for (const int item : arr)
{
int d = item / 1000;
buckets3.at(d).push_back(item);
}
//Prefix sum
std::array<int, 10> outputIndices;
outputIndices.at(0) = 0;
for (int i = 1; i < 10; ++i)
{
outputIndices.at(i) = outputIndices.at(i - 1) + buckets3.at(i - 1).size();
}
for (const auto& bucket3 : buckets3)
{
if (bucket3.size() <= 0)
continue;
std::array<std::vector<int>, 10> buckets0, buckets1, buckets2;
for (const int item : bucket3)
buckets0.at(item % 10).push_back(item);
for (const auto& bucket0 : buckets0)
for (const int item : bucket0)
buckets1.at((item / 10) % 10).push_back(item);
for (const auto& bucket1 : buckets1)
for (const int item : bucket1)
buckets2.at((item / 100) % 10).push_back(item);
int count = 0;
for (const auto& bucket2 : buckets2)
{
for (const int item : bucket2)
{
int d = (item / 1000) % 10;
int i = outputIndices.at(d) + count;
++count;
arr.at(i) = item;
}
}
}
}
For extensibility, it would probably make sense to create a helper function that does the local sorting; you should be able to extend it to handle numbers with any number of digits that way.
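For instance, something along these lines would factor it out (a sketch; the function name is mine):
#include <array>
#include <vector>
// Sorts one MSD bucket by its remaining lower digits, least significant digit
// first (the reverse-LSD part); the caller still places whole buckets by MSD.
void local_lsd_sort(std::vector<int>& bucket, int digits) {
  int divisor = 1;
  for (int pass = 0; pass < digits; ++pass, divisor *= 10) {
    std::array<std::vector<int>, 10> tmp;
    for (const int item : bucket)
      tmp.at((item / divisor) % 10).push_back(item);
    bucket.clear();
    for (const auto& t : tmp)
      bucket.insert(bucket.end(), t.begin(), t.end());
  }
}
The loop over buckets3 would then reduce to calling local_lsd_sort on (a copy of) each bucket with digits = 3 and copying the result into arr starting at its outputIndices slot.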
I have been reading and researching algorithms and formulas to work out a score for my user-submitted content, so that currently hot / trending items are displayed higher up the list. However, I'll admit I'm a little over my head here.
I'll give some background on what I'm after... users upload audio to my site, and audios have several actions:
Played
Downloaded
Liked
Favorited
Ideally I want an algorithm where I can update an audio's score each time a new activity is logged (played, downloaded, etc.). Also, a download action is worth more than a play, a like more than a download, and a favourite more than a like.
If possible I would like audios older than 1 week to drop off quite sharply from the list, to give newer content more of a chance of trending.
I have read about Reddit's algorithm, which looked good, but I'm in over my head on how to tweak it to make use of my multiple variables and to drop off older items after around 7 days.
Some articles that were interesting:
https://medium.com/hacking-and-gonzo/how-reddit-ranking-algorithms-work-ef111e33d0d9 (reddits algo)
http://www.evanmiller.org/rank-hotness-with-newtons-law-of-cooling.html
Any help is appreciated!
Paul
Reddit's old formula and a little drop-off
Basically you can use Reddit's formula. Since your system only supports upvote-type actions, you could weight them, resulting in something like this:
def hotness(track)
s = track.playedCount
s = s + 2*track.downloadCount
s = s + 3*track.likeCount
s = s + 4*track.favCount
baseScore = log(max(s,1))
timeDiff = (now - track.uploaded).toWeeks
if(timeDiff > 1)
x = timeDiff - 1
baseScore = baseScore * exp(-8*x*x)
return baseScore
The factor exp(-8*x*x) will give you the desired sharp drop-off after the first week.
The basics behind it
You can use any function that goes to zero faster than your score grows. Since we take the log of the score, even a linear decay would do (as long as the raw score doesn't grow exponentially).
So all you need is a function that returns 1 as long as you don't want to modify the score, and drops off afterwards. Our example above forms that function:
multiplier(x) = x > 1 ? exp(-8*(x-1)*(x-1)) : 1
You can vary the multiplier if you want less steep curves.
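For example (a tiny sketch; k is the knob, and k = 8 reproduces the curve above):
#include <cmath>
// 1.0 for the first week, then a sharp drop-off whose steepness is set by k
// (smaller k gives a gentler curve).
double multiplier(double ageInWeeks, double k = 8.0) {
  if (ageInWeeks <= 1.0) return 1.0;
  const double x = ageInWeeks - 1.0;
  return std::exp(-k * x * x);
}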
Example in C++
Let's say that the probability for a given track to be played in a given hour is 50%, downloaded 10%, liked 1% and favourited 0.1%. Then the following C++ program will give you an estimate of your score's behavior:
#include <iostream>
#include <fstream>
#include <random>
#include <ctime>
#include <cmath>
struct track{
track() : uploadTime(0),playCount(0),downCount(0),likeCount(0),faveCount(0){}
std::time_t uploadTime;
unsigned int playCount;
unsigned int downCount;
unsigned int likeCount;
unsigned int faveCount;
void addPlay(unsigned int n = 1){ playCount += n;}
void addDown(unsigned int n = 1){ downCount += n;}
void addLike(unsigned int n = 1){ likeCount += n;}
void addFave(unsigned int n = 1){ faveCount += n;}
unsigned int baseScore(){
return playCount +
2 * downCount +
3 * likeCount +
4 * faveCount;
}
};
int main(){
track test;
const unsigned int dayLength = 24 * 3600;
const unsigned int weekLength = dayLength * 7;
std::mt19937 gen(std::time(0));
std::bernoulli_distribution playProb(0.5);
std::bernoulli_distribution downProb(0.1);
std::bernoulli_distribution likeProb(0.01);
std::bernoulli_distribution faveProb(0.001);
std::ofstream fakeRecord("fakeRecord.dat");
std::ofstream fakeRecordDecay("fakeRecordDecay.dat");
for(unsigned int i = 0; i < weekLength * 3; i += 3600){
test.addPlay(playProb(gen));
test.addDown(downProb(gen));
test.addLike(likeProb(gen));
test.addFave(faveProb(gen));
double baseScore = std::log(std::max<unsigned int>(1,test.baseScore()));
double timePoint = static_cast<double>(i)/weekLength;
fakeRecord << timePoint << " " << baseScore << std::endl;
if(timePoint > 1){
double x = timePoint - 1;
fakeRecordDecay << timePoint << " " << (baseScore * std::exp(-8*x*x)) << std::endl;
}
else
fakeRecordDecay << timePoint << " " << baseScore << std::endl;
}
return 0;
}
Result: a plot of fakeRecord.dat and fakeRecordDecay.dat (the base score with and without the decay factor).
This should be sufficient for you.
I have implemented an algorithm for floating-point decimal to rational fraction approximation (example: 0.333 -> 1/3), and now I wonder: is there a way to find an irrational number which satisfies the condition? For example, given the input 0.282842712474 I want the result to be sqrt(2)/5 and not 431827/1526739, which my algorithm produces. The only condition is that the first digits of the result (converted back to floating point) should be the digits of the input; the rest doesn't matter. Thanks in advance!
I came up with a solution that, given a set of possible denominators and numerators, finds the best approximation of the given number.
For example this set can contain all numbers that can be created by:
1 <= radicand <= 100000
1 <= root_index <= 20
If the set has N elements, then this solution finds the best approximation in O(N log N).
In this solution X represents the denominator and Y the numerator.
sort numbers from set
for each number X from set:
using binary search, find the smallest Y such that Y/X >= input_number
compare Y/X with currently best approximation of input_number
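A sketch of that binary-search step (using std::lower_bound on the sorted values; the implementation below actually sweeps instead, which is faster here):
#include <algorithm>
#include <vector>
// values: sorted candidate values; x: the current denominator's value.
// Returns the index of the smallest Y in values with Y / x >= input_number,
// or values.size() if there is none (uses Y/x >= input  <=>  Y >= input*x for x > 0).
std::size_t smallest_numerator(const std::vector<double>& values,
                               double x, double input_number) {
  return static_cast<std::size_t>(
      std::lower_bound(values.begin(), values.end(), input_number * x) -
      values.begin());
}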
I couldn't resist and I implemented it:
#include <cstdio>
#include <vector>
#include <algorithm>
#include <cmath>
using namespace std;
struct Number {
// number value
double value;
// number representation
int root_index;
int radicand;
Number(){}
Number(double value, int root_index, int radicand)
: value(value), root_index(root_index), radicand(radicand) {}
bool operator < (const Number& rhs) const {
// in case of equal numbers, i want smaller radicand first
if (fabs(value - rhs.value) < 1e-12) return radicand < rhs.radicand;
return value < rhs.value;
}
void print() const {
if (value - (int)value < 1e-12) printf("%.0f", value);
else printf("sqrt_%d(%d)",root_index, radicand);
}
};
std::vector<Number> numbers;
double best_result = 1e100;
Number best_numerator;
Number best_denominator;
double input;
void compare_approximpation(const Number& numerator, const Number& denominator) {
double value = numerator.value / denominator.value;
if (fabs(value - input) < fabs(best_result - input)) {
best_result = value;
best_numerator = numerator;
best_denominator = denominator;
}
}
int main() {
const int NUMBER_LIMIT = 100000;
const int ROOT_LIMIT = 20;
// only numbers created by this loops will be used
// as numerator and denominator
for(int i=1; i<=ROOT_LIMIT; i++) {
for(int j=1; j<=NUMBER_LIMIT; j++) {
double value = pow(j, 1.0 /i);
numbers.push_back(Number(value, i, j));
}
}
sort(numbers.begin(), numbers.end());
scanf("%lf",&input);
int numerator_index = 0;
for(int denominator_index=0; denominator_index<numbers.size(); denominator_index++) {
// you were interested only in integral denominators
if (numbers[denominator_index].root_index == 1) {
// i use simple sweeping technique instead of binary search (its faster)
while(numerator_index < numbers.size() && numbers[numerator_index].root_index &&
numbers[numerator_index].value / numbers[denominator_index].value <= input) {
numerator_index++;
}
// comparing approximations (guard in case the sweep ran past the last number)
if (numerator_index < (int)numbers.size()) {
compare_approximpation(numbers[numerator_index], numbers[denominator_index]);
}
if (numerator_index > 0) {
compare_approximpation(numbers[numerator_index - 1], numbers[denominator_index]);
}
}
}
printf("Best approximation %.12lf = ", best_numerator.value / best_denominator.value);
best_numerator.print();
printf(" / ");
best_denominator.print();
printf("\n");
}