OpenCL lost value of variables because of for-loop

OpenCL lost value of variables because of for-loop - for-loop

I've the following opencl-code:
__constant SomeConstants[] = { 5, 7, 242, 74 };
// Placeholder from the post: returns a long derived from `arr` starting at `offset`.
// The actual computation is elided ("<CalculateLong>") in the question.
long aslong(__global int * arr, int offset) {
return ... <CalculateLong> ...;
};
// Helper from the question: stores the aslong() results into result[0..3], then
// overwrites each slot with that value plus the matching numArray element.
// NOTE(review): `result` is declared `int *`, yet the calc kernel passes a `long`
// array — the element-width mismatch looks unintended; confirm against the caller.
void calcSomething(__global int * numArray, int * result) {
// Private copy of the first four input values, widened to long.
long tempArray[] = { numArray[0], numArray[1], numArray[2], numArray[3] };
// Each xN is the value just assigned to result[N] plus tempArray[N].
long x0 = (result[0] = aslong(numArray, 0)) + tempArray[0];
long x1 = (result[1] = aslong(numArray, 8)) + tempArray[1];
long x2 = (result[2] = aslong(numArray, 16)) + tempArray[2];
long x3 = (result[3] = aslong(numArray, 32)) + tempArray[3];
// do some calculations here
// Write the final values back through `result`.
result[0] = x0;
result[1] = x1;
result[2] = x2;
result[3] = x3;
};
// Kernel from the question: copies SomeConstants into a private array, runs
// calcSomething on it, and stores the four results into `sum` starting at this
// work-item's global id.
// NOTE(review): work-item g writes sum[g] .. sum[g+3], so neighbouring work-items
// write overlapping elements of `sum`.
__kernel void calc(__global int * numArray, __global long * sum) {
long arr[] = { SomeConstants[0], SomeConstants[1], SomeConstants[2], SomeConstants[3] };
calcSomething(numArray, arr);
sum[get_global_id(0)] = arr[0];
sum[get_global_id(0)+1] = arr[1];
sum[get_global_id(0)+2] = arr[2];
sum[get_global_id(0)+3] = arr[3];
};
The code above works as expected, but the problem is that I can't add any more calculations to calcSomething. When I add an empty loop to calcSomething as follows:
// Variant of calcSomething from the question: identical to the first version except
// for the empty for-loop inserted before the write-back.
// NOTE(review): `result` is declared `int *`, yet the calc kernel passes a `long`
// array — the element-width mismatch looks unintended; confirm against the caller.
void calcSomething(__global int * numArray, int * result) {
long tempArray[] = { numArray[0], numArray[1], numArray[2], numArray[3] };
// Each xN is the value just assigned to result[N] plus tempArray[N].
long x0 = (result[0] = aslong(numArray, 0)) + tempArray[0];
long x1 = (result[1] = aslong(numArray, 8)) + tempArray[1];
long x2 = (result[2] = aslong(numArray, 16)) + tempArray[2];
long x3 = (result[3] = aslong(numArray, 32)) + tempArray[3];
// Empty loop: has no body and touches none of the variables above.
for (int i = 0; i < 18; i++) {}
result[0] = x0;
result[1] = x1;
result[2] = x2;
result[3] = x3;
};
result[0] and result[1] store the correct values, but result[2] and result[3] are completely wrong. When I comment out the for-loop, result[2] and result[3] are correct again... any idea what happens to x0, x1, x2 and x3 while the for-loop runs?

EDITED ANSWER: Your problem is located in the operations:
sum[get_global_id(0)] = arr[0];
sum[get_global_id(0)+1] = arr[1];
sum[get_global_id(0)+2] = arr[2];
sum[get_global_id(0)+3] = arr[3];
You are using your sum[] global array with overlapping indexes. However MANY work-items will write the same memory, producing undefined results.
You need either to fix the overlapping, or perform an atomic operation to the global variable.
Example:
gid 0 -> Write to sum[0,1,2,3]
gid 1 -> Write to sum[1,2,3,4]
//....
The for loop has nothing to do with the problem at all. You are just seeing the effect of different schedules in a multithreaded environment. But that is not the root of the problem.
Further edit:
The code has problems NOT related to OpenCL. The only problem with OpenCL is already described above.
There are many places where C variables are wrongly addressed or implicitly cast and then used incorrectly. For example, in aslong() 8 consecutive ints are added to make a long, but a long is made of 8 bytes, not 8 ints.

Related

Is there any optimization function in Rcpp

The following is my Rcpp code, and I want to minimize the objective function logtpoi(x,theta) respect to theta in R by 'nlminb'. I found it is slow.
I have two question:
Anyone can improve my Rcpp code? Thank you very much.
Is there any optimization functions in Rcpp? If yes,maybe I can use them in Rcpp directly. And how to use them? Thank you very much.
My code:
#include <RcppArmadillo.h>
using namespace Rcpp;
using namespace arma;
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::export]]
List dtpoi0(const IntegerVector& x, const NumericVector& theta){
// Question code: for one observation x = (x1, x2, x3) and parameters theta[0..5],
// enumerates all non-negative triples (i, j, k) with i+j <= s1, i+k <= s2 and
// j+k <= s3, and sums over them the product of six dpois() values.
// Returns a list with the triple matrix ("misy") and the sum ("fvalue").
//x is 3-dim vector; theta is a 6-dim parameter vector.
//be careful the order of theta1,...,theta6.
double theta1 = theta[0]; double theta2 = theta[1];
double theta3 = theta[2]; double theta4 = theta[3];
double theta5 = theta[4]; double theta6 = theta[5];
int x1 = x[0]; int x2 = x[1]; int x3 = x[2];
// s1..s3 are the pairwise minima of the three counts; they bound the loops below.
IntegerVector z1 = IntegerVector::create(x1,x2);
IntegerVector z2 = IntegerVector::create(x1,x3);
IntegerVector z3 = IntegerVector::create(x2,x3);
int s1 = min(z1); int s2 = min(z2); int s3 = min(z3);
// missy starts as a single all-zero seed row; every admissible triple is appended below it.
arma::imat missy(1,3,fill::zeros); arma::irowvec ijk={0,0,0};
for (int i = 0; i <= s1; ++i) {
for (int j = 0; j <= s2; ++j) {
for (int k = 0; k <= s3; ++k) {
if ((i+j <= s1) & (i+k <= s2) & ( j+k <= s3))
{ ijk = {i,j,k};
missy = join_cols(missy,ijk);}
}
}
}
// Convert the Armadillo matrix to an Rcpp matrix for indexing and for the return value.
IntegerMatrix misy = as<IntegerMatrix>(wrap(missy));
IntegerVector u1 = IntegerVector::create(0);
IntegerVector u2 = IntegerVector::create(0);
IntegerVector u3 = IntegerVector::create(0);
IntegerVector u4 = IntegerVector::create(0);
IntegerVector u5 = IntegerVector::create(0);
IntegerVector u6 = IntegerVector::create(0);
int total = misy.nrow();
double fvalue = 0;
NumericVector part1(1); NumericVector part2(1);
NumericVector part3(1); NumericVector part4(1);
NumericVector part5(1); NumericVector part6(1);
// Starts at l = 1, so the all-zero seed row 0 is not counted.
for (int l = 1; l < total; ++l) {
// u1..u3: each count minus the two triple components it shares; u4..u6: the triple itself.
u1 = IntegerVector::create(x1-misy(l,0)-misy(l,1));
u2 = IntegerVector::create(x2-misy(l,0)-misy(l,2));
u3 = IntegerVector::create(x3-misy(l,1)-misy(l,2));
u4 = IntegerVector::create(misy(l,0));
u5 = IntegerVector::create(misy(l,1));
u6 = IntegerVector::create(misy(l,2));
part1 = dpois(u1,theta1);
part2 = dpois(u2,theta2);
part3 = dpois(u3,theta3);
part4 = dpois(u4,theta4);
part5 = dpois(u5,theta5);
part6 = dpois(u6,theta6);
// Accumulate the product of the six one-element density vectors.
fvalue = fvalue + (part1*part2*part3*part4*part5*part6)[0]; }
return(List::create(Named("misy") = misy,Named("fvalue") = fvalue));
}
// [[Rcpp::export]]
NumericVector dtpoi(const IntegerMatrix& x, const NumericVector& theta){
// Evaluates dtpoi0 on every row of x and returns the "fvalue" entries, one per row.
//x is n*3 matrix, n is the number of observations.
int n = x.nrow();
NumericVector density(n);
for (int i = 0; i < n; ++i){
// Each call also builds and converts the full triple matrix, which is then discarded.
density(i) = dtpoi0(x.row(i),theta)["fvalue"];
}
return(density);
}
// [[Rcpp::export]]
double logtpoi0(const IntegerMatrix& x,const NumericVector theta){
// Returns minus the sum of log(dtpoi(x, theta) + 1e-60); the 1e-60 offset keeps
// log() away from an exact zero density.
// theta must be a 6-dimiension parameter.
double nln = -sum(log( dtpoi(x,theta) + 1e-60 ));
// Non-finite results are replaced by -1e10.
// NOTE(review): if this value is minimised, a large negative fallback may attract
// the optimiser towards invalid parameters — confirm the intended sign.
if(arma::is_finite(nln)) {nln = nln;} else {nln = -1e10;}
return(nln);
}

Huge caveat ahead: I don’t really know Armadillo. But I’ve had a stab at it because the code looks interesting.
A few general things:
You don’t need to declare things before you assign them for the first time. In particular, it’s generally not necessary to declare vectors outside a loop if they’re only used inside the loop. This is probably no less efficient than declaring them inside the loop. However, if your code is too slow it makes sense to carefully profile this, and test whether the assumption holds.
Many of your declarations are just aliases for vector elements and don’t seem necessary.
Your z{1…3} vectors aren’t necessary. C++ has a min function to find the minimum of two elements.
dtpoi0 contains two main loops. Both of these have been heavily modified in my code:
The first loop iterates over many ks that are never used, due to the internal if that tests whether i + j exceeds s1. By pulling this check into the loop condition of j, we perform fewer k loops.
Your if uses & instead of &&. Like in R, using && rather than & causes short-circuiting. While this is probably not more efficient in this case, using && is idiomatic, whereas & causes head-scratching (my code uses and which is an alternative way of spelling && in C++; I prefer its readability).
The second loop effectively performs a matrix operation manually. I feel that there should be a way of expressing this purely with matrix operations — but as mentioned I’m not an Armadillo user. Still, my changes attempt to vectorise as much of this operation as possible (if nothing else this makes the code shorter). The dpois inner product is unfortunately still inside a loop.
The logic of logtpoi0 can be made more idiomatic and (IMHO) more readable by using the conditional operator instead of if.
const-correctness is a big deal in C++, since it weeds out accidental modifications. Use const liberally when declaring variables that are not supposed to change.
In terms of efficiency, the biggest hit when calling dtpoi or logtpoi0 is probably the conversion of missy to misy, which causes allocations and memory copies. Only convert to IntegerMatrix when necessary, i.e. when actually returning that value to R. For that reason, I’ve split dtpoi0 into two parts.
Another inefficiency is the fact that the first loop in dtpoi0 grows a matrix by appending rows one at a time (join_cols reallocates and copies the whole matrix on every call). That’s a big no-no. However, rewriting the code to avoid this isn’t trivial.
#include <algorithm>
#include <RcppArmadillo.h>
// [[Rcpp::depends("RcppArmadillo")]]
using namespace Rcpp;
using namespace arma;
// Builds the matrix of admissible index triples for one observation x.
// Row 0 is an all-zero seed row; every following row is one (i, j, k) with
// i, j, k >= 0, i + j <= s1, i + k <= s2 and j + k <= s3, where s1..s3 are the
// pairwise minima of x[0..2].
imat dtpoi0_mat(const IntegerVector& x) {
  const int s1 = std::min(x[0], x[1]);
  const int s2 = std::min(x[0], x[2]);
  const int s3 = std::min(x[1], x[2]);

  imat triples(1, 3, fill::zeros);
  for (int i = 0; i <= s1; ++i) {
    // j <= s2 and i + j <= s1, folded into a single upper bound.
    const int jMax = std::min(s2, s1 - i);
    for (int j = 0; j <= jMax; ++j) {
      // k <= s3, i + k <= s2 and j + k <= s3, folded into a single upper bound.
      const int kMax = std::min(s3, std::min(s2 - i, s3 - j));
      for (int k = 0; k <= kMax; ++k) {
        triples = join_cols(triples, irowvec{i, j, k});
      }
    }
  }
  return triples;
}
double dtpoi0_fvalue(const IntegerVector& x, const NumericVector& theta, imat& missy) {
double fvalue = 0.0;
ivec xx = as<ivec>(x);
missy.each_row([&](irowvec& v) {
const ivec u(join_cols(xx - v(uvec{0, 0, 1}) - v(uvec{1, 2, 3}), v));
double prod = 1;
for (int i = 0; i < u.n_elem; ++i) {
prod *= R::dpois(u[i], theta[i], 0);
}
fvalue += prod;
});
return fvalue;
}
double dtpoi0_fvalue(const IntegerVector& x, const NumericVector& theta) {
imat missy = dtpoi0_mat(x);
return dtpoi0_fvalue(x, theta, missy);
}
// [[Rcpp::export]]
List dtpoi0(const IntegerVector& x, const NumericVector& theta) {
  // Exported entry point: returns both the triple matrix ("misy", converted to an
  // R integer matrix only here) and the summed value ("fvalue").
  imat triples = dtpoi0_mat(x);
  const double total = dtpoi0_fvalue(x, theta, triples);
  const IntegerMatrix misy = as<IntegerMatrix>(wrap(triples));
  return List::create(Named("misy") = misy, Named("fvalue") = total);
}
// [[Rcpp::export]]
NumericVector dtpoi(const IntegerMatrix& x, const NumericVector& theta) {
  // x has one observation per row (n x 3); the result holds one value per row.
  const int nObs = x.nrow();
  NumericVector density(nObs);
  int row = 0;
  while (row < nObs) {
    density(row) = dtpoi0_fvalue(x.row(row), theta);
    ++row;
  }
  return density;
}
// [[Rcpp::export]]
double logtpoi0(const IntegerMatrix& x, const NumericVector theta) {
  // theta must have 6 elements. The 1e-60 offset keeps log() away from an exact
  // zero density; a non-finite total is replaced by the fixed value -1e10.
  const double nln = -sum(log(dtpoi(x, theta) + 1e-60));
  if (is_finite(nln)) {
    return nln;
  }
  return -1e10;
}
Important: This compiles, but I can’t test its correctness. It’s entirely possible (even likely!) that my refactor introduced errors. It should therefore only be viewed as a solution sketch, and should by no means be copied and pasted into an application.

Optimizing global memory load in CUDA

My task :
I have two matrices : A - (18 x 4194304) ; B - (18 x 1024).
I have to take each 18-length vector from A and compute distance with each 18-length vector from B and find minimum distance and index.
My code :
// Shuffle-down min reduction over (dist, idx) pairs with offsets 16, 8, 4, 2, 1.
// At each step a lane compares its pair with the one held `offset` lanes above it
// and keeps the pair with the smaller dist. All 32 lanes must participate
// (full 0xFFFFFFFF mask).
__device__
void GetMin(float &dist, int &idx)
{
    for (int offset = 16; offset >= 1; offset /= 2)
    {
        const float otherDist = __shfl_down_sync(0xFFFFFFFF, dist, offset, 32);
        const int otherIdx = __shfl_down_sync(0xFFFFFFFF, idx, offset);
        if (dist > otherDist)
        {
            dist = otherDist;
            idx = otherIdx;
        }
    }
}
// Question code: for one vector of A, finds the index of the closest of the 1024
// vectors in B (squared Euclidean distance over nNumImages components) and writes
// that index to `output`.
// NOTE(review): the A index `blockIdx.x * ty + j * nNumPixels` and the output index
// `blockIdx.x * ty` do not involve blockDim — confirm against the launch configuration.
__global__
void CalcMinDist_kernel(const float *A, const float *B, float *output, const int nNumPixels, int nNumImages)
{
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y;
int lane_id = tx % 32;
float dist = 0;
int idx = 0;
// Running minimum; only meaningful in the thread with threadIdx.x == 0.
float fMin = 99999999;
int nMinIdx = -1;
// Each lane handles B vectors lane_id, lane_id + 32, ... below 1024.
for(int i = lane_id; i < 1024; i += 32)
{
dist = 0;
// Squared distance between the A vector and B vector i, one component per j.
for(int j = 0; j < nNumImages; ++j)
{
int img_idx = blockIdx.x * ty + j * nNumPixels;
dist += (A[img_idx] - B[i * nNumImages + j]) *
(A[img_idx] - B[i * nNumImages + j]);
}
idx = i;
// Reduce (dist, idx) across the warp with shuffles.
GetMin(dist, idx);
// NOTE(review): lanes are derived from tx % 32 but the check below uses threadIdx.x — confirm blockDim.x is 32.
if(threadIdx.x == 0)
{
if(fMin > dist)
{
fMin = dist;
nMinIdx = idx;
}
}
}
if(threadIdx.x == 0)
{
// The index is stored into a float output array.
output[blockIdx.x * ty] = nMinIdx;
}
}
Looking at the profiler, I'm memory bound, and do have ~90% occupancy. Is there any way to speed up this operation?
Let me know if I need to provide any other information.

Actually, I would look at the algorithm first. This is a geometric problem - treat it as such.
You should represent the B data using a different data structure, e.g. by clustering or building a partition structure (e.g. k-d tree). That will let you avoid actually computing the distance from most B elements. (You could also consider a projection onto fewer dimensions, but the benefit of this may be more elusive.)
With respect to the access pattern - you would probably benefit from having consecutive threads working on consecutive elements of the 18-element-long vectors, rather than having threads work on complete 18-element-long vectors individually. That would better fit the memory layout - right now, a warp read is of many elements which are at distance 18 from each other. If I understand the code correctly anyway.
(I also think the GetMin() could avoid some of the index swaps, but that's not significant since you only perform very few of those.)

OpenCL Kernel Error -11

I'm new to OpenCL and i'm trying to parallelise an edge detection program.I'm trying to write a kernel from the edge detection function.
The original function:
/*
 * Applies the 3x3 filter F (for the X direction) and its transpose (for the Y
 * direction) to every position of the w-by-h image `in` where the 3x3 window
 * fits, and writes the magnitude sqrt(Gx^2 + Gy^2) to `out` at the window's
 * top-left position. The last two rows and columns of `out` are left untouched.
 */
void edgeDetection(float *out, float *in, int w, int h) {
    int row, col;
    for (row = 0; row < h-2; row++) {
        for (col = 0; col < w-2; col++) {
            float gx = 0.0;
            float gy = 0.0;
            int tap;
            /* walk the 3x3 window in row-major order */
            for (tap = 0; tap < 9; tap++) {
                int fr = tap / 3;
                int fc = tap % 3;
                float p = in[(row+fr)*w + (col+fc)];
                gx += p * F[fr*3 + fc];   /* X-directional edges */
                gy += p * F[fc*3 + fr];   /* Y-directional edges (transposed filter) */
            }
            /* all edges, pythagorean sum */
            out[row*w + col] = sqrtf(gx*gx + gy*gy);
        }
    }
}
My OpenCL Kernel:
// Question code: one work-item per output pixel; global id 0 is used as the row and
// global id 1 as the column. Work-items outside the (h-2) x (w-2) region do nothing.
// NOTE(review): this snippet fails to build; the answer that follows lists the
// reasons (address space of pOut, undefined F, sqrtf).
__kernel
void edgeDetection(__global float *out,
__global float *in, int w, int h)
{
// Get the work-item’s unique ID
const int r = get_global_id(0);
const int c = get_global_id(1);
if(r>=0 && c>=0 && r<h-2 && c<w-2){
float G;
float* pOut = &out[r*w + c];
float Gx = 0.0;
float Gy = 0.0;
int fr,fc;
// 3x3 window: F is applied as-is for Gx and transposed for Gy.
for (fr = 0; fr < 3; fr++) {
for (fc = 0; fc < 3; fc++) {
float p = in[(r+fr)*w + (c+fc)];
Gx += p * F[fr*3 + fc];
Gy += p * F[fc*3 + fr];
}
}
G = sqrtf(Gx*Gx + Gy*Gy);
*pOut = G;
}
}
When I try to build the program from the .cl file using this(chk is a function to check if there are any failures/errors):
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
chk(status, "clBuildProgram");
I get an error saying, "clBuildProgram failed (-11)". From my research, it is commonly thought that this error is caused by a syntax error. However, after checking many times I cannot see anything particularly wrong with my kernel. Can somebody help me figure out what's wrong with it?

There are many errors in the code:
1)
float* pOut = &out[r*w + c];
This is invalid, it should be:
__global float* pOut = &out[r*w + c];
2) You are using F in the kernel which was never defined.
3) sqrtf is not defined in CL, did you mean sqrt instead?

expression must have arithmetic or unscoped enum type

This is my first try at a method without input. Here is the code:
// Returns a! as an int (1 for a <= 1). The result overflows int for a > 12.
int factorial(int a)
{
    int product = 1;
    for (int k = 1; k <= a; ++k)
    {
        product *= k;
    }
    return product;
}
// Returns the double factorial a!! = a * (a-2) * (a-4) * ... down to 2 (a even)
// or 1 (a odd). Returns 1 for a <= 1. The result overflows int for a > 19.
int double_factorial(int a)
{
    int result = 1;
    // Step down from a in strides of 2 so the factors keep a's parity; starting
    // at 2 regardless of parity would multiply even numbers for an odd a.
    for (int i = a; i > 1; i -= 2)
    {
        result *= i;
    }
    return result;
}
// Approximates pi with the series  pi / 2 = sum_{k >= 0} k! / (2k + 1)!!.
//
// Each term is derived from the previous one in floating point:
//   term_k = term_{k-1} * k / (2k + 1)
// This avoids the integer division and int overflow of evaluating k! and
// (2k+1)!! separately. Terms shrink by roughly a factor of two per step, so the
// loop ends after about as many iterations as long double has mantissa bits.
long double pi()
{
    long double sum = 0.0L;
    long double term = 1.0L;   // k = 0: 0! / 1!! = 1
    unsigned long k = 0;
    // Stop once adding the next term no longer changes the sum.
    while (sum + term != sum)
    {
        sum += term;
        ++k;
        term *= static_cast<long double>(k) / (2.0L * k + 1.0L);
    }
    return 2.0L * sum;
}
// Question code: intended to compute the circumference 2 * pi * r for radius r.
// NOTE(review): `pi` is used here as a name, not called, which is what triggers
// the quoted compiler error; the function also has no return statement.
long double circumference_circle_input_radius(double r)
{
long double C = 2.0 * pi * r; //error: 'pi' expression must have arithmetic or unscoped enum type.
}
When I try to use method "pi" in this, the error appeared. I dont understand what the error means, so it is quite hard to understand the problem and debug it.

pi is a function, not a variable. To call it in an expression, you need to use parentheses:
long double C = 2.0 * pi() * r;
^^
Without the parentheses, the compiler thinks you're trying to multiply the function itself by 2, which isn't an operation that makes any sense.

convert bezier curve to polygonal chain?

I want to split a bezier curve into a polygonal chain with n straight lines. The number of lines being dependent on a maximum allowed angle between 2 connecting lines.
I'm looking for an algorithm to find the most optimal solution (ie to reduce as much as possible the number of straight lines).
I know how to split a bezier curve using de Casteljau or Bernstein polynomials. I tried dividing the bezier in half, calculating the angle between the straight lines, and splitting again if the angle between the connecting lines exceeds a certain threshold, but this may take shortcuts across the curve.
Is there a known algorithm or pseudo code available to do this conversion?

Use de Casteljau algorithm recursively until the control points are approximately collinear. See for instance http://www.antigrain.com/research/adaptive_bezier/index.html.

This was a fascinating topic. The only thing I'm adding is tested C# code, to perhaps save somebody the trouble. And I tried to write for clarity as opposed to speed, so it mostly follows the AGG web site's PDF doc (see above) on the Casteljau algorithm. The Notation follows the diagram in that PDF.
public class Bezier
{
public PointF P1; // Begin Point
public PointF P2; // Control Point
public PointF P3; // Control Point
public PointF P4; // End Point
// Made these global so I could diagram the top solution
public Line L12;
public Line L23;
public Line L34;
public PointF P12;
public PointF P23;
public PointF P34;
public Line L1223;
public Line L2334;
public PointF P123;
public PointF P234;
public Line L123234;
public PointF P1234;
/// <summary>
/// Creates a cubic Bezier from its begin point, two control points and end point.
/// </summary>
public Bezier(PointF p1, PointF p2, PointF p3, PointF p4)
{
    this.P1 = p1;
    this.P2 = p2;
    this.P3 = p3;
    this.P4 = p4;
}
/// <summary>
/// One de Casteljau subdivision step at the curve's midpoint.
/// From the control polygon p1, p2, p3, p4 it builds the lines L12, L23, L34 and
/// their midpoints P12, P23, P34; then the lines L1223, L2334 between those
/// midpoints and their midpoints P123, P234; and finally L123234 with its
/// midpoint P1234. All intermediates are stored in the public fields.
/// If the curve is flat, the segment P1-P4 is appended to the list; otherwise
/// the two half curves are subdivided recursively, first half first.
/// </summary>
/// <param name="lines">List that receives the approximating segments.</param>
public void SplitBezier( List<Line> lines)
{
    // Level 1: edges of the control polygon and their midpoints.
    L12 = new Line(P1, P2);
    P12 = L12.MidPoint();
    L23 = new Line(P2, P3);
    P23 = L23.MidPoint();
    L34 = new Line(P3, P4);
    P34 = L34.MidPoint();

    // Level 2.
    L1223 = new Line(P12, P23);
    P123 = L1223.MidPoint();
    L2334 = new Line(P23, P34);
    P234 = L2334.MidPoint();

    // Level 3: P1234 is the point where the curve is split.
    L123234 = new Line(P123, P234);
    P1234 = L123234.MidPoint();

    if (!CurveIsFlat())
    {
        new Bezier(P1, P12, P123, P1234).SplitBezier(lines);
        new Bezier(P1234, P234, P34, P4).SplitBezier(lines);
        return;
    }

    lines.Add(new Line(P1, P4));
}
/// <summary>
/// Reports whether the curve is close enough to its chord P1-P4 to be replaced
/// by that chord. The test adds the perpendicular distances of both control
/// points (P2 and P3) from the chord and compares the sum with a fixed
/// tolerance, following the basic flatness criterion of the AGG adaptive
/// subdivision article. The former test looked only at P1, P2 and P3, so a
/// curve whose end point P4 lay off that line could be reported as flat.
/// Limitation: control points that lie on the chord's line but beyond its ends
/// still count as flat.
/// </summary>
/// <returns>true when the summed deviation of P2 and P3 is within the tolerance.</returns>
public bool CurveIsFlat()
{
    const float tolerance = 0.1f; // Hard-coded constant, in coordinate units

    float dx = P4.X - P1.X;
    float dy = P4.Y - P1.Y;
    float chordSq = dx * dx + dy * dy;

    if (chordSq == 0f)
    {
        // Degenerate chord (P1 == P4): there is no line to measure against, so
        // require both control points to sit within the tolerance of P1.
        float d2Sq = (P2.X - P1.X) * (P2.X - P1.X) + (P2.Y - P1.Y) * (P2.Y - P1.Y);
        float d3Sq = (P3.X - P1.X) * (P3.X - P1.X) + (P3.Y - P1.Y) * (P3.Y - P1.Y);
        return d2Sq <= tolerance * tolerance && d3Sq <= tolerance * tolerance;
    }

    // |cross product| = perpendicular distance from the chord times the chord length.
    float d2 = Math.Abs((P2.X - P4.X) * dy - (P2.Y - P4.Y) * dx);
    float d3 = Math.Abs((P3.X - P4.X) * dy - (P3.Y - P4.Y) * dx);

    // (dist2 + dist3) <= tolerance, with both sides multiplied by the chord
    // length and squared to avoid a square root.
    return (d2 + d3) * (d2 + d3) <= tolerance * tolerance * chordSq;
}
The PointF is from System.Drawing, and the Line class follows:
/// <summary>
/// A straight segment between two points.
/// </summary>
public class Line
{
    PointF P1;
    PointF P2;

    /// <summary>Creates the segment from pt1 to pt2.</summary>
    public Line(PointF pt1, PointF pt2)
    {
        this.P1 = pt1;
        this.P2 = pt2;
    }

    /// <summary>Returns the point halfway between the two end points.</summary>
    public PointF MidPoint()
    {
        float midX = (P1.X + P2.X) / 2f;
        float midY = (P1.Y + P2.Y) / 2f;
        return new PointF(midX, midY);
    }
}
A sample call creates the Bezier object with 4 points (begin, 2 control, and end), and returns a list of lines that approximate the Bezier:
TopBezier = new Bezier(Point1, Point2, Point3, Point4 );
List<Line> lines = new List<Line>();
TopBezier.SplitBezier(lines);
Thanks to Dr Jerry, AGG, and all the other contributors.

There are some alternatives for RSA flattening that are reported to be faster:
RSA vs PAA:
http://www.cis.usouthal.edu/~hain/general/Theses/Ahmad_thesis.pdf
RSA vs CAA vs PAA:
http://www.cis.usouthal.edu/~hain/general/Theses/Racherla_thesis.pdf
RSA = Recursive Subdivision Algorithm
PAA = Parabolic Approximation Algorithm
CAA = Circular Approximation Algorithm
According to Racherla, CAA is slower than the PAA by a factor of 1.5–2. CAA is as slow as RSA, but achieves required flatness better in offset curves.
It seems that PAA is best choice for actual curve and CAA is best for offset's of curve (when stroking curves).
I have tested the PAA from both theses, but they fail in some cases. Ahmad's PAA fails in collinear cases (all points on the same line), and Racherla's PAA fails in collinear cases and in cases where both control points are equal. With some fixes, it may be possible to get them to work as expected.

A visual example on my website -> DXF -> polybezier.
it is basically a recursive split with casteljau.
/**
 * Recursively converts a Bezier segment into polyline vertices.
 *
 * The segment is accepted as "flat enough" when either angle difference
 * computed by controlPointsDiff is below this.threshold; its point array[2] is
 * then pushed onto this.vertices. Otherwise the segment is split with
 * this.splitBezier and both halves (split.b1, split.b2) are converted in order.
 *
 * @param {Array<{x: number, y: number}>} array Points of the segment; indices 0..2 are read here.
 * @param {boolean} [init] When truthy, resets this.vertices and forces a first split.
 * @returns {Array} this.vertices, the accumulated vertex list.
 */
Bezier2Poly.prototype.convert = function(array,init) {
    if (init) {
        this.vertices = [];
    }
    // Vector from array[1] to array[2]. The y component must read array[2].y;
    // subtracting from the point object itself gives NaN and makes the second
    // test below always fail.
    var lastLeg = {x: array[2].x - array[1].x, y: array[2].y - array[1].y};
    if (!init && (Math.abs(this.controlPointsDiff(array[0], array[2])) < this.threshold
        || Math.abs(this.controlPointsDiff(lastLeg, array[2])) < this.threshold)) {
        this.vertices.push(array[2]);
    } else {
        var split = this.splitBezier(array);
        this.convert(split.b1);
        this.convert(split.b2);
    }
    return this.vertices;
}
And judgement by: calculating the angle between the controlpoints and the line through the endpoint.
/**
 * Returns the difference, in radians, between the directions of two vectors
 * (atan2 of vector1 minus atan2 of vector2). The result is not normalised.
 *
 * @param {{x: number, y: number}} vector1
 * @param {{x: number, y: number}} vector2
 * @returns {number}
 */
Bezier2Poly.prototype.controlPointsDiff = function (vector1, vector2) {
    return Math.atan2(vector1.y, vector1.x) - Math.atan2(vector2.y, vector2.x);
}

I solved it with Qt for any SVG path, including Bezier curves. In the SVG module I found a static function in qsvghandler.cpp, parsePathDataFast, which converts an SVG path string into a QPainterPath. And the cherry on the cake: QPainterPath has three native functions to convert a path to polygons (the big one, toFillPolygon, and two others that split the path into a list of polygons, toSubpathPolygons and toFillPolygons), along with nice stuff like bounding box, intersected, translate ... ready to use with Boost::Geometry now, not so bad!
the header parsepathdatafast.h
#ifndef PARSEPATHDATAFAST_H
#define PARSEPATHDATAFAST_H
#include <QPainterPath>
#include <QString>
bool parsePathDataFast(const QStringRef &dataStr, QPainterPath &path);
#endif // PARSEPATHDATAFAST_H
the code parsepathdatafast.cpp
#include <QtCore/qmath.h>
#include <QtMath>
#include <QChar>
#include <QByteArray>
#include <QMatrix>
#include <parsepathdatafast.h>
Q_CORE_EXPORT double qstrtod(const char *s00, char const **se, bool *ok);
// '0' is 0x30 and '9' is 0x39
static inline bool isDigit(ushort ch)
{
    // The high bits must equal 3, i.e. ch is in 0x30..0x3f.
    if ((ch >> 4) != 3)
        return false;
    // 0x3ff has its ten lowest bits set, so shifting it right by the low nibble
    // leaves a non-zero value exactly when that nibble is 0..9.
    static quint16 digitMask = 0x3ff;
    return (digitMask >> (ch & 15)) != 0;
}
// Parses a decimal number starting at `str` and advances `str` past the characters
// it consumed. The accepted pieces — optional sign, digits, optional '.', digits,
// optional exponent — are copied into a local buffer of at most 255 characters.
// Short inputs without an exponent are converted with plain integer arithmetic;
// everything else is handed to qstrtod().
static qreal toDouble(const QChar *&str)
{
const int maxLen = 255;//technically doubles can go til 308+ but whatever
char temp[maxLen+1];
int pos = 0;
// Sign: '-' is copied to the buffer, '+' is consumed but not copied.
if (*str == QLatin1Char('-')) {
temp[pos++] = '-';
++str;
} else if (*str == QLatin1Char('+')) {
++str;
}
// Integer part.
while (isDigit(str->unicode()) && pos < maxLen) {
temp[pos++] = str->toLatin1();
++str;
}
if (*str == QLatin1Char('.') && pos < maxLen) {
temp[pos++] = '.';
++str;
}
// Fraction part.
while (isDigit(str->unicode()) && pos < maxLen) {
temp[pos++] = str->toLatin1();
++str;
}
bool exponent = false;
// Exponent: 'e' or 'E' (stored as 'e'), optional sign, digits.
if ((*str == QLatin1Char('e') || *str == QLatin1Char('E')) && pos < maxLen) {
exponent = true;
temp[pos++] = 'e';
++str;
if ((*str == QLatin1Char('-') || *str == QLatin1Char('+')) && pos < maxLen) {
temp[pos++] = str->toLatin1();
++str;
}
while (isDigit(str->unicode()) && pos < maxLen) {
temp[pos++] = str->toLatin1();
++str;
}
}
temp[pos] = '\0';
qreal val;
// Fast path: fewer than 10 buffered characters and no exponent, so the digits
// fit into an int. All digits are accumulated into ival and the result is
// divided by 10^(number of fraction digits).
if (!exponent && pos < 10) {
int ival = 0;
const char *t = temp;
bool neg = false;
if(*t == '-') {
neg = true;
++t;
}
while(*t && *t != '.') {
ival *= 10;
ival += (*t) - '0';
++t;
}
if(*t == '.') {
++t;
int div = 1;
while(*t) {
ival *= 10;
ival += (*t) - '0';
div *= 10;
++t;
}
val = ((qreal)ival)/((qreal)div);
} else {
val = ival;
}
if (neg)
val = -val;
} else {
// Slow path: full conversion; the `ok` flag is not inspected afterwards.
bool ok = false;
val = qstrtod(temp, 0, &ok);
}
return val;
}
// Reads a run of numbers from `str` into `points`, advancing `str` past them.
// Numbers may be separated by whitespace and at most one comma. Reading stops at
// the first character that cannot start a number (not a digit, sign or '.').
// The input must be terminated by a character that is neither whitespace nor the
// start of a number, because no end pointer is checked here.
static inline void parseNumbersArray(const QChar *&str, QVarLengthArray<qreal, 8> &points)
{
    for (; str->isSpace(); ++str) {}

    for (;;) {
        const bool startsNumber = isDigit(str->unicode()) ||
                                  *str == QLatin1Char('-') || *str == QLatin1Char('+') ||
                                  *str == QLatin1Char('.');
        if (!startsNumber)
            break;

        points.append(toDouble(str));

        for (; str->isSpace(); ++str) {}
        if (*str == QLatin1Char(','))
            ++str;
        //eat the rest of space
        for (; str->isSpace(); ++str) {}
    }
}
/**
static QVector<qreal> parsePercentageList(const QChar *&str)
{
QVector<qreal> points;
if (!str)
return points;
while (str->isSpace())
++str;
while ((*str >= QLatin1Char('0') && *str <= QLatin1Char('9')) ||
*str == QLatin1Char('-') || *str == QLatin1Char('+') ||
*str == QLatin1Char('.')) {
points.append(toDouble(str));
while (str->isSpace())
++str;
if (*str == QLatin1Char('%'))
++str;
while (str->isSpace())
++str;
if (*str == QLatin1Char(','))
++str;
//eat the rest of space
while (str->isSpace())
++str;
}
return points;
}
**/
// Appends one cubic Bezier to `path` approximating the arc from angle th0 to th1
// (radians) on the unit circle centred at (xc, yc). The unit-circle points are
// mapped to the ellipse by scaling with (rx, ry) and rotating by xAxisRotation
// degrees.
static void pathArcSegment(QPainterPath &path,
                           qreal xc, qreal yc,
                           qreal th0, qreal th1,
                           qreal rx, qreal ry, qreal xAxisRotation)
{
    // Scale-then-rotate matrix taking unit-circle coordinates to the ellipse.
    const qreal sinTh = qSin(xAxisRotation * (M_PI / 180.0));
    const qreal cosTh = qCos(xAxisRotation * (M_PI / 180.0));
    const qreal a00 = cosTh * rx;
    const qreal a01 = -sinTh * ry;
    const qreal a10 = sinTh * rx;
    const qreal a11 = cosTh * ry;

    // Length of the control-point handles for this arc span.
    const qreal thHalf = 0.5 * (th1 - th0);
    const qreal t = (8.0 / 3.0) * qSin(thHalf * 0.5) * qSin(thHalf * 0.5) / qSin(thHalf);

    // Control points and end point in unit-circle space.
    const qreal x1 = xc + qCos(th0) - t * qSin(th0);
    const qreal y1 = yc + qSin(th0) + t * qCos(th0);
    const qreal x3 = xc + qCos(th1);
    const qreal y3 = yc + qSin(th1);
    const qreal x2 = x3 + t * qSin(th1);
    const qreal y2 = y3 - t * qCos(th1);

    path.cubicTo(a00 * x1 + a01 * y1, a10 * x1 + a11 * y1,
                 a00 * x2 + a01 * y2, a10 * x2 + a11 * y2,
                 a00 * x3 + a01 * y3, a10 * x3 + a11 * y3);
}
// the arc handling code underneath is from XSVG (BSD license)
/*
* Copyright 2002 USC/Information Sciences Institute
*
* Permission to use, copy, modify, distribute, and sell this software
* and its documentation for any purpose is hereby granted without
* fee, provided that the above copyright notice appear in all copies
* and that both that copyright notice and this permission notice
* appear in supporting documentation, and that the name of
* Information Sciences Institute not be used in advertising or
* publicity pertaining to distribution of the software without
* specific, written prior permission. Information Sciences Institute
* makes no representations about the suitability of this software for
* any purpose. It is provided "as is" without express or implied
* warranty.
*
* INFORMATION SCIENCES INSTITUTE DISCLAIMS ALL WARRANTIES WITH REGARD
* TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL INFORMATION SCIENCES
* INSTITUTE BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
* DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
* OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
* PERFORMANCE OF THIS SOFTWARE.
*
*/
// Appends an SVG elliptical arc to `path` as a sequence of cubic Beziers.
//
// rx, ry            ellipse radii (the sign is ignored)
// x_axis_rotation   rotation of the ellipse's x axis, in degrees
// large_arc_flag    non-zero selects the larger of the two candidate arcs
// sweep_flag        non-zero selects the positive-angle direction
// x, y              end point of the arc
// curx, cury        current point, i.e. start of the arc
//
// Degenerate input follows the SVG implementation notes: an arc whose end point
// equals its start point is omitted, and a zero radius turns the arc into a
// straight line to the end point. Without these guards the code below divides
// by zero and produces NaN coordinates.
static void pathArc(QPainterPath &path,
qreal rx,
qreal ry,
qreal x_axis_rotation,
int large_arc_flag,
int sweep_flag,
qreal x,
qreal y,
qreal curx, qreal cury)
{
qreal sin_th, cos_th;
qreal a00, a01, a10, a11;
qreal x0, y0, x1, y1, xc, yc;
qreal d, sfactor, sfactor_sq;
qreal th0, th1, th_arc;
int i, n_segs;
qreal dx, dy, dx1, dy1, Pr1, Pr2, Px, Py, check;
rx = qAbs(rx);
ry = qAbs(ry);
// Identical end points: nothing to draw (d below would be 0).
if (curx == x && cury == y)
return;
// Zero radius: the arc degenerates to a straight segment.
if (rx == 0 || ry == 0) {
path.lineTo(x, y);
return;
}
sin_th = qSin(x_axis_rotation * (M_PI / 180.0));
cos_th = qCos(x_axis_rotation * (M_PI / 180.0));
// Half the start-to-end vector, rotated into the ellipse's own axes.
dx = (curx - x) / 2.0;
dy = (cury - y) / 2.0;
dx1 = cos_th * dx + sin_th * dy;
dy1 = -sin_th * dx + cos_th * dy;
Pr1 = rx * rx;
Pr2 = ry * ry;
Px = dx1 * dx1;
Py = dy1 * dy1;
/* Spec : check if radii are large enough */
check = Px / Pr1 + Py / Pr2;
if (check > 1) {
// Radii too small to span the end points: scale both up uniformly.
rx = rx * qSqrt(check);
ry = ry * qSqrt(check);
}
// Inverse of the scale-and-rotate transform: maps the ellipse to a unit circle.
a00 = cos_th / rx;
a01 = sin_th / rx;
a10 = -sin_th / ry;
a11 = cos_th / ry;
x0 = a00 * curx + a01 * cury;
y0 = a10 * curx + a11 * cury;
x1 = a00 * x + a01 * y;
y1 = a10 * x + a11 * y;
/* (x0, y0) is current point in transformed coordinate space.
(x1, y1) is new point in transformed coordinate space.
The arc fits a unit-radius circle in this space.
*/
d = (x1 - x0) * (x1 - x0) + (y1 - y0) * (y1 - y0);
sfactor_sq = 1.0 / d - 0.25;
// Clamp rounding noise so the square root stays real.
if (sfactor_sq < 0) sfactor_sq = 0;
sfactor = qSqrt(sfactor_sq);
// The two flags together pick which of the two possible centres is used.
if (sweep_flag == large_arc_flag) sfactor = -sfactor;
xc = 0.5 * (x0 + x1) - sfactor * (y1 - y0);
yc = 0.5 * (y0 + y1) + sfactor * (x1 - x0);
/* (xc, yc) is center of the circle. */
th0 = qAtan2(y0 - yc, x0 - xc);
th1 = qAtan2(y1 - yc, x1 - xc);
th_arc = th1 - th0;
// Make the sign of the swept angle agree with sweep_flag.
if (th_arc < 0 && sweep_flag)
th_arc += 2 * M_PI;
else if (th_arc > 0 && !sweep_flag)
th_arc -= 2 * M_PI;
// One Bezier per (slightly more than) quarter turn.
n_segs = qCeil(qAbs(th_arc / (M_PI * 0.5 + 0.001)));
for (i = 0; i < n_segs; i++) {
pathArcSegment(path, xc, yc,
th0 + i * th_arc / n_segs,
th0 + (i + 1) * th_arc / n_segs,
rx, ry, x_axis_rotation);
}
}
bool parsePathDataFast(const QStringRef &dataStr, QPainterPath &path)
{
qreal x0 = 0, y0 = 0; // starting point
qreal x = 0, y = 0; // current point
char lastMode = 0;
QPointF ctrlPt;
const QChar *str = dataStr.constData();
const QChar *end = str + dataStr.size();
while (str != end) {
while (str->isSpace())
++str;
QChar pathElem = *str;
++str;
QChar endc = *end;
*const_cast<QChar *>(end) = 0; // parseNumbersArray requires 0-termination that QStringRef cannot guarantee
QVarLengthArray<qreal, 8> arg;
parseNumbersArray(str, arg);
*const_cast<QChar *>(end) = endc;
if (pathElem == QLatin1Char('z') || pathElem == QLatin1Char('Z'))
arg.append(0);//dummy
const qreal *num = arg.constData();
int count = arg.count();
while (count > 0) {
qreal offsetX = x; // correction offsets
qreal offsetY = y; // for relative commands
switch (pathElem.unicode()) {
case 'm': {
if (count < 2) {
num++;
count--;
break;
}
x = x0 = num[0] + offsetX;
y = y0 = num[1] + offsetY;
num += 2;
count -= 2;
path.moveTo(x0, y0);
// As per 1.2 spec 8.3.2 The "moveto" commands
// If a 'moveto' is followed by multiple pairs of coordinates without explicit commands,
// the subsequent pairs shall be treated as implicit 'lineto' commands.
pathElem = QLatin1Char('l');
}
break;
case 'M': {
if (count < 2) {
num++;
count--;
break;
}
x = x0 = num[0];
y = y0 = num[1];
num += 2;
count -= 2;
path.moveTo(x0, y0);
// As per 1.2 spec 8.3.2 The "moveto" commands
// If a 'moveto' is followed by multiple pairs of coordinates without explicit commands,
// the subsequent pairs shall be treated as implicit 'lineto' commands.
pathElem = QLatin1Char('L');
}
break;
case 'z':
case 'Z': {
x = x0;
y = y0;
count--; // skip dummy
num++;
path.closeSubpath();
}
break;
case 'l': {
if (count < 2) {
num++;
count--;
break;
}
x = num[0] + offsetX;
y = num[1] + offsetY;
num += 2;
count -= 2;
path.lineTo(x, y);
}
break;
case 'L': {
if (count < 2) {
num++;
count--;
break;
}
x = num[0];
y = num[1];
num += 2;
count -= 2;
path.lineTo(x, y);
}
break;
case 'h': {
x = num[0] + offsetX;
num++;
count--;
path.lineTo(x, y);
}
break;
case 'H': {
x = num[0];
num++;
count--;
path.lineTo(x, y);
}
break;
case 'v': {
y = num[0] + offsetY;
num++;
count--;
path.lineTo(x, y);
}
break;
case 'V': {
y = num[0];
num++;
count--;
path.lineTo(x, y);
}
break;
case 'c': {
if (count < 6) {
num += count;
count = 0;
break;
}
QPointF c1(num[0] + offsetX, num[1] + offsetY);
QPointF c2(num[2] + offsetX, num[3] + offsetY);
QPointF e(num[4] + offsetX, num[5] + offsetY);
num += 6;
count -= 6;
path.cubicTo(c1, c2, e);
ctrlPt = c2;
x = e.x();
y = e.y();
break;
}
case 'C': {
if (count < 6) {
num += count;
count = 0;
break;
}
QPointF c1(num[0], num[1]);
QPointF c2(num[2], num[3]);
QPointF e(num[4], num[5]);
num += 6;
count -= 6;
path.cubicTo(c1, c2, e);
ctrlPt = c2;
x = e.x();
y = e.y();
break;
}
case 's': {
if (count < 4) {
num += count;
count = 0;
break;
}
QPointF c1;
if (lastMode == 'c' || lastMode == 'C' ||
lastMode == 's' || lastMode == 'S')
c1 = QPointF(2*x-ctrlPt.x(), 2*y-ctrlPt.y());
else
c1 = QPointF(x, y);
QPointF c2(num[0] + offsetX, num[1] + offsetY);
QPointF e(num[2] + offsetX, num[3] + offsetY);
num += 4;
count -= 4;
path.cubicTo(c1, c2, e);
ctrlPt = c2;
x = e.x();
y = e.y();
break;
}
case 'S': {
if (count < 4) {
num += count;
count = 0;
break;
}
QPointF c1;
if (lastMode == 'c' || lastMode == 'C' ||
lastMode == 's' || lastMode == 'S')
c1 = QPointF(2*x-ctrlPt.x(), 2*y-ctrlPt.y());
else
c1 = QPointF(x, y);
QPointF c2(num[0], num[1]);
QPointF e(num[2], num[3]);
num += 4;
count -= 4;
path.cubicTo(c1, c2, e);
ctrlPt = c2;
x = e.x();
y = e.y();
break;
}
case 'q': {
if (count < 4) {
num += count;
count = 0;
break;
}
QPointF c(num[0] + offsetX, num[1] + offsetY);
QPointF e(num[2] + offsetX, num[3] + offsetY);
num += 4;
count -= 4;
path.quadTo(c, e);
ctrlPt = c;
x = e.x();
y = e.y();
break;
}
case 'Q': {
if (count < 4) {
num += count;
count = 0;
break;
}
QPointF c(num[0], num[1]);
QPointF e(num[2], num[3]);
num += 4;
count -= 4;
path.quadTo(c, e);
ctrlPt = c;
x = e.x();
y = e.y();
break;
}
case 't': {
if (count < 2) {
num += count;
count = 0;
break;
}
QPointF e(num[0] + offsetX, num[1] + offsetY);
num += 2;
count -= 2;
QPointF c;
if (lastMode == 'q' || lastMode == 'Q' ||
lastMode == 't' || lastMode == 'T')
c = QPointF(2*x-ctrlPt.x(), 2*y-ctrlPt.y());
else
c = QPointF(x, y);
path.quadTo(c, e);
ctrlPt = c;
x = e.x();
y = e.y();
break;
}
case 'T': {
if (count < 2) {
num += count;
count = 0;
break;
}
QPointF e(num[0], num[1]);
num += 2;
count -= 2;
QPointF c;
if (lastMode == 'q' || lastMode == 'Q' ||
lastMode == 't' || lastMode == 'T')
c = QPointF(2*x-ctrlPt.x(), 2*y-ctrlPt.y());
else
c = QPointF(x, y);
path.quadTo(c, e);
ctrlPt = c;
x = e.x();
y = e.y();
break;
}
case 'a': {
if (count < 7) {
num += count;
count = 0;
break;
}
qreal rx = (*num++);
qreal ry = (*num++);
qreal xAxisRotation = (*num++);
qreal largeArcFlag = (*num++);
qreal sweepFlag = (*num++);
qreal ex = (*num++) + offsetX;
qreal ey = (*num++) + offsetY;
count -= 7;
qreal curx = x;
qreal cury = y;
pathArc(path, rx, ry, xAxisRotation, int(largeArcFlag),
int(sweepFlag), ex, ey, curx, cury);
x = ex;
y = ey;
}
break;
case 'A': {
if (count < 7) {
num += count;
count = 0;
break;
}
qreal rx = (*num++);
qreal ry = (*num++);
qreal xAxisRotation = (*num++);
qreal largeArcFlag = (*num++);
qreal sweepFlag = (*num++);
qreal ex = (*num++);
qreal ey = (*num++);
count -= 7;
qreal curx = x;
qreal cury = y;
pathArc(path, rx, ry, xAxisRotation, int(largeArcFlag),
int(sweepFlag), ex, ey, curx, cury);
x = ex;
y = ey;
}
break;
default:
return false;
}
lastMode = pathElem.toLatin1();
}
}
return true;
}
One question: I couldn't find the Q_PI constant in the standard Qt headers, so I replaced it with M_PI — I hope that is OK!

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

OpenCL lost value of variables because of for-loop - for-loop

Related

Is there any optimization function in Rcpp

Optimizing global memory load in CUDA

OpenCL Kernel Error -11

expression must have arithmetic or unscoped enum type

convert bezier curve to polygonal chain?

Categories

Resources