gcc arm inline assembler %e0 and %f0 operand modifiers for 16-byte NEON operands?

gcc arm inline assembler %e0 and %f0 operand modifiers for 16-byte NEON operands? - gcc

Found the following inline assembler code to calculate the vector cross product :
float32x4_t cross_test( const float32x4_t& lhs, const float32x4_t& rhs )
{
float32x4_t result;
asm volatile(
"vext.8 d6, %e2, %f2, #4 \n\t"
"vext.8 d7, %e1, %f1, #4 \n\t"
"vmul.f32 %e0, %f1, %e2 \n\t"
"vmul.f32 %f0, %e1, d6 \n\t"
"vmls.f32 %e0, %f2, %e1 \n\t"
"vmls.f32 %f0, %e2, d7 \n\t"
"vext.8 %e0, %f0, %e0, #4 "
: "+w" ( result )
: "w" ( lhs ), "w" ( rhs )
: "d6", "d7" );
return result;
}
What do the modifiers e and f after '%' mean (e.g. %e2)? I can not find any reference for this.
This is the assembler code generated by gcc:
vext.8 d6, d20, d21, #4
vext.8 d7, d18, d19, #4
vmul.f32 d16, d19, d20
vmul.f32 d17, d18, d6
vmls.f32 d16, d21, d18
vmls.f32 d17, d20, d7
vext.8 d16, d17, d16, #4
I now understood the meaning of the used modifiers. Now I tried to follow the cross product algorithm. For this I added some additional comments to the assembler code but the result is not equal to my expectation:
// History:
// - '%e' = lower register part
// - '%f' = higher register part
// - '%?0' = res = [ x2 y2 | z2 v2 ]
// - '%?1' = lhs = [ x0 y0 | z0 v0 ]
// - '%?2' = rhs = [ x1 y1 | z1 v1 ]
// - '%e0' = [ x2 y2 ]
// - '%f0' = [ z2 v2 ]
// - '%e1' = [ x0 y0 ]
// - '%f1' = [ z0 v0 ]
// - '%e2' = [ x1 y1 ]
// - '%f2' = [ z1 v1 ]
// Implemented algorithm:
// |x2| |y0 * z1 - z0 * y1|
// |y2| = |z0 * x1 - x0 * z1|
// |z2| |x0 * y1 - y0 * x1|
asm (
"vext.8 d6, %e2, %f2, #4 \n\t" // e2=[ x1 y1 ], f2=[ z1 v1 ] -> d6=[ v1 x1 ]
"vext.8 d7, %e1, %f1, #4 \n\t" // e1=[ x0 y0 ], f1=[ z0 v0 ] -> d7=[ v0 x0 ]
"vmul.f32 %e0, %f1, %e2 \n\t" // f1=[ z0 v0 ], e2=[ x1 y1 ] -> e0=[ z0 * x1, v0 * y1 ]
"vmul.f32 %f0, %e1, d6 \n\t" // e1=[ x0 y0 ], d6=[ v1 x1 ] -> f0=[ x0 * v1, y0 * x1 ]
"vmls.f32 %e0, %f2, %e1 \n\t" // f2=[ z1 v1 ], e1=[ x0 y0 ] -> e0=[ z0 * x1 - z1 * x0, v0 * y1 - v1 * y0 ] = [ y2, - ]
"vmls.f32 %f0, %e2, d7 \n\t" // e2=[ x1 y1 ], d7=[ v0 x0 ] -> f0=[ x0 * v1 - x1 * v0, y0 * x1 - y1 * x0 ] = [ -, - ]
"vext.8 %e0, %f0, %e0, #4 " //
: "+w" ( result ) // Output section: 'w'='VFP floating point register', '+'='read/write'
: "w" ( lhs ), "w" ( rhs ) // Input section : 'w'='VFP floating point register'
: "d6", "d7" ); // Temporary 64[bit] register.

First of all, this is weird. result isn't initialized before the asm statement, but it's used as an input/output operand with "+w" ( result ). I think "=w" (result) would be better. It also makes no sense that this is volatile; the output is a pure function of the inputs with no side effects or dependency on any "hidden" inputs, so the same inputs will give the same result every time. Thus, omitting volatile would allow the compiler to CSE it and hoist it out of loops if possible, instead of forcing it to re-compute every time the source runs it with the same inputs.
I couldn't find any reference either; the gcc manual's Extended ASM page only documents operand modifiers for x86, not ARM.
But I think we can see the operand modifiers do from looking at the asm output:
%e0 is substituted with d16, %f0 is substituted with d17. %e1 is d18 and %f1 is d19. %2 is in d20 and d21
Your inputs are 16-byte NEON vectors, in q registers. In ARM32, the upper and lower half of each q register is separately accessible as a d register. (Unlike AArch64 where each s / d register is the bottom element of a different q reg.) It looks like this code is taking advantage of this to shuffle for free by using 64-bit SIMD on the high and low pair of floats, after doing a 4-byte vext shuffle to mix those pairs of floats.
%e[operand] is the low d register of an operand, %f[operand] is the high d register. They're not documented, but the gcc source code says (in arm_print_operand in gcc/config/arm/arm.c#L22486:
These two codes print the low/high doubleword register of a Neon quad
register, respectively. For pair-structure types, can also print
low/high quadword registers.
I didn't test what happens if you apply these modifiers to 64-bit operands like float32x2_t, and this is all just me reverse-engineering from one example. But it makes perfect sense that there would be modifiers for this.
x86 modifiers include one for the low and high 8 bits of integer registers (so you can get AL / AH if your input as in EAX), so partial-register stuff is definitely something that GNU C inline asm operand modifiers can do.
Beware that undocumented means unsupported.

I am looking for the meaning of %e0 & %f0, this topic is very helpful. The cross_test() output could be explained as follows:
#include <arm_neon.h>
#include <stdio.h>
float32x4_t cross_test(const float32x4_t& lhs, const float32x4_t& rhs) {
float32x4_t result;
// | f | e
// -----------------------------
// 1 | a3(4) a2(3) | a1(2) a0(1)
// 2 | b3(5) b2(6) | b1(7) b0(8)
asm volatile (
"vext.8 d6, %e1, %f1, #4" "\n" // a2, a1
"vext.8 d7, %e2, %f2, #4" "\n" // b2, b1
"vmul.f32 %e0, %f1, %e2" "\n" // a3*b1, a2*b0
"vmul.f32 %f0, %e1, d7" "\n" // a1*b2, a0*b1
"vmls.f32 %e0, %f2, %e1" "\n" // a3*b1-a1*b3(18), a2*b0-a0*b2(18)
"vmls.f32 %f0, %e2, d6" "\n" // a1*b2-a2*b1(-9), a0*b1-a1*b0(-9)
"vext.8 %e0, %f0, %e0, #4" "\n" // a2*b0-a0*b2(18), a1*b2-a2*b1(-9)
: "+w"(result) // %0
: "w"(lhs), // %1
"w"(rhs) // %2
: "d6", "d7"
);
return result;
}
#define nforeach(i, count) \
for (int i = 0, __count = static_cast<int>(count); i < __count; ++i)
#define dump_f128(qf) do { \
float *fp = reinterpret_cast<float *>(&qf); \
puts(#qf ":"); \
nforeach(i, 4) { \
printf("[%d]%f\n", i, fp[i]); \
} \
} while (0)
int main() {
float fa[] = {1., 2., 3., 4.};
float fb[] = {8., 7., 6., 5.};
float32x4_t qa, qb, qres;
qa = vld1q_f32(const_cast<const float *>(&fa[0]));
qb = vld1q_f32(const_cast<const float *>(&fb[0]));
qres = cross_test(qa, qb);
dump_f128(qa);
puts("---");
dump_f128(qb);
puts("---");
// -9, 18, -9, -9
dump_f128(qres);
return 0;
}

Related

Making macro expanded when Debug GNU assembly program using GDB

I write a program using assembly language and compile it by GCC as following:
gcc -g sha256-avx.S main256.c -o test.exe
Now, I need to debug it. But I don't know how to the content of macro. The following is my method about using GDB.
(gdb) s
400 vmovdqa XFER, _XFER(%rsp)
(gdb) s
401 FOUR_ROUNDS_AND_SCHED
(gdb) s
403 vpaddd 1*16(TBL), X0, XFER
(gdb) s
404 vmovdqa XFER, _XFER(%rsp)
(gdb) s
405 FOUR_ROUNDS_AND_SCHED
FOUR_ROUNDS_AND_SCHED is a macro defined by myself.
.macro FOUR_ROUNDS_AND_SCHED
## compute s0 four at a time and s1 two at a time
## compute W[-16] + W[-7] 4 at a time
mov e, y0 # y0 = e
MY_ROR (25-11), y0 # y0 = e >> (25-11)
mov a, y1 # y1 = a
vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
MY_ROR (22-13), y1 # y1 = a >> (22-13)
xor e, y0 # y0 = e ^ (e >> (25-11))
mov f, y2 # y2 = f
MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
xor a, y1 # y1 = a ^ (a >> (22-13)
// ...
.endm
In fact, My_ROR is also a macro defined in my program.
I want to watch the every instruction in macro(FOUR_ROUNDS_AND_SCHED and My_ROR) when debug program step-by-step using GDB. How should I compile my program?

How should I compile my program?
You can compile the program as you do now, but use x/4i $pc to see actual instructions, and stepi to single step instructions instead of lines.

Try combining disp/i $pc with stepi?

Algorithm to mix colours on 7 individual pieces of toy

I'm a woodworker trying to seek some math and algorithm help on the expertise here.
I'm trying to make 28 sets of Tangram for gifting relatives, like this:
DanielHolth + RobotE at nl:Wp [CC BY-SA 3.0 (http://creativecommons.org/licenses/by-sa/3.0/)], from Wikimedia Commons
The toys consists of 7 pieces of wood board that should be painted to individual colours. To make the painting easier, I think the best way is to paint them in batches of 4 sets in the same colour, then mix them.
I have labeled the pieces 1-7 to make discussion easier:
What is the most efficient way to mix the pieces so I do not get identical colour combination per set? I would like the gift to be as individual as possible and colour combination is a good way to achieve this goal.
Edit: Each set of puzzle is made of seven pieces each of a different colour.

Order your colors in some fashion (say R -> G -> B -> Y -> P -> O -> W), and then order your pieces similarly (which you've already done in your picture, 1-7). Lay them out in in a matrix, with each color on a separate row (repeating the columns/pieces, since there's going to be 4 duplicates each). Let B3 denote a blue 3 piece, O7- orange 7, etc.
1 2 3 4 5 6 7 1 2 3 4 5 6 7 1 2 3 4 5 6 7 1 2 3 4 5 6 7
(R) R1 R2 R3 R4 R5 R6 R7 R1 R2 R3 R4 R5 R6 R7 R1 R2 R3 R4 R5 R6 R7 R1 R2 R3 R4 R5 R6 R7
(G) G1 G2 G3 G4 G5 G6 G7 G1 G2 G3 G4 G5 G6 G7 G1 G2 G3 G4 G5 G6 G7 G1 G2 G3 G4 G5 G6 G7
(G) B1 B2 B3 B4 B5 B6 B7 B1 B2 B3 B4 B5 B6 B7 B1 B2 B3 B4 B5 B6 B7 B1 B2 B3 B4 B5 B6 B7
(Y) Y1 Y2 Y3 Y4 Y5 Y6 Y7 Y1 Y2 Y3 Y4 Y5 Y6 Y7 Y1 Y2 Y3 Y4 Y5 Y6 Y7 Y1 Y2 Y3 Y4 Y5 Y6 Y7
(P) P1 P2 P3 P4 P5 P6 P7 P1 P2 P3 P4 P5 P6 P7 P1 P2 P3 P4 P5 P6 P7 P1 P2 P3 P4 P5 P6 P7
(O) O1 O2 O3 O4 O5 O6 O7 O1 O2 O3 O4 O5 O6 O7 O1 O2 O3 O4 O5 O6 O7 O1 O2 O3 O4 O5 O6 O7
(W) W1 W2 W3 W4 W5 W6 W7 W1 W2 W3 W4 W5 W6 W7 W1 W2 W3 W4 W5 W6 W7 W1 W2 W3 W4 W5 W6 W7
Now, take out the lower left "triangle" of pieces. That is- remove 0 pieces from the beginning of the first row, 1 from the second, 2 from the third...
1 2 3 4 5 6 7 1 2 3 4 5 6 7 1 2 3 4 5 6 7 1 2 3 4 5 6 7
(R) R1 R2 R3 R4 R5 R6 R7 R1 R2 R3 R4 R5 R6 R7 R1 R2 R3 R4 R5 R6 R7 R1 R2 R3 R4 R5 R6 R7
(G) G2 G3 G4 G5 G6 G7 G1 G2 G3 G4 G5 G6 G7 G1 G2 G3 G4 G5 G6 G7 G1 G2 G3 G4 G5 G6 G7
(B) B3 B4 B5 B6 B7 B1 B2 B3 B4 B5 B6 B7 B1 B2 B3 B4 B5 B6 B7 B1 B2 B3 B4 B5 B6 B7
(Y) Y4 Y5 Y6 Y7 Y1 Y2 Y3 Y4 Y5 Y6 Y7 Y1 Y2 Y3 Y4 Y5 Y6 Y7 Y1 Y2 Y3 Y4 Y5 Y6 Y7
(P) P5 P6 P7 P1 P2 P3 P4 P5 P6 P7 P1 P2 P3 P4 P5 P6 P7 P1 P2 P3 P4 P5 P6 P7
(O) O6 O7 O1 O2 O3 O4 O5 O6 O7 O1 O2 O3 O4 O5 O6 O7 O1 O2 O3 O4 O5 O6 O7
(W) W7 W1 W2 W3 W4 W5 W6 W7 W1 W2 W3 W4 W5 W6 W7 W1 W2 W3 W4 W5 W6 W7
Then place these extras at the ends of their corresponding rows. Now, just take the first piece from each row, and make a new set. You can do this 7 times before it will repeat itself. For clarity, in the above example your first set would be R1 G2 B3 Y4 P5 O6 W7, the second set R2 G3 B4 Y5 P6 O7 W1.
After that, repeat the process again- remove 0 from the first row, 1 from the second, etc. Again, move the extras to the ends of their rows, and draw 7 new sets from the first elements of each row. Repeat this process twice more for your final two batches of 7 sets each. Each set will be unique.

I've already posted an answer that attempts to solve the problem as simply as possible, but I felt it was appropriate to provide a solution that also attempts to maximize uniqueness. Another answer already covers the basics of this, but does not account for color pairs created from identical puzzle pieces, so I've attempted to do so here.
This solver isn't the fastest, but guarantees that there will be no more than two identically colored pairs of pieces between any two sets. When ran without shuffling, there is a large bias towards certain colors taking particular pieces, so I provide an argument to shuffle the intermediate arrays to eliminate this bias, at the cost of fewer generated sets (potentially less than 28 - if so, run again). The program will spit out all sets found that satisfy the above criteria, so you can manually pick whichever 28 seems the most "random" or "uniform" to the human eye.
from itertools import combinations, permutations
from random import shuffle
def get_subsets(color_set):
subsets = []
for d in ({}, {'1':'5'}, {'4':'6'}, {'1':'5', '4':'6'}):
tr = lambda s: str.translate(s, str.maketrans(d))
subsets.extend(set(tr(y) for y in x) for x in combinations(color_set, 3))
return subsets
def make_sets(do_random=True):
color_sets = [set(c+str(i) for i, c in enumerate(perm)) for perm in permutations("RGBYPOW")]
results, pairs = [], []
while color_sets:
results.append(color_sets[0])
pairs.extend(get_subsets(color_sets[0]))
color_sets = [x for x in color_sets if all(y - x for y in pairs)]
if do_random: shuffle(color_sets)
results = sorted(sorted(perm, key=lambda x:x[1]) for perm in results)
print("\n".join(map(str, results)))
print(len(results))
if __name__ == "__main__":
make_sets()
Example output:
['B0', 'G1', 'O2', 'W3', 'P4', 'R5', 'Y6']
['B0', 'P1', 'W2', 'Y3', 'O4', 'G5', 'R6']
['B0', 'R1', 'W2', 'O3', 'G4', 'P5', 'Y6']
['B0', 'R1', 'Y2', 'P3', 'W4', 'O5', 'G6']
['B0', 'W1', 'R2', 'G3', 'O4', 'Y5', 'P6']
['G0', 'B1', 'O2', 'P3', 'R4', 'W5', 'Y6']
['G0', 'B1', 'R2', 'W3', 'Y4', 'O5', 'P6']
['G0', 'O1', 'P2', 'B3', 'W4', 'R5', 'Y6']
['G0', 'O1', 'Y2', 'R3', 'B4', 'W5', 'P6']
['G0', 'P1', 'O2', 'Y3', 'B4', 'R5', 'W6']
['G0', 'W1', 'P2', 'O3', 'R4', 'Y5', 'B6']
['O0', 'B1', 'Y2', 'W3', 'R4', 'P5', 'G6']
['O0', 'G1', 'R2', 'Y3', 'W4', 'P5', 'B6']
['O0', 'P1', 'G2', 'R3', 'Y4', 'B5', 'W6']
['O0', 'R1', 'B2', 'G3', 'P4', 'W5', 'Y6']
['P0', 'B1', 'R2', 'O3', 'W4', 'Y5', 'G6']
['P0', 'R1', 'G2', 'W3', 'B4', 'Y5', 'O6']
['P0', 'W1', 'B2', 'Y3', 'O4', 'R5', 'G6']
['P0', 'W1', 'G2', 'B3', 'Y4', 'O5', 'R6']
['R0', 'G1', 'B2', 'Y3', 'P4', 'O5', 'W6']
['R0', 'O1', 'P2', 'Y3', 'G4', 'W5', 'B6']
['R0', 'Y1', 'W2', 'P3', 'G4', 'B5', 'O6']
['W0', 'G1', 'B2', 'P3', 'R4', 'Y5', 'O6']
['W0', 'O1', 'P2', 'G3', 'Y4', 'B5', 'R6']
['W0', 'R1', 'Y2', 'G3', 'O4', 'P5', 'B6']
['W0', 'Y1', 'G2', 'O3', 'B4', 'P5', 'R6']
['W0', 'Y1', 'O2', 'R3', 'P4', 'G5', 'B6']
['Y0', 'B1', 'P2', 'R3', 'W4', 'G5', 'O6']
['Y0', 'G1', 'W2', 'O3', 'B4', 'R5', 'P6']
['Y0', 'O1', 'B2', 'G3', 'R4', 'P5', 'W6']
['Y0', 'P1', 'R2', 'B3', 'G4', 'W5', 'O6']
31

For fun, here's some code that attempts to minimise, without an exhaustive search, the number of sequential colour pairs that can coexist in the same position in an attempt to respond to your request for "as individual as possible." It has an element of randomness. Sometimes it can produce a triple sequence duplicate but at it's best, we get only pair duplicates. (Maybe the chance similarities recipients would find between their gifts would be part of the beauty.)
(Dillon Davis commented that it can produce identical pairs for positions 1 & 5 or 4 & 6, which appear to be prominent similar triangles in the design. I may make a change to it a little later to prioritise avoiding those duplicates.)
let cs = ['R', 'G', 'B', 'Y', 'P', 'O', 'W'];
let pairs = [];
for (let i=0; i<6; i++)
for (let j=i+1; j<7; j++)
pairs.push(cs[i] + cs[j], cs[j] + cs[i]);
let positionMatches = [];
const results = pairs.slice(0, 28);
// Build combinations
for (let i=0; i<5; i++){
// Avoid repeating pairs
// in the same position
let set = new Set();
for (let j=0; j<28; j++){
const last = results[j].substr(-1);
let found = false;
for (let c of cs){
const candidate = last + c;
// Match found
if (!set.has(candidate) && !results[j].includes(c)){
results[j] += c;
set.add(candidate);
found = true;
break;
}
}
// Match not found
// Lower the restriction
// and insert random match
if (!found){
const cs_ = cs.filter(
c => !results[j].includes(c));
const c = cs_[
~~(Math.random()*cs_.length)];
results[j] += c;
positionMatches.push((i+2) + ':' + last + c);
}
}
}
console.log(results.join('\n'));
console.log('');
for (let p of positionMatches){
const [pos, pair] = p.split(':');
console.log(pair + ' duplicate at position ' + pos)
}
UPDATE
Here's a solver with much more random assignment than the one above, which is more sequential and therefore predictable. We can set pairs we'd like to "unmatch" in the unmatch map, and control how much more we'd like to try random candidates when examining the specially chosen unmatched pairs or other pairs (we might want to give more weight to the former to let them try more random candidates). One result that seemed pretty good as I was playing around is listed below (it was achieved with the same 50/50 random setting). Click "Run snippet" for a different result each time!
const unmatch = {
// Try to avoid duplicate pairs
// at indexes (0, 4) and (3, 5)
4: 0,
5: 3
};
const unmatchTrials = 50;
const regularTrials = 50;
let cs = ['R', 'G', 'B', 'Y', 'P', 'O', 'W'];
let pairs = [];
for (let i=0; i<6; i++)
for (let j=i+1; j<7; j++)
pairs.push(cs[i] + cs[j], cs[j] + cs[i]);
let positionMatches = [];
const results = pairs.slice(0, 28);
// Build combinations
for (let i=0; i<5; i++){
// Avoid repeating pairs in the same position,
// as well as in custom positions
let set = new Set();
let unmatchS = new Set();
for (let j=0; j<28; j++){
const last = results[j].substr(-1);
let found = false;
const ri = i + 2;
let count = unmatch.hasOwnProperty(ri) ? unmatchTrials : regularTrials;
while (!found && --count > 0){
const ii = ~~(Math.random() * cs.length);
const c = cs[ii];
const candidate = last + c;
let u = unmatch.hasOwnProperty(ri)
? unmatchS.has(results[j][unmatch[ri]] + c)
: false;
// Match found
if (!set.has(candidate) && !results[j].includes(c) && !u){
results[j] += c;
set.add(candidate);
if (unmatch.hasOwnProperty(ri))
unmatchS.add(results[j][unmatch[ri]] + c)
found = true;
}
}
// Match not found
// Lower the restriction
// and insert random match
if (!found){
const cs_ = cs.filter(
c => !results[j].includes(c));
const c = cs_[
~~(Math.random()*cs_.length)];
results[j] += c;
positionMatches.push((i+2) + ':' + last + c);
}
}
}
console.log(results.join('\n'));
console.log('');
for (let p of positionMatches){
const [pos, pair] = p.split(':');
console.log(pair + ' duplicate at position ' + pos)
}
let m04 = new Set();
let m35 = new Set();
for (let r of results){
const c04 = r[0] + r[4];
const c35 = r[3] + r[5];
if (m04.has(c04))
console.log('15 match: ' + c04);
m04.add(c04);
if (m35.has(c35))
console.log('46 match: ' + c35);
m35.add(c35);
}
(The output below seemed pretty good. Dillon Davis noticed a pair of tangrams there that share a sequence "POW." Those could possibly be for two people who may or may not yet know that they share a special connection. (We could also, you know, just tweak one of them manually :)
RGWBYOP
GROBPYW
RBPWOYG
BRWYOGP
RYWPGOB
YRPBGWO
RPBYWOG
PRYGWBO
ROBWPGY
ORGYPBW
RWGOBYP
WRBOPGY
GBOWYRP
BGOYRWP
GYRWBPO
YGROWPB
GPWORBY
PGYBRWO
GOYPWRB
OGPYBRW
GWPROBY
WGBRYPO
BYGPOWR
YBRPOWG
BPGRWYO
PBYWGOR
BORGPWY
OBWGRPY
PO duplicate at position 4
PG duplicate at position 5
RW duplicate at position 5
OW duplicate at position 5
GO duplicate at position 5
GY duplicate at position 6
WO duplicate at position 6
BY duplicate at position 6
PO duplicate at position 6
46 match: BW
15 match: BO
46 match: PW

Finding the most frequently occurring element in an SSE register

Does anyone have any thoughts on how to calculate the mode (statistic) of a vector of 8-bit integers in SSE4.x? To clarify, this would be 16x8-bit values in a 128-bit register.
I want the result as a vector mask which selects the mode-valued elements. i.e. the result of _mm_cmpeq_epi8(v, set1(mode(v))), as well as the scalar value.
Providing some additional context; while the above problem is an interesting one to solve in its own right, I have been through most algorithms I can think of with linear complexity. This class will wipe out any gains I can get from calculating this number.
I hope to engage you all in searching for some deep magic, here. It's possible that an approximation may be necessary to break this bound, such as "select a frequently occurring element" for example (N.B. difference against the most), which would be of merit. A probabilistic answer would be usable, too.
SSE and x86 have some very interesting semantics. It may be worth exploring a superoptimization pass.

Probably a relatively simple brute force SSEx approach is suitable here, see the code below.
The idea is to byte-rotate the input vector v by 1 to 15 positions and compare the rotated vector
with the original v for equality. To shorten the dependency chain and to increase the
instruction level parallelism, two counters are used to count (vertical sum) these equal elements:
sum1 and sum2, because there might be architectures that benefit from this.
Equal elements are counted as -1. Variable sum = sum1 + sum2 contains the total count with values
between -1 and -16. min_brc contains the horizontal minimum of sum broadcasted to all elements.
mask = _mm_cmpeq_epi8(sum,min_brc) is the mask for the mode-valued elements requested as an
intermediate result by the OP. In the next few lines of the code the actual mode is extracted.
This solution is certainly faster than a scalar solution.
Note that with AVX2 the upper 128-bit lanes can be used to speedup the computation further.
It takes 20 cycles (throughput) to compute only the a mask for the mode-valued elements. With the actual mode broadcasted
across the SSE register it takes about 21.4 cycles.
Note the behaviour in the next example:
[1, 1, 3, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] returns mask=[-1,-1,-1,-1,0,0,...,0]
and the mode value is 1, although 1 occurs as often as 3.
The code below is tested, but not thoroughly tested
#include <stdio.h>
#include <x86intrin.h>
/* gcc -O3 -Wall -m64 -march=nehalem mode_uint8.c */
int print_vec_char(__m128i x);
__m128i mode_statistic(__m128i v){
__m128i sum2 = _mm_set1_epi8(-1); /* Each integer occurs at least one time */
__m128i v_rot1 = _mm_alignr_epi8(v,v,1);
__m128i v_rot2 = _mm_alignr_epi8(v,v,2);
__m128i sum1 = _mm_cmpeq_epi8(v,v_rot1);
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot2));
__m128i v_rot3 = _mm_alignr_epi8(v,v,3);
__m128i v_rot4 = _mm_alignr_epi8(v,v,4);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot3));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot4));
__m128i v_rot5 = _mm_alignr_epi8(v,v,5);
__m128i v_rot6 = _mm_alignr_epi8(v,v,6);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot5));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot6));
__m128i v_rot7 = _mm_alignr_epi8(v,v,7);
__m128i v_rot8 = _mm_alignr_epi8(v,v,8);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot7));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot8));
__m128i v_rot9 = _mm_alignr_epi8(v,v,9);
__m128i v_rot10 = _mm_alignr_epi8(v,v,10);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot9));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot10));
__m128i v_rot11 = _mm_alignr_epi8(v,v,11);
__m128i v_rot12 = _mm_alignr_epi8(v,v,12);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot11));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot12));
__m128i v_rot13 = _mm_alignr_epi8(v,v,13);
__m128i v_rot14 = _mm_alignr_epi8(v,v,14);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot13));
sum2 = _mm_add_epi8(sum2,_mm_cmpeq_epi8(v,v_rot14));
__m128i v_rot15 = _mm_alignr_epi8(v,v,15);
sum1 = _mm_add_epi8(sum1,_mm_cmpeq_epi8(v,v_rot15));
__m128i sum = _mm_add_epi8(sum1,sum2); /* Sum contains values such as -1, -2 ,...,-16 */
/* The next three instructions compute the horizontal minimum of sum */
__m128i sum_shft = _mm_srli_epi16(sum,8); /* Shift right 8 bits, while shifting in zeros */
__m128i min1 = _mm_min_epu8(sum,sum_shft); /* sum and sum_shuft are considered as unsigned integers. sum_shft is zero at the odd positions and so is min1 */
__m128i min2 = _mm_minpos_epu16(min1); /* Byte 0 within min2 contains the horizontal minimum of sum */
__m128i min_brc = _mm_shuffle_epi8(min2,_mm_setzero_si128()); /* Broadcast horizontal minimum */
__m128i mask = _mm_cmpeq_epi8(sum,min_brc); /* Mask = -1 at the byte positions where the value of v is equal to the mode of v */
/* comment next 4 lines out if there is no need to broadcast the mode value */
int bitmask = _mm_movemask_epi8(mask);
int indx = __builtin_ctz(bitmask); /* Index of mode */
__m128i v_indx = _mm_set1_epi8(indx); /* Broadcast indx */
__m128i answer = _mm_shuffle_epi8(v,v_indx); /* Broadcast mode to each element of answer */
/* Uncomment lines below to print intermediate results, to see how it works. */
// printf("sum = ");print_vec_char (sum );
// printf("sum_shft = ");print_vec_char (sum_shft );
// printf("min1 = ");print_vec_char (min1 );
// printf("min2 = ");print_vec_char (min2 );
// printf("min_brc = ");print_vec_char (min_brc );
// printf("mask = ");print_vec_char (mask );
// printf("v_indx = ");print_vec_char (v_indx );
// printf("answer = ");print_vec_char (answer );
return answer; /* or return mask, or return both .... :) */
}
int main() {
/* To test throughput set throughput_test to 1, otherwise 0 */
/* Use e.g. perf stat -d ./a.out to test throughput */
#define throughput_test 0
/* Different test vectors */
int i;
char x1[16] = {5, 2, 2, 7, 21, 4, 7, 7, 3, 9, 2, 5, 4, 3, 5, 5};
char x2[16] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
char x3[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
char x4[16] = {1, 2, 3, 2, 1, 6, 7, 8, 2, 2, 2, 3, 3, 2, 15, 16};
char x5[16] = {1, 1, 3, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
printf("\n15...0 = 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0\n\n");
__m128i x_vec = _mm_loadu_si128((__m128i*)x1);
printf("x_vec = ");print_vec_char(x_vec );
__m128i y = mode_statistic (x_vec);
printf("answer = ");print_vec_char(y );
#if throughput_test == 1
__m128i x_vec1 = _mm_loadu_si128((__m128i*)x1);
__m128i x_vec2 = _mm_loadu_si128((__m128i*)x2);
__m128i x_vec3 = _mm_loadu_si128((__m128i*)x3);
__m128i x_vec4 = _mm_loadu_si128((__m128i*)x4);
__m128i x_vec5 = _mm_loadu_si128((__m128i*)x5);
__m128i y1, y2, y3, y4, y5;
__asm__ __volatile__ ( "vzeroupper" : : : ); /* Remove this line on non-AVX processors */
for (i=0;i<100000000;i++){
y1 = mode_statistic (x_vec1);
y2 = mode_statistic (x_vec2);
y3 = mode_statistic (x_vec3);
y4 = mode_statistic (x_vec4);
y5 = mode_statistic (x_vec5);
x_vec1 = mode_statistic (y1 );
x_vec2 = mode_statistic (y2 );
x_vec3 = mode_statistic (y3 );
x_vec4 = mode_statistic (y4 );
x_vec5 = mode_statistic (y5 );
}
printf("mask mode = ");print_vec_char(y1 );
printf("mask mode = ");print_vec_char(y2 );
printf("mask mode = ");print_vec_char(y3 );
printf("mask mode = ");print_vec_char(y4 );
printf("mask mode = ");print_vec_char(y5 );
#endif
return 0;
}
int print_vec_char(__m128i x){
char v[16];
_mm_storeu_si128((__m128i *)v,x);
printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi\n",
v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);
return 0;
}
Output:
15...0 = 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
x_vec = 5 5 3 4 | 5 2 9 3 | 7 7 4 21 | 7 2 2 5
sum = -4 -4 -2 -2 | -4 -3 -1 -2 | -3 -3 -2 -1 | -3 -3 -3 -4
min_brc = -4 -4 -4 -4 | -4 -4 -4 -4 | -4 -4 -4 -4 | -4 -4 -4 -4
mask = -1 -1 0 0 | -1 0 0 0 | 0 0 0 0 | 0 0 0 -1
answer = 5 5 5 5 | 5 5 5 5 | 5 5 5 5 | 5 5 5 5
The horizontal minimum is computed with Evgeny Kluev's method.

Sort the data in the register.
Insertion sort can be done in 16 (15) steps, by initializing the register to "Infinity", which tries to illustrate a monotonically decreasing array and inserting the new element in parallel to all possible places:
// e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 78
__m128i sorted = _mm_or_si128(my_array, const_FFFFF00);
for (int i = 1; i < 16; ++i)
{
// Trying to insert e.g. A0, we must shift all the FF's to left
// e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF 78 00
__m128i shifted = _mm_bslli_si128(sorted, 1);
// Taking the MAX of shifted and 'A0 on all places'
// e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF A0 A0
shifted = _mm_max_epu8(shifted, _mm_set1_epi8(my_array[i]));
// and minimum of the shifted + original --
// e.g. FF FF FF FF FF FF FF FF FF FF FF FF FF FF A0 78
sorted = _mm_min_epu8(sorted, shifted);
}
Then calculate mask for vec[n+1] == vec[n], move mask to GPR and use that to index a 32768 entry LUT for best index location.
In real case one probably want's to sort more than just one vector; i.e. sort 16 16-entry vectors at once;
__m128i input[16]; // not 1, but 16 vectors
transpose16x16(input); // inplace vector transpose
sort(transpose); // 60-stage network exists for 16 inputs
// linear search -- result in 'mode'
__m128i mode = input[0];
__m128i previous = mode;
__m128i count = _mm_set_epi8(0);
__m128i max_count = _mm_setzero_si128(0);
for (int i = 1; i < 16; i++)
{
__m128i &current = input[i];
// histogram count is off by one
// if (current == previous) count++;
// else count = 0;
// if (count > max_count)
// mode = current, max_count = count
prev = _mm_cmpeq_epi8(prev, current);
count = _mm_and_si128(_mm_sub_epi8(count, prev), prev);
__m128i max_so_far = _mm_cmplt_epi8(max_count, count);
mode = _mm_blendv_epi8(mode, current, max_so_far);
max_count = _mm_max_epi8(max_count, count);
previous = current;
}
The inner loop totals amortized cost of 7-8 instructions per result;
Sorting has typically 2 instructions per stage -- i.e. 8 instructions per result, when 16 results need 60 stages or 120 instructions.
(This still leaves the transpose as an exercise -- but I think it should be vastly faster than sorting?)
So, this should be in the ball park of 24 instructions per 8-bit result.

For performance comparison with scalar code. Non-vectorized on main part but vectorized on table-clear and tmp initialization. (168 cycles per f() call for fx8150 (22M calls complete in 1.0002 seconds at 3.7 GHz))
#include <x86intrin.h>
unsigned char tmp[16]; // extracted values are here (single instruction, store_ps)
unsigned char table[256]; // counter table containing zeroes
char f(__m128i values)
{
_mm_store_si128((__m128i *)tmp,values);
int maxOccurence=0;
int currentValue=0;
for(int i=0;i<16;i++)
{
unsigned char ind=tmp[i];
unsigned char t=table[ind];
t++;
if(t>maxOccurence)
{
maxOccurence=t;
currentValue=ind;
}
table[ind]=t;
}
for(int i=0;i<256;i++)
table[i]=0;
return currentValue;
}
g++ 6.3 output:
f: # #f
movaps %xmm0, tmp(%rip)
movaps %xmm0, -24(%rsp)
xorl %r8d, %r8d
movq $-15, %rdx
movb -24(%rsp), %sil
xorl %eax, %eax
jmp .LBB0_1
.LBB0_2: # %._crit_edge
cmpl %r8d, %esi
cmovgel %esi, %r8d
movb tmp+16(%rdx), %sil
incq %rdx
.LBB0_1: # =>This Inner Loop Header: Depth=1
movzbl %sil, %edi
movb table(%rdi), %cl
incb %cl
movzbl %cl, %esi
cmpl %r8d, %esi
cmovgl %edi, %eax
movb %sil, table(%rdi)
testq %rdx, %rdx
jne .LBB0_2
xorps %xmm0, %xmm0
movaps %xmm0, table+240(%rip)
movaps %xmm0, table+224(%rip)
movaps %xmm0, table+208(%rip)
movaps %xmm0, table+192(%rip)
movaps %xmm0, table+176(%rip)
movaps %xmm0, table+160(%rip)
movaps %xmm0, table+144(%rip)
movaps %xmm0, table+128(%rip)
movaps %xmm0, table+112(%rip)
movaps %xmm0, table+96(%rip)
movaps %xmm0, table+80(%rip)
movaps %xmm0, table+64(%rip)
movaps %xmm0, table+48(%rip)
movaps %xmm0, table+32(%rip)
movaps %xmm0, table+16(%rip)
movaps %xmm0, table(%rip)
movsbl %al, %eax
ret

subtracting two 8 bits integer bit by bit in assembly x86

so I'm trying to implement this algorithm to calculate the difference of two 8 bits integers
b = 0
difference = 0
for i = 0 to (n-1)
x = bit i of X
y = bit i of Y
bit i of difference = x xor y xor b
b = ((not x) and y) or ((not x) and b) or (y and b)
end for loop
this is what i did
calculation:
mov ebx, 0
mov diff, 0
mov ecx, 7
subtract:
mov al, X
and al, 1h ; find bit i of X
mov dl, Y
and dl, 1h ; find bit i of Y
mov ah, al
mov dh, al
xor al, dl
xor al, bl
mov diff, al ; find bit i of the difference
; calculate b value for the next interation
not ah
and ah, dl
not dh
and dh, dl
and dl, bl
or ah, dh
or ah, dl
mov bl, ah
; rotate X and Y to get ready for the next iteration
rol X, 1
rol Y, 1
loop subtract
the problem with this code is its only work on the first iteration of the loop
so for example if I enter first number to be 2 and the second number to be 1
the when i go through the loop,first iteration, the x value would be 0 and the y value would be 1, the i bit of the difference would be 1 and b value calculated would be 1
, but this only work for the first iteration, on the next iteration, I had x = 0, y = 0 and b = 1(from the last calculation), so I wanted my diff to be 1 and my b value for this iteration to be 1, instead I got 0 for both of them.
why doesn't the code work, as i was following the algorithm, and implement accordingly.
thank in advance
and

Try a higher level language first to understand the algorithm, then port that to asm.
#include <stdio.h>
//b = 0
//difference = 0
//for i = 0 to (n-1)
//
// x = bit i of X
// y = bit i of Y
// bit i of difference = x xor y xor b
// b = ((not x) and y) or ((not x) and b) or (y and b)
//
//end for loop
int main ( void )
{
unsigned char X,Y,Z;
unsigned char x,y,z,b,bnext;
unsigned char i;
X=0Xf5; Y=0Xf1;
b=0;
Z=0;
for (i=1;i;i<<=1)
{
x=0;
y=0;
if(i&X) x=1;
if(i&Y) y=1;
z=((x^y)^b)&1;
if(z) Z|=i;
bnext = ((~x)&y) | ((~x)&b) | (y&b);
b=bnext&1;
}
printf("0x%02X 0x%02X\n",Z,X-Y);
return(0);
}
you might even re-write it a few times to approach real instructions.
z=((x^y)^b)&1;
becomes
z = x;
z = z ^ y;
z = z ^ b;
z = z & 1;

Sort points in clockwise order?

Given an array of x,y points, how do I sort the points of this array in clockwise order (around their overall average center point)? My goal is to pass the points to a line-creation function to end up with something looking rather "solid", as convex as possible with no lines intersecting.
For what it's worth, I'm using Lua, but any pseudocode would be appreciated.
Update: For reference, this is the Lua code based on Ciamej's excellent answer (ignore my "app" prefix):
function appSortPointsClockwise(points)
local centerPoint = appGetCenterPointOfPoints(points)
app.pointsCenterPoint = centerPoint
table.sort(points, appGetIsLess)
return points
end
function appGetIsLess(a, b)
local center = app.pointsCenterPoint
if a.x >= 0 and b.x < 0 then return true
elseif a.x == 0 and b.x == 0 then return a.y > b.y
end
local det = (a.x - center.x) * (b.y - center.y) - (b.x - center.x) * (a.y - center.y)
if det < 0 then return true
elseif det > 0 then return false
end
local d1 = (a.x - center.x) * (a.x - center.x) + (a.y - center.y) * (a.y - center.y)
local d2 = (b.x - center.x) * (b.x - center.x) + (b.y - center.y) * (b.y - center.y)
return d1 > d2
end
function appGetCenterPointOfPoints(points)
local pointsSum = {x = 0, y = 0}
for i = 1, #points do pointsSum.x = pointsSum.x + points[i].x; pointsSum.y = pointsSum.y + points[i].y end
return {x = pointsSum.x / #points, y = pointsSum.y / #points}
end

First, compute the center point.
Then sort the points using whatever sorting algorithm you like, but use special comparison routine to determine whether one point is less than the other.
You can check whether one point (a) is to the left or to the right of the other (b) in relation to the center by this simple calculation:
det = (a.x - center.x) * (b.y - center.y) - (b.x - center.x) * (a.y - center.y)
if the result is zero, then they are on the same line from the center, if it's positive or negative, then it is on one side or the other, so one point will precede the other.
Using it you can construct a less-than relation to compare points and determine the order in which they should appear in the sorted array. But you have to define where is the beginning of that order, I mean what angle will be the starting one (e.g. the positive half of x-axis).
The code for the comparison function can look like this:
bool less(point a, point b)
{
if (a.x - center.x >= 0 && b.x - center.x < 0)
return true;
if (a.x - center.x < 0 && b.x - center.x >= 0)
return false;
if (a.x - center.x == 0 && b.x - center.x == 0) {
if (a.y - center.y >= 0 || b.y - center.y >= 0)
return a.y > b.y;
return b.y > a.y;
}
// compute the cross product of vectors (center -> a) x (center -> b)
int det = (a.x - center.x) * (b.y - center.y) - (b.x - center.x) * (a.y - center.y);
if (det < 0)
return true;
if (det > 0)
return false;
// points a and b are on the same line from the center
// check which point is closer to the center
int d1 = (a.x - center.x) * (a.x - center.x) + (a.y - center.y) * (a.y - center.y);
int d2 = (b.x - center.x) * (b.x - center.x) + (b.y - center.y) * (b.y - center.y);
return d1 > d2;
}
This will order the points clockwise starting from the 12 o'clock. Points on the same "hour" will be ordered starting from the ones that are further from the center.
If using integer types (which are not really present in Lua) you'd have to assure that det, d1 and d2 variables are of a type that will be able to hold the result of performed calculations.
If you want to achieve something looking solid, as convex as possible, then I guess you're looking for a Convex Hull. You can compute it using the Graham Scan.
In this algorithm, you also have to sort the points clockwise (or counter-clockwise) starting from a special pivot point. Then you repeat simple loop steps each time checking if you turn left or right adding new points to the convex hull, this check is based on a cross product just like in the above comparison function.
Edit:
Added one more if statement if (a.y - center.y >= 0 || b.y - center.y >=0) to make sure that points that have x=0 and negative y are sorted starting from the ones that are further from the center. If you don't care about the order of points on the same 'hour' you can omit this if statement and always return a.y > b.y.
Corrected the first if statements with adding -center.x and -center.y.
Added the second if statement (a.x - center.x < 0 && b.x - center.x >= 0). It was an obvious oversight that it was missing. The if statements could be reorganized now because some checks are redundant. For example, if the first condition in the first if statement is false, then the first condition of the second if must be true. I decided, however, to leave the code as it is for the sake of simplicity. It's quite possible that the compiler will optimize the code and produce the same result anyway.

What you're asking for is a system known as polar coordinates. Conversion from Cartesian to polar coordinates is easily done in any language. The formulas can be found in this section.
After converting to polar coordinates, just sort by the angle, theta.

An interesting alternative approach to your problem would be to find the approximate minimum to the Traveling Salesman Problem (TSP), ie. the shortest route linking all your points. If your points form a convex shape, it should be the right solution, otherwise, it should still look good (a "solid" shape can be defined as one that has a low perimeter/area ratio, which is what we are optimizing here).
You can use any implementation of an optimizer for the TSP, of which I am pretty sure you can find a ton in your language of choice.

Another version (return true if a comes before b in counterclockwise direction):
bool lessCcw(const Vector2D &center, const Vector2D &a, const Vector2D &b) const
{
// Computes the quadrant for a and b (0-3):
// ^
// 1 | 0
// ---+-->
// 2 | 3
const int dax = ((a.x() - center.x()) > 0) ? 1 : 0;
const int day = ((a.y() - center.y()) > 0) ? 1 : 0;
const int qa = (1 - dax) + (1 - day) + ((dax & (1 - day)) << 1);
/* The previous computes the following:
const int qa =
( (a.x() > center.x())
? ((a.y() > center.y())
? 0 : 3)
: ((a.y() > center.y())
? 1 : 2)); */
const int dbx = ((b.x() - center.x()) > 0) ? 1 : 0;
const int dby = ((b.y() - center.y()) > 0) ? 1 : 0;
const int qb = (1 - dbx) + (1 - dby) + ((dbx & (1 - dby)) << 1);
if (qa == qb) {
return (b.x() - center.x()) * (a.y() - center.y()) < (b.y() - center.y()) * (a.x() - center.x());
} else {
return qa < qb;
}
}
This is faster, because the compiler (tested on Visual C++ 2015) doesn't generate jump to compute dax, day, dbx, dby. Here the output assembly from the compiler:
; 28 : const int dax = ((a.x() - center.x()) > 0) ? 1 : 0;
vmovss xmm2, DWORD PTR [ecx]
vmovss xmm0, DWORD PTR [edx]
; 29 : const int day = ((a.y() - center.y()) > 0) ? 1 : 0;
vmovss xmm1, DWORD PTR [ecx+4]
vsubss xmm4, xmm0, xmm2
vmovss xmm0, DWORD PTR [edx+4]
push ebx
xor ebx, ebx
vxorps xmm3, xmm3, xmm3
vcomiss xmm4, xmm3
vsubss xmm5, xmm0, xmm1
seta bl
xor ecx, ecx
vcomiss xmm5, xmm3
push esi
seta cl
; 30 : const int qa = (1 - dax) + (1 - day) + ((dax & (1 - day)) << 1);
mov esi, 2
push edi
mov edi, esi
; 31 :
; 32 : /* The previous computes the following:
; 33 :
; 34 : const int qa =
; 35 : ( (a.x() > center.x())
; 36 : ? ((a.y() > center.y()) ? 0 : 3)
; 37 : : ((a.y() > center.y()) ? 1 : 2));
; 38 : */
; 39 :
; 40 : const int dbx = ((b.x() - center.x()) > 0) ? 1 : 0;
xor edx, edx
lea eax, DWORD PTR [ecx+ecx]
sub edi, eax
lea eax, DWORD PTR [ebx+ebx]
and edi, eax
mov eax, DWORD PTR _b$[esp+8]
sub edi, ecx
sub edi, ebx
add edi, esi
vmovss xmm0, DWORD PTR [eax]
vsubss xmm2, xmm0, xmm2
; 41 : const int dby = ((b.y() - center.y()) > 0) ? 1 : 0;
vmovss xmm0, DWORD PTR [eax+4]
vcomiss xmm2, xmm3
vsubss xmm0, xmm0, xmm1
seta dl
xor ecx, ecx
vcomiss xmm0, xmm3
seta cl
; 42 : const int qb = (1 - dbx) + (1 - dby) + ((dbx & (1 - dby)) << 1);
lea eax, DWORD PTR [ecx+ecx]
sub esi, eax
lea eax, DWORD PTR [edx+edx]
and esi, eax
sub esi, ecx
sub esi, edx
add esi, 2
; 43 :
; 44 : if (qa == qb) {
cmp edi, esi
jne SHORT $LN37#lessCcw
; 45 : return (b.x() - center.x()) * (a.y() - center.y()) < (b.y() - center.y()) * (a.x() - center.x());
vmulss xmm1, xmm2, xmm5
vmulss xmm0, xmm0, xmm4
xor eax, eax
pop edi
vcomiss xmm0, xmm1
pop esi
seta al
pop ebx
; 46 : } else {
; 47 : return qa < qb;
; 48 : }
; 49 : }
ret 0
$LN37#lessCcw:
pop edi
pop esi
setl al
pop ebx
ret 0
?lessCcw##YA_NABVVector2D##00#Z ENDP ; lessCcw
Enjoy.

vector3 a = new vector3(1 , 0 , 0)..............w.r.t X_axis
vector3 b = any_point - Center;
- y = |a * b| , x = a . b
- Atan2(y , x)...............................gives angle between -PI to + PI in radians
- (Input % 360 + 360) % 360................to convert it from 0 to 2PI in radians
- sort by adding_points to list_of_polygon_verts by angle we got 0 to 360
Finally you get Anticlockwize sorted verts
list.Reverse()..................Clockwise_order

I know this is somewhat of an old post with an excellent accepted answer, but I feel like I can still contribute something useful. All the answers so far essentially use a comparison function to compare two points and determine their order, but what if you want to use only one point at a time and a key function?
Not only is this possible, but the resulting code is also extremely compact. Here is the complete solution using Python's built-in sorted function:
# Create some random points
num = 7
points = np.random.random((num, 2))
# Compute their center
center = np.mean(points, axis=0)
# Make arctan2 function that returns a value from [0, 2 pi) instead of [-pi, pi)
arctan2 = lambda s, c: angle if (angle := np.arctan2(s, c)) >= 0 else 2 * np.pi + angle
# Define the key function
def clockwise_around_center(point):
diff = point - center
rcos = np.dot(diff, center)
rsin = np.cross(diff, center)
return arctan2(rsin, rcos)
# Sort our points using the key function
sorted_points = sorted(points, key=clockwise_around_center)
This answer would also work in 3D, if the points are on a 2D plane embedded in 3D. We would only have to modify the calculation of rsin by dotting it with the normal vector of the plane. E.g.
rsin = np.dot([0,0,1], np.cross(diff, center))
if that plane has e_z as its normal vector.
The advantage of this code is that it works on only one point at the time using a key function. The quantity rsin, if you work it out on a coefficient level, is exactly the same as what is called det in the accepter answer, except that I compute it between point - center and center, not between point1 - center and point2 - center. But the geometrical meaning of this quantity is the radius times the sin of the angle, hence I call this variable rsin. Similarly for the dot product, which is the radius times the cosine of the angle and hence called rcos.
One could argue that this solution uses arctan2, and is therefore less clean. However, I personally think that the clearity of using a key function outweighs the need for one call to a trig function. Note that I prefer to have arctan2 return a value from [0, 2 pi), because then we get the angle 0 when point happens to be identical to center, and thus it will be the first point in our sorted list. This is an optional choice.
In order to understand why this code works, the crucial insight is that all our points are defined as arrows with respect to the origin, including the center point itself. So if we calculate point - center, this is equivalent to placing the arrow from the tip of center to the tip of point, at the origin. Hence we can sort the arrow point - center with respect to the angle it makes with the arrow pointing to center.

Here's a way to sort the vertices of a rectangle in clock-wise order. I modified the original solution provided by pyimagesearch and got rid of the scipy dependency.
import numpy as np
def pointwise_distance(pts1, pts2):
"""Calculates the distance between pairs of points
Args:
pts1 (np.ndarray): array of form [[x1, y1], [x2, y2], ...]
pts2 (np.ndarray): array of form [[x1, y1], [x2, y2], ...]
Returns:
np.array: distances between corresponding points
"""
dist = np.sqrt(np.sum((pts1 - pts2)**2, axis=1))
return dist
def order_points(pts):
"""Orders points in form [top left, top right, bottom right, bottom left].
Source: https://www.pyimagesearch.com/2016/03/21/ordering-coordinates-clockwise-with-python-and-opencv/
Args:
pts (np.ndarray): list of points of form [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
Returns:
[type]: [description]
"""
# sort the points based on their x-coordinates
x_sorted = pts[np.argsort(pts[:, 0]), :]
# grab the left-most and right-most points from the sorted
# x-roodinate points
left_most = x_sorted[:2, :]
right_most = x_sorted[2:, :]
# now, sort the left-most coordinates according to their
# y-coordinates so we can grab the top-left and bottom-left
# points, respectively
left_most = left_most[np.argsort(left_most[:, 1]), :]
tl, bl = left_most
# now that we have the top-left coordinate, use it as an
# anchor to calculate the Euclidean distance between the
# top-left and right-most points; by the Pythagorean
# theorem, the point with the largest distance will be
# our bottom-right point. Note: this is a valid assumption because
# we are dealing with rectangles only.
# We need to use this instead of just using min/max to handle the case where
# there are points that have the same x or y value.
D = pointwise_distance(np.vstack([tl, tl]), right_most)
br, tr = right_most[np.argsort(D)[::-1], :]
# return the coordinates in top-left, top-right,
# bottom-right, and bottom-left order
return np.array([tl, tr, br, bl], dtype="float32")

With numpy:
import matplotlib.pyplot as plt
import numpy as np
# List of coords
coords = np.array([7,7, 5, 0, 0, 0, 5, 10, 10, 0, 0, 5, 10, 5, 0, 10, 10, 10]).reshape(-1, 2)
centroid = np.mean(coords, axis=0)
sorted_coords = coords[np.argsort(np.arctan2(coords[:, 1] - centroid[1], coords[:, 0] - centroid[0])), :]
plt.scatter(coords[:,0],coords[:,1])
plt.plot(coords[:,0],coords[:,1])
plt.plot(sorted_coords[:,0],sorted_coords[:,1])
plt.show()

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio