Linux Kernel Atomic Operations with Memory Barriers - linux-kernel

I am trying to better understand memory barriers in the Linux Kernel. I am wondering if my implementation below would result in valid atomic operations if these threads ran in parallel. I am not focusing on a specific architecture in this implementation...I would want this to work for any architecture. Basically my implementation has a smp_mb() after any write operation and barrier() before any read operation. And if this implementation would not work, would it work for a single writer thread with one or many reader threads?
/* Shared integer that the thread functions below modify through the pointer b. */
int a = 0;
/* Points at a; this is the pointer every thread passes to the my_atomic_* macros. */
int *b = &a;
/* Receives the values read back by thread3_func and thread4_func. */
int c;
/*
 * Store val into *var, then issue a full SMP memory barrier.
 *
 * The barrier orders this store against the memory accesses that follow it;
 * it does not add any mutual exclusion between concurrent writers.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one statement
 * (safe in an unbraced if/else), and both arguments are parenthesized so that
 * expressions such as `p + 1` or `x ? y : z` expand as the caller intended.
 */
#define my_atomic_set(var, val) do { \
	*(var) = (val); \
	smp_mb(); \
} while (0)
/*
 * Replace *var with val and evaluate to the value *var held before.
 *
 * Written as a GNU statement expression so it can be used on the right-hand
 * side of an assignment (c = my_atomic_set_ret(b, 10)).
 *
 * NOTE: the load and the store below are two separate memory accesses.
 * barrier() and smp_mb() only constrain ordering; they do not stop another
 * CPU from writing *var between the two accesses, so this is not an atomic
 * exchange. The kernel's xchg() is the primitive that provides that.
 */
#define my_atomic_set_ret(var, val) ({ \
	barrier(); \
	int old_val_ = *(var); \
	*(var) = (val); \
	smp_mb(); \
	old_val_; \
})
/*
 * Add val to *var, then issue a full SMP memory barrier.
 *
 * NOTE: this is a plain load / add / store sequence. The barriers order it
 * against surrounding accesses but do not make the read-modify-write
 * indivisible: two CPUs running it concurrently can both read the same old
 * value and one of the additions is then lost. The kernel's atomic_add() on
 * an atomic_t is the primitive that provides an indivisible add.
 *
 * val is parenthesized so that an argument such as `1 << 2` is added as a
 * whole, and do { } while (0) makes the macro a single statement.
 */
#define my_atomic_add(var, val) do { \
	barrier(); \
	int tmp = *(var); \
	*(var) = tmp + (val); \
	smp_mb(); \
} while (0)
/*
 * Evaluate to the current value of *var.
 *
 * barrier() is a compiler barrier: it keeps the compiler from reusing a value
 * of *var cached before this point. Written as a GNU statement expression so
 * the macro yields a value and can be used as `c = my_atomic_get(b)`.
 */
#define my_atomic_get(var) ({ \
	barrier(); \
	*(var); \
})
/* Writer thread body: adds 1 to *b, then overwrites *b with 2. */
void thread1_func(void) {
	my_atomic_add(b, 1);
	my_atomic_set(b, 2);
}
/* Writer thread body: overwrites *b with 3, then adds 4 to *b. */
void thread2_func(void) {
	my_atomic_set(b, 3);
	my_atomic_add(b, 4);
}
/*
 * Reader/writer thread body: first copies *b into c, then replaces *b with 10
 * and stores the value *b held just before that replacement into c.
 */
void thread3_func(void) {
	c = my_atomic_get(b);
	c = my_atomic_set_ret(b, 10);
}
/* Reader thread body: copies *b into c twice in a row. */
void thread4_func(void) {
	c = my_atomic_get(b);
	c = my_atomic_get(b);
}

Related

C Macro is not expanded within Macro

I am developing an application which interacts with hardware through ioctls. I wrote some lower-level APIs for performing device operations, and I wrote some macros as wrappers, as follows.
WRITE_REGISTER(99, REGISTER_ADDRESS( 10, SCKT0_REG) );
with gcc -E,
I noticed that the inner macro is not getting expanded. Could you please help me resolve the issue?
A compilable source code snippet is given below.
#include <stdio.h>
#include <string.h>
/* Size in bytes of the reg_name and module_name buffers in struct register_struct. */
#define MAX_REG_NAME_LENGTH 32
/* Passed to STR() by REGISTER_ADDRESS_ to fill register_st.module_name. */
#define MODULE_SEQUENCER 20
/* Register offset that main passes to REGISTER_ADDRESS. */
#define SCKT0_REG 11
/*
 * Description of one device register. register_st is the single global
 * instance that the REGISTER_ADDRESS_ and WRITE_REGISTER macros write into.
 */
struct register_struct
{
unsigned int reg_addr; /* set by REGISTER_ADDRESS_ from GET_REGISTER_ADDRESS */
unsigned int reg_value; /* set by WRITE_REGISTER */
char reg_name [MAX_REG_NAME_LENGTH]; /* filled by REGISTER_ADDRESS_ from STR(register_offset) */
char module_name[MAX_REG_NAME_LENGTH] ; /* filled by REGISTER_ADDRESS_ from STR(MODULE_SEQUENCER) */
} register_st;
/*
 * Compute a register address as
 *     module_index * 2^BITS_PER_MODULE + register_offset.
 *
 * BITS_PER_MODULE is not defined in this snippet and must be supplied by the
 * including code. Both arguments are parenthesized so that expressions such
 * as `base + 1` keep their meaning, and the definition no longer ends in a
 * line-continuation backslash (which would splice the following #define into
 * this macro's replacement text).
 */
#define GET_REGISTER_ADDRESS(module_index, register_offset) \
	(((module_index) * (1 << BITS_PER_MODULE)) + (register_offset))
/*
 * Reset the global register_st, then fill in its address (via
 * GET_REGISTER_ADDRESS) and its register/module name buffers.
 *
 * The definition ends on the closing brace with no trailing backslash; a
 * backslash there would splice the next #define into this macro's body.
 *
 * NOTE(review): STR() is not defined in this snippet; the sizeof(STR(...))
 * copies presumably rely on it producing a string literal - confirm. The
 * copies are sized by that literal, not by MAX_REG_NAME_LENGTH, so a name of
 * MAX_REG_NAME_LENGTH characters or more would overflow the destination.
 */
#define REGISTER_ADDRESS_(module_index, register_offset) \
	{ \
		memset(&register_st, 0, sizeof(struct register_struct)); \
		register_st.reg_addr = GET_REGISTER_ADDRESS(module_index, register_offset); \
		memcpy(register_st.reg_name, STR(register_offset), sizeof(STR(register_offset))); \
		memcpy(register_st.module_name, STR(MODULE_SEQUENCER), sizeof(STR(MODULE_SEQUENCER))); \
	}

/* Public spelling: forwards both arguments unchanged to REGISTER_ADDRESS_. */
#define REGISTER_ADDRESS(x, y) REGISTER_ADDRESS_(x, y)
/*
 * Store register_value into the global register_st.reg_value; the macro (a
 * GNU statement expression) evaluates to the stored value.
 *
 * NOTE: register_offset does not appear anywhere in the replacement text, so
 * whatever is passed for it - including a nested REGISTER_ADDRESS(...) call -
 * is discarded by the preprocessor and never shows up in `gcc -E` output.
 *
 * The definition ends on the closing `})` with no trailing backslash; a
 * backslash there would splice the following source line into this macro.
 */
#define WRITE_REGISTER(register_value, register_offset) \
	({ \
		register_st.reg_value = (register_value); \
	})
/*
 * Minimal reproduction: invokes WRITE_REGISTER with a nested
 * REGISTER_ADDRESS(...) call as its second argument.
 */
int main(void)
{
	WRITE_REGISTER(99, REGISTER_ADDRESS( 10, SCKT0_REG) );
	return 0;
}

Looking for source code of __builtin_avr_delay_cycles called by _delay_ms in avr-gcc

I was investigating the delay_ms function of avr-gcc. In delay.h I found its definition:
/*
 * Excerpt: delay for __ms milliseconds. Only the branch that uses the
 * compiler builtin is shown; the fallback branch is elided ("...").
 */
void _delay_ms(double __ms)
{
double __tmp ;
/* Builtin path: taken only when all of the conditions below hold. */
#if __HAS_DELAY_CYCLES && defined(__OPTIMIZE__) && \
!defined(__DELAY_BACKWARD_COMPATIBLE__) && \
__STDC_HOSTED__
uint32_t __ticks_dc;
extern void __builtin_avr_delay_cycles(unsigned long);
/* Convert milliseconds to a cycle count (F_CPU is presumably the clock in Hz). */
__tmp = ((F_CPU) / 1e3) * __ms;
/* Convert the fractional cycle count to an integer using the selected rounding mode. */
#if defined(__DELAY_ROUND_DOWN__)
__ticks_dc = (uint32_t)fabs(__tmp);
#elif defined(__DELAY_ROUND_CLOSEST__)
__ticks_dc = (uint32_t)(fabs(__tmp)+0.5);
#else
//round up by default
__ticks_dc = (uint32_t)(ceil(fabs(__tmp)));
#endif
/* Hand the cycle count to the compiler builtin. */
__builtin_avr_delay_cycles(__ticks_dc);
#else
...
}
I am interested in what the __builtin_avr_delay_cycles function looks like internally and where it is defined. Where can I find the source?
As said in my comment to this very question on electronics.SE:
Compiler builtins are always kind of tricky to find, because they are not just C functions but things that get inserted while parsing/compiling the code (at various levels of abstraction from the textual representation of the code itself; this is compiler-theory territory). What you're looking for is the function avr_expand_builtin in the GCC source tree. There's a case AVR_BUILTIN_DELAY_CYCLES in there. Look at what happens there.
Which is:
/* Implement `TARGET_EXPAND_BUILTIN'. */
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
SUBTARGET may be used as the target for computing one of EXP's operands.
IGNORE is nonzero if the value is to be ignored. */
/* Excerpt: only the NOP and DELAY_CYCLES cases are shown; the rest of the
switch is elided. */
static rtx
avr_expand_builtin (tree exp, rtx target,
rtx subtarget ATTRIBUTE_UNUSED,
machine_mode mode ATTRIBUTE_UNUSED,
int ignore)
{
/* Recover the called builtin's declaration, its name (used in diagnostics)
and its numeric function code. */
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
const char *bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
unsigned int id = DECL_FUNCTION_CODE (fndecl);
/* Descriptor for this builtin; not used in the portion shown here. */
const struct avr_builtin_description *d = &avr_bdesc[id];
tree arg0;
rtx op0;
gcc_assert (id < AVR_BUILTIN_COUNT);
switch (id)
{
case AVR_BUILTIN_NOP:
emit_insn (gen_nopv (GEN_INT (1)));
return 0;
case AVR_BUILTIN_DELAY_CYCLES:
{
/* Expand the single argument; it has to come out as a constant integer,
otherwise an error is reported and no delay code is emitted. */
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
if (!CONST_INT_P (op0))
error ("%s expects a compile time integer constant", bname);
else
avr_expand_delay_cycles (op0);
return NULL_RTX;
}
…
thus, the function you're looking for is avr_expand_delay_cycles in the same file:
/* Emit instructions that burn the constant number of cycles given by
OPERANDS0. The count is consumed by a cascade of progressively smaller
delay-loop patterns; each tier subtracts what it used, so a later tier
handles the remainder left by an earlier one. */
static void
avr_expand_delay_cycles (rtx operands0)
{
/* Requested cycle count, masked to the width of SImode. */
unsigned HOST_WIDE_INT cycles = UINTVAL (operands0) & GET_MODE_MASK (SImode);
unsigned HOST_WIDE_INT cycles_used;
unsigned HOST_WIDE_INT loop_count;
/* Tier 4 (SImode counter): accounted as 6 cycles per iteration plus 9 fixed. */
if (IN_RANGE (cycles, 83886082, 0xFFFFFFFF))
{
loop_count = ((cycles - 9) / 6) + 1;
cycles_used = ((loop_count - 1) * 6) + 9;
emit_insn (gen_delay_cycles_4 (gen_int_mode (loop_count, SImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
/* Tier 3: accounted as 5 cycles per iteration plus 7 fixed; the iteration
count is capped at 0xFFFFFF. */
if (IN_RANGE (cycles, 262145, 83886081))
{
loop_count = ((cycles - 7) / 5) + 1;
if (loop_count > 0xFFFFFF)
loop_count = 0xFFFFFF;
cycles_used = ((loop_count - 1) * 5) + 7;
emit_insn (gen_delay_cycles_3 (gen_int_mode (loop_count, SImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
/* Tier 2 (HImode counter): accounted as 4 cycles per iteration plus 5
fixed; the iteration count is capped at 0xFFFF. */
if (IN_RANGE (cycles, 768, 262144))
{
loop_count = ((cycles - 5) / 4) + 1;
if (loop_count > 0xFFFF)
loop_count = 0xFFFF;
cycles_used = ((loop_count - 1) * 4) + 5;
emit_insn (gen_delay_cycles_2 (gen_int_mode (loop_count, HImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
/* Tier 1 (QImode counter): accounted as 3 cycles per iteration; the
iteration count is capped at 255. */
if (IN_RANGE (cycles, 6, 767))
{
loop_count = cycles / 3;
if (loop_count > 255)
loop_count = 255;
cycles_used = loop_count * 3;
emit_insn (gen_delay_cycles_1 (gen_int_mode (loop_count, QImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
/* Pad out whatever is left: nopv(2) patterns while at least two cycles
remain, then a single nopv(1) for an odd leftover cycle. */
while (cycles >= 2)
{
emit_insn (gen_nopv (GEN_INT (2)));
cycles -= 2;
}
if (cycles == 1)
{
emit_insn (gen_nopv (GEN_INT (1)));
cycles--;
}
}
Of biggest interest here is that this modifies a node in the Abstract Syntax Tree, and emits instructions there.

No IP_ORIGDSTADDR header

I'm trying to get the original destination of a UDP packet using IP_ORIGDSTADDR. On kernels older than mine it seems to work (I'm running the current Debian testing kernel, 4.7.0-1-amd64).
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#define DEFAULT_ADDR "127.0.0.1"
#define DEFAULT_PORT 6666
/*
 * Bind a UDP socket (address from av[1], port from av[2]; defaults are
 * DEFAULT_ADDR and DEFAULT_PORT), enable the IP_RECVORIGDSTADDR, IP_RECVOPTS
 * and IP_PKTINFO socket options, then loop forever printing every received
 * datagram together with the IP_ORIGDSTADDR / IP_PKTINFO control messages
 * attached to it.
 *
 * The receive loop never terminates; the function only returns when setup
 * fails, and then always with status 1.
 */
int main(int ac, char **av)
{
    int sock;
    struct sockaddr_in sin;

    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    /* inet_aton() reports an invalid address by returning 0, never a negative value. */
    if (inet_aton(ac >= 2 ? av[1] : DEFAULT_ADDR, &sin.sin_addr) == 0) {
        fprintf(stderr, "Invalid address\n");
        goto err;
    }
    sin.sin_port = htons(ac >= 3 ? atoi(av[2]) : DEFAULT_PORT);

    sock = socket(AF_INET, SOCK_DGRAM, 0);
    if (sock < 0) {
        perror("socket");
        goto err;
    }
    if (bind(sock, (struct sockaddr *) &sin, sizeof sin) < 0) {
        perror("bind");
        goto close_err;
    }

/* Set integer socket option l/n to v, read it back, and bail out on any mismatch. */
#define SOCK_OPT(l, n, v) do { \
    int _v = v; \
    socklen_t _s; \
    if (setsockopt(sock, l, n, &_v, sizeof _v) < 0) { \
        perror("setsockopt "# l "/" # n); \
        goto close_err; \
    } \
    \
    _s = sizeof (_v); \
    if (getsockopt(sock, l, n, &_v, &_s) < 0) { \
        perror("getsockopt "# l "/" # n); \
        goto close_err; \
    } \
    \
    if (_v != v) { \
        fprintf(stderr, "Unexpected sockopt (expected %d, found %d)\n", v, _v); \
        goto close_err; \
    } \
    \
    printf(#l "/" #n " is set to %d\n", _v); \
    \
} while (0)
    SOCK_OPT(SOL_IP, IP_RECVORIGDSTADDR, 1);
    SOCK_OPT(SOL_IP, IP_RECVOPTS, 1);
    SOCK_OPT(SOL_IP, IP_PKTINFO, 1);
#undef SOCK_OPT

    printf("Reading on %s:%d\n", inet_ntoa(sin.sin_addr), ntohs(sin.sin_port));

    for (;;) {
        ssize_t n;
        char buf[1024];
        char tmp[80];
        struct iovec iovec[] = {
            {
                .iov_base = buf,
                /* Leave one byte free for the terminating NUL written below. */
                .iov_len = sizeof buf - 1,
            }
        };
        struct msghdr msghdr;
        struct cmsghdr *cmsg_ptr;
        struct sockaddr_storage from = { 0 };
        /* Stays 0 when the sender's address family is neither AF_INET nor AF_INET6. */
        int port = 0;
        /* Payload layouts of the two control messages this program decodes. */
        union cmsg_data {
            struct sockaddr_in sin;
            struct in_pktinfo pktinfo;
        };
        char msg_control[CMSG_SPACE(sizeof(union cmsg_data)) * 10] = { 0 };
        int found;

        memset(&msghdr, 0, sizeof msghdr);
        msghdr.msg_name = &from;
        msghdr.msg_namelen = sizeof from;
        msghdr.msg_iov = iovec;
        msghdr.msg_iovlen = sizeof iovec / sizeof iovec[0];
        msghdr.msg_control = msg_control;
        msghdr.msg_controllen = sizeof msg_control;
        msghdr.msg_flags = MSG_EOR | MSG_TRUNC | MSG_CTRUNC | MSG_OOB | MSG_ERRQUEUE;

        /*
         * NOTE(review): MSG_OOB is an unusual flag for a datagram socket; 0 is
         * the conventional value here - confirm this is intended.
         */
        n = recvmsg(sock, &msghdr, MSG_OOB);
        if (n < 0) {
            perror("recvmsg");
            continue;
        }

        /* Strip one trailing newline; the n > 0 guard keeps an empty datagram from reading buf[-1]. */
        if (n > 0 && buf[n - 1] == '\n')
            n--;
        buf[n] = 0;

        /* Render the sender's address and port for the log line. */
        switch (from.ss_family) {
        default:
            tmp[0] = 0;
            break;
        case AF_INET:
            inet_ntop(AF_INET, &((struct sockaddr_in *) &from)->sin_addr, tmp, sizeof tmp);
            port = ntohs(((struct sockaddr_in *) &from)->sin_port);
            break;
        case AF_INET6:
            inet_ntop(AF_INET6, &((struct sockaddr_in6 *) &from)->sin6_addr, tmp, sizeof tmp);
            port = ntohs(((struct sockaddr_in6 *) &from)->sin6_port);
            break;
        }
        /* %zd matches ssize_t and %zu matches size_t. */
        printf("%s:%d Rx %zdb: %.*s, msg_control = %zub\n", tmp, port, n, (int) n, buf, sizeof msg_control);

        /* Walk every control message that came with the datagram. */
        found = 0;
        for (cmsg_ptr = CMSG_FIRSTHDR(&msghdr); cmsg_ptr != NULL; cmsg_ptr = CMSG_NXTHDR(&msghdr, cmsg_ptr)) {
            union cmsg_data *cmsg_data = (union cmsg_data *) CMSG_DATA(cmsg_ptr);

            switch (cmsg_ptr->cmsg_level) {
            default:
                fprintf(stderr, "Unexecpted level : %d\n", cmsg_ptr->cmsg_level);
                break;
            case SOL_IP:
                switch (cmsg_ptr->cmsg_type) {
                default:
                    fprintf(stderr, "Unexecpted type : %d\n", cmsg_ptr->cmsg_type);
                    break;
                case IP_ORIGDSTADDR:
                    printf("IP_ORIGDSTADDR: sin_addr = %s, sin_port = %d\n", inet_ntoa(cmsg_data->sin.sin_addr), ntohs(cmsg_data->sin.sin_port));
                    found++;
                    break;
                case IP_PKTINFO:
                    /* inet_ntoa() reuses one static buffer, so the first result is copied out before the second call. */
                    snprintf(tmp, sizeof tmp, "%s", inet_ntoa(cmsg_data->pktinfo.ipi_spec_dst));
                    printf("IP_PKTINFO: ifindex = %u, spec_dst = %s, addr = %s\n", cmsg_data->pktinfo.ipi_ifindex, tmp, inet_ntoa(cmsg_data->pktinfo.ipi_addr));
                    break;
                }
            }
        }
        if (found != 1)
            fprintf(stderr, "*** Warning: No SOL_IP / IP_ORIGDSTADDR found\n");
    }

close_err:
    close(sock);
err:
    return 1;
}
When I try this code (e.g. sending packets using netcat), I don't get any IP_ORIGDSTADDR message, only IP_PKTINFO. I need the UDP port, and only IP_ORIGDSTADDR can provide it.
Has anyone else encountered this strange behaviour?

Is there a way to help auto-vectorizing compiler to emit saturation arithmetic intrinsic in LLVM?

I have a few for loops that do saturated arithmetic operations.
For instance,
the implementation of saturated add in my case is as follows:
// Element-wise saturating add: R.data[i] = sat16(A.data[i] + B.data[i]) for
// every index below R.length.
//
// The sum is first saturated to the int32_t range and then, because sat_cond
// is true for a 16-bit SAT_VALUE, clamped to [SAT_VALUE2, SAT_VALUE] through
// the project helpers std_max / std_min.
// NOTE(review): std_max(r, r, x) / std_min(r, r, x) are not defined in this
// snippet; presumably they store max(r, x) / min(r, x) into r - confirm.
static void addsat(Vector &R, Vector &A, Vector &B)
{
    const int32_t INT32_HI = 0x7fffffff;
    const int32_t INT32_LO = -0x7fffffff - 1;
    const int32_t SAT_VALUE = (1 << (16 - 1)) - 1;
    const int32_t SAT_VALUE2 = (-SAT_VALUE - 1);
    const int32_t sat_cond = (SAT_VALUE <= 0x7fffffff);

    for (int i = 0; i < R.length; i++)
    {
        const int32_t a = static_cast<uint32_t>(A.data[i]);
        const int32_t b = static_cast<uint32_t>(B.data[i]);
        int32_t r;

        // Decide on saturation before adding, so a + b is only evaluated when
        // it cannot overflow. The two limits are handled in one chain so the
        // negative check can no longer overwrite a positive saturation.
        if (a > 0 && b > INT32_HI - a)
            r = INT32_HI;
        else if (a < 0 && b < INT32_LO - a)
            r = INT32_LO;
        else
            r = a + b;

        if (sat_cond == 1)
        {
            std_max(r, r, SAT_VALUE2);
            std_min(r, r, SAT_VALUE);
        }
        else
        {
            r = static_cast<uint16_t>(static_cast<int32_t>(r));
        }
        R.data[i] = static_cast<uint16_t>(r);
    }
}
I see that x86 has a paddsat intrinsic that would be the perfect match for this loop. The code does get auto-vectorized, but as a combination of multiple operations that mirror my source. I would like to know the best way to write this loop so that the auto-vectorizer recognizes it as a saturating add.
Vector structure is:
// Fixed-size vector of 16-bit unsigned lanes.
struct V {
    // Number of lanes held in data.
    static constexpr int length = 32;
    // Lane storage; sized from length so the two cannot drift apart.
    unsigned short data[length];
};
Compiler used is clang 3.8 and code was compiled for AVX2 Haswell x86-64 architecture.

printf for p89v664 prints junk characters from actual micro controller

I am trying to print message on serial terminal from p89v664 using following code,
#include<P89V66x.H>
#include<stdio.h>
/*
 * Send one character over the UART, busy-waiting on the TI flag, and return
 * it. A '\n' is sent as CR (0x0d) followed by LF.
 *
 * TI is expected to be set on entry (main pre-sets it, and this function
 * leaves it set on exit).
 */
char putchar(char c) {
    if (c == '\n') {
        while (!TI)
            ;
        TI = 0;
        S0BUF = 0x0d;
        /* Let the CR finish transmitting before S0BUF is loaded again below. */
        while (!TI)
            ;
    }
    TI = 0;
    S0BUF = c;
    while (!TI)
        ;
    return c;
}
/*
 * Minimal printf replacement: writes the NUL-terminated string str verbatim
 * through putchar (no format processing).
 *
 * Returns the number of characters passed to putchar.
 */
int printf(char*str) {
    unsigned int cnt = 0;

    while (*str != '\0') {
        putchar(*str);
        cnt++;
        str++;
    }
    /* The function is declared int, so the count has to be returned. */
    return (int)cnt;
}
/*
 * Crude busy-wait: runs an empty 100-step countdown i times.
 * The resulting wall-clock delay depends on the compiler and the CPU clock.
 */
void delay(unsigned int i) {
    int d;

    while (i != 0) {
        for (d = 100; d != 0; d--)
            ;
        i--;
    }
}
/*
 * Configure the serial port and timer 1, then print two messages forever,
 * pausing with delay() after each one.
 */
int main(void) {
/**Serial init*/
S0CON = 0x50; /* SCON: mode 1, 8-bit UART, enable rcvr */
TMOD |= 0x20; /* TMOD: timer 1, mode 2, 8-bit reload */
TH1 = 0xF6; /* TH1: reload value for 9600 baud */
TR1 = 1; /* TR1: timer 1 run */
/* Start with TI set: putchar's newline path spins on TI before it sends anything. */
TI = 1;
/* Never returns. */
while(1) {
printf("Hello\n");
delay(300);
printf("Hello World\n");
delay(10000);
}
}
The above program works fine as long as the printf function defined in this program is not commented out.
If that printf function is commented out so that the printf from the standard library is used instead, junk characters are printed on the serial console (I used PuTTY).
I used the Keil uVision V4.14.4.0 compiler.
Is there anything missing?
I don't understand what is wrong with this program.
After some experiments I found that the problem was with the Keil uVision4 evaluation version.
I compiled this code using sdcc, ran it, and it worked. Maybe a limitation of the Keil evaluation version was causing the problem. Thank you very much, Mellowcandle, for all the replies.
Edit:
#include <P89V66x.H>
#include<stdio.h>
/*
 * Send one character over the UART and busy-wait until it has gone out.
 * A '\n' is followed by a CR (0x0d), which is also waited for.
 */
void putchar(char c) {
    TI = 0;
    S0BUF = c;
    while (!TI)
        ;
    if (c != '\n')
        return;
    /* Newline: append a carriage return and wait for it as well. */
    TI = 0;
    S0BUF = 0x0d;
    while (!TI)
        ;
}
/*
 * Configure the serial port and timer 1, then print two formatted messages
 * forever through the library printf, pausing with delay() after each one.
 * NOTE(review): delay() is not defined in this listing; presumably it is the
 * delay() from the earlier listing - confirm.
 */
int main(void) {
/**Serial init*/
unsigned short int c = 65334; /* sample value printed with %u below */
S0CON = 0x50; /* SCON: mode 1, 8-bit UART, enable rcvr */
TMOD |= 0x20; /* TMOD: timer 1, mode 2, 8-bit reload */
/* For an 11.0592 MHz crystal the reload value
should be -3, i.e.
TH1 = 0xFD. */
TH1 = 0xF6; /* TH1: reload value for 9600 baud for
an 18 MHz crystal */
TR1 = 1; /* TR1: timer 1 run */
/* Never returns. */
while(1) {
printf("Hello %u\n", c);
delay(300);
printf("Hello World %u\n" ,c);
delay(10000);
}
}
command used to compile this code is,
sdcc {filename}

Resources