How to work with NETLINK_KOBJECT_UEVENT protocol in user space? - linux-kernel

Let's consider this example code:
#include <linux/netlink.h>
#include <sys/socket.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#define BUF_SIZE 4096
int main() {
int fd, res;
unsigned int i, len;
char buf[BUF_SIZE];
struct sockaddr_nl nls;
fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT);
if (fd == -1) {
return 1;
}
memset(&nls, 0, sizeof(nls));
nls.nl_family = AF_NETLINK;
nls.nl_pid = getpid();
nls.nl_groups = 1;
res = bind(fd, (struct sockaddr *)&nls, sizeof(nls));
if (res == -1) {
return 2;
}
while (1) {
len = recv(fd, buf, sizeof(buf), 0);
printf("============== Received %d bytes\n", len);
for (i = 0; i < len; ++i) {
if (buf[i] == 0) {
printf("[0x00]\n");
} else if (buf[i] < 33 || buf[i] > 126) {
printf("[0x%02hhx]", buf[i]);
} else {
printf("%c", buf[i]);
}
}
printf("<END>\n");
}
close(fd);
return 0;
}
It listens on netlink socket for events related to hotplug. Basically, it works. However, some parts are unclear for me even after spending whole evening on googling, reading different pieces of documentation and manuals and working through examples.
Basically, I have two questions.
What different values for sockaddr_nl.nl_groups means? At least for NETLINK_KOBJECT_UEVENT protocol.
If buffer allocated for the message is too small, the message will be simply truncated (you can play with the BUF_SIZE size to see that). What this buffer size should be to not lose any data? Is it possible to know in user space length of the incoming message to allocate enough space?
I would appreciate either direct answers as references to kernel code.

The values represent different multicast groups. A netlink socket can have 31 different multicast groups (0 means unicast) that multicast messages can be sent to. For NETLINK_KOBJECT_UEVENT it looks like it's fixed to 1 see f.ex. here.
You should be able to use getsockopt with level set to SOL_SOCKET and optname set to SO_RCVBUF.

Related

How to check if stdout was closed - without writing data to it?

I wrote a program that reads data, filters and processes it and writes it to stdout. If stdout is piped to another process, and the piped process terminates, I get SIGPIPEd, which is great, because the program terminates, and the pipeline comes to a timely end.
Depending on the filter parameters however, there may not be a single write for tens of seconds, and during that time there won't be a SIGPIPE, although the downstream process has long finished. How can I detect this, without actually writing something to stdout? Currently, the pipeline is just hanging, until my program terminates of natural causes.
I tried writing a zero-length slice
if _, err := os.Stdout.Write([]byte{}); err != nil
but unfortunately that does not result in an error.
N.B. Ideally, this should work regardless of the platform, but if it works on Linux only, that's already an improvement.
This doesn't answer it in Go, but you can likely find a way to use this.
If you can apply Poll(2) to the write end of your pipe, you will get an notification when it becomes un-writable. How to integrate that into your Go code depends upon your program; hopefully it could be useful:
#include <errno.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
void sp(int sno) {
write(2, "sigpipe!\n", 9);
_exit(1);
}
int waitfd(int fd) {
int n;
struct pollfd p;
p.fd = fd;
p.events = POLLOUT | POLLRDBAND;
/* RDBAND is for what looks like a bug in illumos fifovnops.c */
p.revents = 0;
if ((n=poll(&p, 1, -1)) == 1) {
if (p.revents & POLLOUT) {
return fd;
}
if (p.revents & (POLLERR|POLLHUP)) {
return -1;
}
}
fprintf(stderr, "poll=%d (%d:%s), r=%#x\n",
n, errno, strerror(errno), p.revents);
return -1;
}
int main() {
int count = 0;
char c;
signal(SIGPIPE, sp);
while (read(0, &c, 1) > 0) {
int w;
while ((w=waitfd(1)) != -1 &&
write(1, &c, 1) != 1) {
}
if (w == -1) {
break;
}
count++;
}
fprintf(stderr, "wrote %d\n", count);
return 0;
}
In linux, you can run this program as: ./a.out < /dev/zero | sleep 1 and it will print something like: wrote 61441. You can change it to sleep for 3s, and it will print the same thing. That is pretty good evidence that it is has filled the pipe, and is waiting for space.
Sleep will never read from the pipe, so when its time is up, it closes the read side, which wakes up poll(2) with a POLLERR event.
If you change the poll event to not include POLLOUT, you get the simpler program:
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
int waitfd(int fd) {
int n;
struct pollfd p;
p.fd = fd;
p.events = POLLRDBAND;
p.revents = 0;
if ((n=poll(&p, 1, -1)) == 1) {
if (p.revents & (POLLERR|POLLHUP)) {
return -1;
}
}
fprintf(stderr, "poll=%d (%d:%s), r=%#x\n",
n, errno, strerror(errno), p.revents);
return -1;
}
int main() {
if (waitfd(1) == -1) {
fprintf(stderr, "Got an error!\n");
}
return 0;
}
where "Got an error!" indicates the pipe was closed. I don't know how portable this is, as poll(2) documentation is kinda sketchy.
Without the POLLRDBAND (so events is 0), this works on Linux, but wouldn't on UNIX (at least Solaris and macos). Again, docs were useless, but having the kernel source answers many questions :)
This example, using threads, can be directly mapped to go:
#include <pthread.h>
#include <errno.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
int Events = POLLRDBAND;
void sp(int sno) {
char buf[64];
write(2, buf, snprintf(buf, sizeof buf, "%d: sig%s(%d)\n", getpid(), sys_siglist[sno], sno));
_exit(1);
}
int waitfd(int fd) {
int n;
struct pollfd p;
p.fd = fd;
p.events = Events;
/* RDBAND is for what looks like a bug in illumos fifovnops.c */
p.revents = 0;
if ((n=poll(&p, 1, -1)) == 1) {
if (p.revents & (POLLERR|POLLHUP)) {
return -1;
}
return fd;
}
return -1;
}
void *waitpipe(void *t) {
int x = (int)(intptr_t)t; /*gcc braindead*/
waitfd(x);
kill(getpid(), SIGUSR1);
return NULL;
}
int main(int ac) {
pthread_t killer;
int count = 0;
char c;
Events |= (ac > 1) ? POLLOUT : 0;
signal(SIGPIPE, sp);
signal(SIGUSR1, sp);
pthread_create(&killer, 0, waitpipe, (int *)1);
while (read(0, &c, 1) > 0) {
write(1, &c, 1);
count++;
}
fprintf(stderr, "wrote %d\n", count);
return 0;
}
Note that it parks a thread on poll, and it generates a SIGUSR1. Here is running it:
mcloud:pipe $ ./spthr < /dev/zero | hexdump -n80
0000000 0000 0000 0000 0000 0000 0000 0000 0000
*
0000050
185965: sigUser defined signal 1(10)
mcloud:pipe $ ./spthr < /dev/zero | sleep 1
185969: sigUser defined signal 1(10)
mcloud:pipe $ ./spthr | sleep 1
185972: sigUser defined signal 1(10)
mcloud:pipe $ ./spthr < /dev/zero | hexdump -n800000
0000000 0000 0000 0000 0000 0000 0000 0000 0000
*
00c3500
185976: sigBroken pipe(13)
In the first command, hexdump quits after 80 bytes, the poll is fundamentally racing with the read+write loop, so it could have generated either a sigpipe or sigusr1.
The second two demonstrate that sleep will cause a sigusr1 (poll returned an exception event) whether or not the write side of the pipe is full when the pipe reader exits.
The fourth, uses hexdump to read a lot of data, way more than pipe capacity, which more deterministically causes a sigpipe.
You can generate test programs which model it more exactly, but the point is that the program gets notification as soon as the pipe is closed; not having to wait until its next write.
Not a real solution to the problem - namely, detecting if the process down the pipe has terminated without writing to it - but here is a workaround, suggested in a comment by Daniel Farrell: (Define and) use a heartbeat signal that will get ignored downstream.
As this workaround is not transparent, it may not be possible if you don't control all processes involved.
Here's an example that uses the NUL byte as heartbeat signal for text based data:
my-cmd | head -1 | tr -d '\000' > file
my-cmd would send NUL bytes in times of inactivity to get a timely EPIPE / SIGPIPE.
Note the use of tr to strip off the heartbeats again once it has served its purpose - otherwise they would end up in file.

Inverting an image using MPI

I am trying to invert a PGM image using MPI. The grayscale (PGM) image should be loaded on the root processor and then be sent to each of the s^2 processors. Each processor will invert a block of the given image, and the inverted blocks will be gathered back on the root processor, which will assemble the blocks into the final image and write it to a PGM image. I ran the following code, but did not get any output. The image was read after running the code, but there was no indication of writing the resultant image. Could you please let me know what could be wrong with it?
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#include <string.h>
#include <math.h>
#include <memory.h>
#define max(x, y) ((x>y) ? (x):(y))
#define min(x, y) ((x<y) ? (x):(y))
int xdim;
int ydim;
int maxraw;
unsigned char *image;
void ReadPGM(FILE*);
void WritePGM(FILE*);
#define s 2
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
int p, rank;
MPI_Comm_size(MPI_COMM_WORLD, &p);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
const int NPROWS=s; /* number of rows in _decomposition_ */
const int NPCOLS=s; /* number of cols in _decomposition_ */
const int BLOCKROWS = xdim/NPROWS; /* number of rows in _block_ */
const int BLOCKCOLS = ydim/NPCOLS; /* number of cols in _block_ */
int i, j;
FILE *fp;
float BLimage[BLOCKROWS*BLOCKCOLS];
for (int ii=0; ii<BLOCKROWS*BLOCKCOLS; ii++)
BLimage[ii] = 0;
float BLfilteredMat[BLOCKROWS*BLOCKCOLS];
for (int ii=0; ii<BLOCKROWS*BLOCKCOLS; ii++)
BLfilteredMat[ii] = 0;
if (rank == 0) {
/* begin reading PGM.... */
ReadPGM(fp);
}
MPI_Datatype blocktype;
MPI_Datatype blocktype2;
MPI_Type_vector(BLOCKROWS, BLOCKCOLS, ydim, MPI_FLOAT, &blocktype2);
MPI_Type_create_resized( blocktype2, 0, sizeof(float), &blocktype);
MPI_Type_commit(&blocktype);
int disps[NPROWS*NPCOLS];
int counts[NPROWS*NPCOLS];
for (int ii=0; ii<NPROWS; ii++) {
for (int jj=0; jj<NPCOLS; jj++) {
disps[ii*NPCOLS+jj] = ii*ydim*BLOCKROWS+jj*BLOCKCOLS;
counts [ii*NPCOLS+jj] = 1;
}
}
MPI_Scatterv(image, counts, disps, blocktype, BLimage, BLOCKROWS*BLOCKCOLS, MPI_FLOAT, 0, MPI_COMM_WORLD);
//************** Invert the block **************//
for (int proc=0; proc<p; proc++) {
if (proc == rank) {
for (int j = 0; j < BLOCKCOLS; j++) {
for (int i = 0; i < BLOCKROWS; i++) {
BLfilteredMat[j*BLOCKROWS+i] = 255 - image[j*BLOCKROWS+i];
}
}
} // close if (proc == rank) {
MPI_Barrier(MPI_COMM_WORLD);
} // close for (int proc=0; proc<p; proc++) {
MPI_Gatherv(BLfilteredMat, BLOCKROWS*BLOCKCOLS,MPI_FLOAT, image, counts, disps,blocktype, 0, MPI_COMM_WORLD);
if (rank == 0) {
/* Begin writing PGM.... */
WritePGM(fp);
free(image);
}
MPI_Finalize();
return (1);
}
It is very likely MPI is not the right tool for the job. The reason for this is that your job is inherently bandwidth limited.
Think of it this way: You have a coloring book with images which you all want to color in.
Method 1: you take your time and color them in one by one.
Method 2: you copy each page to a new sheet of paper and mail it to a friend who then colors it in for you. He mails it back to you and in the end you glue all the pages you received from all of your friends together to make one colored-in book.
Note that method two involves copying the whole book, which is arguably the same amount of work needed to color in the whole book. So method two is less time-efficient without even considering the overhead of shoving the pages into an envelope, licking the stamp, going to the post office and waiting for the letter to be delivered.
If you look at your code, every transmitted byte is only touched once throughout the whole program in this line:
BLfilteredMat[j*BLOCKROWS+i] = 255 - image[j*BLOCKROWS+i];
The single processor is much faster at subtracting two integers than it is at sending an integer of the wire, therefore one must advise against using MPI for your particular problem.
My suggestion to solve your problem: Try to avoid unneccessary communication whenever possible. Do all processes have access to the file system on which the files are located? You could try reading them directly from the filesystem.

how to fix this MPI code program

This program demonstrates an unsafe program, because sometimes it will execute fine, and other times it will fail. The reason why the program fails or hangs is due to buffer exhaustion on the receiving task side, as a consequence of the way an MPI library has implemented an eager protocol for messages of a certain size. One possible solution is to include an MPI_Barrier call in the both the send and receive loops.
how its program code is correct???
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#define MSGSIZE 2000
int main (int argc, char *argv[])
{
int numtasks, rank, i, tag=111, dest=1, source=0, count=0;
char data[MSGSIZE];
double start, end, result;
MPI_Status status;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == 0) {
printf ("mpi_bug5 has started...\n");
if (numtasks > 2)
printf("INFO: Number of tasks= %d. Only using 2 tasks.\n", numtasks);
}
/******************************* Send task **********************************/
if (rank == 0) {
/* Initialize send data */
for(i=0; i<MSGSIZE; i++)
data[i] = 'x';
start = MPI_Wtime();
while (1) {
MPI_Send(data, MSGSIZE, MPI_BYTE, dest, tag, MPI_COMM_WORLD);
count++;
if (count % 10 == 0) {
end = MPI_Wtime();
printf("Count= %d Time= %f sec.\n", count, end-start);
start = MPI_Wtime();
}
}
}
/****************************** Receive task ********************************/
if (rank == 1) {
while (1) {
MPI_Recv(data, MSGSIZE, MPI_BYTE, source, tag, MPI_COMM_WORLD, &status);
/* Do some work - at least more than the send task */
result = 0.0;
for (i=0; i < 1000000; i++)
result = result + (double)random();
}
}
MPI_Finalize();
}
Ways to improve this code so that the receiver doesn't end up with an unlimited number of unexpected messages include:
Synchronization - you mentioned MPI_Barrier, but even using MPI_Ssend instead of MPI_Send would work.
Explicit buffering - the use of MPI_Bsend or Brecv to ensure adequate buffering exists.
Posted receives - the receiving process posts IRecvs before starting work to ensure that the messages are received into the buffers meant to hold the data, rather than system buffers.
In this pedagogical case, since the number of messages is unlimited, only the first (synchronization) would reliably work.

Page faults on OS X when reading with MMAP

I am trying to benchmark file system I/O on Mac OS X using mmap.
#include <unistd.h>
#include <fcntl.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>
#include <math.h>
char c;
int main(int argc, char ** argv)
{
if (argc != 2)
{
printf("no files\n");
exit(1);
}
int fd = open(argv[1], O_RDONLY);
fcntl(fd, F_NOCACHE, 1);
int offset=0;
int size=0x100000;
int pagesize = getpagesize();
struct stat stats;
fstat(fd, &stats);
int filesize = stats.st_size;
printf("%d byte pages\n", pagesize);
printf("file %s # %d bytes\n", argv[1], filesize);
while(offset < filesize)
{
if(offset + size > filesize)
{
int pages = ceil((filesize-offset)/(double)pagesize);
size = pages*pagesize;
}
printf("mapping offset %x with size %x\n", offset, size);
void * mem = mmap(0, size, PROT_READ, 0, fd, offset);
if(mem == -1)
return 0;
offset+=size;
int i=0;
for(; i<size; i+=pagesize)
{
c = *((char *)mem+i);
}
munmap(mem, size);
}
return 0;
}
The idea is that I'll map a file or portion of it and then cause a page fault by dereferencing it. I am slowly losing my sanity since this doesn't at all work and I've done similar things on Linux before.
Change this line
void * mem = mmap(0, size, PROT_READ, 0, fd, offset);
to
void * mem = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, offset);
And, don't compare mem with -1. Use this instead:
if(mem == MAP_FAILED) { ... }
It's both more readable and more portable.
General advice: if you're on a different UNIX platform from what you're used to, it's a good idea to open the man page. For mmap on OS X, it can be found here. It says
Conforming applications must specify either MAP_PRIVATE or MAP_SHARED.
So, specifying 0 on the fourth
argument is not OK in OS X. I believe
this is true for BSD in general.

socket "read" hanging if the MacBook sleeps more than 10 minutes

I am writing an app, where a socket is connecting to a host and downloading a file.
The application runs in Mac.
Now, while the app is downloading, if I put the MacBook in sleep mode for more than 10 minutes, 60% of the time the app hangs when the computer wakes up.
The stack trace shows that, it has hanged in the "read" call. I am able to reproduce this with a sample program also. Below, I have pasted the code of the sample program and the stack where it is hanging. How to solve this hanging?
Also, this is not just TCP/IP waiting that will come out in few minutes. I have waited for more than 12 hours, it did not come out.
The stack trace: -
Call graph:
2466 Thread_2507
2466 start
2466 read$UNIX2003
2466 read$UNIX2003
The program :-
#include <stdio.h>
#include <string.h>
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#define buflen 131072
unsigned int portno = 80;
char hostname[] = "192.168.1.9";
int main()
{
int sd = socket(AF_INET, SOCK_STREAM, 0); /* init socket descriptor */
struct sockaddr_in sin;
struct hostent *host = gethostbyname(hostname);
char buf[buflen];
int len;
int ret;
FILE *fp;
int i;
if(sd == -1){
printf("Could not create client socket\n");
return 1;
}
/*set keep alive*/
int optval = 1;
int optlen = sizeof(optval);
ret = setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &optval, optlen);
if(ret != 0){
printf("could not set socket option.\n");
return 1;
}
/*** PLACE DATA IN sockaddr_in struct ***/
memcpy(&sin.sin_addr.s_addr, host->h_addr, host->h_length);
sin.sin_family = AF_INET;
sin.sin_port = htons(portno);
/*** CONNECT SOCKET TO THE SERVICE DESCRIBED BY sockaddr_in struct ***/
if (connect(sd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
perror("connecting");
return 1;
}
char *str = "GET /general-log.exe / HTTP/1.0\n\n";
ret = write(sd, str, strlen(str));
if(ret < 0){
printf("error while writing\n");
return 1;
}
fp = fopen("downloaded.file", "wb+");
if(fp == NULL){
printf("not able to open the file.\n");
return 1;
}
i = 0;
while ((len = read(sd, buf, buflen)) > 0) {
printf("%d\t%d\n", i++, len);
fwrite(buf, len, 1, fp); //we should check for return
}
if(len < 0){
printf("Error while reading\n");
}
fclose(fp);
close(sd);
return 0;
}
Update apparently the SO_RCVTIMEOUT is solving the problem.
struct timeval tv;
tv.tv_sec=10;
tv.tv_usec=0;
setsockopt ( m_sock, SOL_SOCKET, SO_RCVTIMEO, (char *) &tv, sizeof ( tv ) );
Is it okay to use SO_RCVTIMEO?
TCP/IP connections don't survive sleep mode. SO_KEEPALIVE doesn't help in this case since it has no effect on the server side. Just wait two minutes and the read will time out. After the timeout, you can connect again.
And that sleep(1) is unnecessary. The server will respond as soon as the data is available. If you don't fetch is right away, you'll allocate a connection on the server for longer than you need.
I couldn't solve it using blocking sockets. I had to change the IMAP library to non-blocking sockets.

Resources