Bash Script - Read Binary File - bash

I'm new to scripting, but I have a lot of experience programming in languages such as C# and Java.
I have a file that contains binary data. I want to write a Bash script that reads the year, month, and day contained in that file so I can sort the associated MOD files into folders according to the date they were recorded. I'm having trouble finding a way to read binary data and parsing it in a bash script. Is there any way to do this?

You can use od (plus head and awk for a little post-processing) for this. To get the year:
year=$(od -t x2 --skip-bytes=6 --read-bytes=2 file.moi | head -1 | awk '{print $2}')
For the month:
month=$(od -t x1 --skip-bytes=8 --read-bytes=1 file.moi | head -1 | awk '{print $2}')
And the day:
day=$(od -t x1 --skip-bytes=9 --read-bytes=1 file.moi | head -1 | awk '{print $2}')

I would recommend using python for this.
However, if you insist on bash, i would try using either sed in binary mode (never tried it) or using dd for extracting specific bytes and then convert them.

If this is not too hardcore for you I suggest compiling the following C-language program:
#include <stdio.h>
#include <inttypes.h>
typedef union {
char array[sizeof(int32_t)];
int32_t val;
} int32_u;
typedef union {
char array[sizeof(uint32_t)];
uint32_t val;
} uint32_u;
typedef union {
char array[sizeof(uint64_t)];
uint64_t val;
} uint64_u;
typedef union {
char array[sizeof(int64_t)];
int64_t val;
} int64_u;
int swap(char* mem, int size) {
if (size & 1 != 0)
return -1;
int i;
for (i = 0; i < size / 2; i++) {
char tmp = mem[i];
mem[i] = mem[size - i - 1];
mem[size - i - 1] = tmp;
}
return 0;
}
int sys_big_endian() {
int x = 1;
return !(*(char*)&x);
}
int main(int argc, char** argv) {
char* file_name = NULL;
int offset = 0;
char* type = "int32";
int big_endian = 0;
int i;
for(i = 1; i < argc; i++) {
if(!strncmp("-o", argv[i], 2)) {
++i;
sscanf(argv[i], "%d", &offset);
} else if(!strncmp("-t", argv[i], 2)) {
++i;
type = argv[i];
} else if(!strncmp("-e", argv[i], 2)) {
++i;
big_endian = !strncmp("big", argv[i], 3);
} else {
file_name = argv[i];
break;
}
}
if (i < argc - 1) {
fprintf(stderr, "Ignoring extra arguments: ");
++i;
for (; i < argc; i++) {
fprintf(stderr, "%s ", argv[i]);
}
fprintf(stderr, "\n");
}
if (file_name == NULL) {
fprintf(stderr, "Syntax: readint [-o offset] [-t type] [-e endian] <filename>\n"
"Where:\n"
" type 'uint32', 'uint64', 'int32' (default), 'int64'.\n"
" endian 'big' or 'little' (default).\n"
" offset offset in a file from where the read will happen, default is 0.\n"
);
return -1;
}
FILE* fp = fopen(file_name, "rb");
if (fp == NULL) {
fprintf(stderr, "Could not open the file: %s\n", file_name);
return -1;
}
fseek(fp, offset, SEEK_SET);
if (!strncmp("uint32", type, 6)) {
uint32_u u;
fread(u.array, sizeof(u.array), 1, fp);
if (big_endian ^ sys_big_endian())
swap(u.array, sizeof(u.array));
printf("%u\n", u.val);
} else if (!strncmp("int32", type, 5)) {
int32_u u;
fread(u.array, sizeof(u.array), 1, fp);
if (big_endian ^ sys_big_endian())
swap(u.array, sizeof(u.array));
printf("%d\n", u.val);
} else if (!strncmp("uint64", type, 6)) {
uint64_u u;
fread(u.array, sizeof(u.array), 1, fp);
if (big_endian ^ sys_big_endian())
swap(u.array, sizeof(u.array));
printf("%"PRIu64"\n", u.val);
} else if (!strncmp("int64", type, 5)) {
int64_u u;
fread(u.array, sizeof(u.array), 1, fp);
if (big_endian ^ sys_big_endian())
swap(u.array, sizeof(u.array));
printf("%"PRId64"\n", u.val);
} else {
printf("Unknown type: %s\n", type);
}
fclose(fp);
return 0;
}
Then do this:
gcc -o readint readint.c
sudo mv readint /usr/local/bin
Now you have a handy tool called 'readint' with the following syntax:
readint [-o offset] [-t int32|uint32|int64|uint64 ] [-e little|big ] <filename>

you can search the net for modules to interpret MOI files (either Perl or Python). Otherwise, i don't really think you can get the date just like that from the binary file because if you look inside, its really "garbage" since its binary. Although you may also give the strings command a try to see if there are legible strings that match the date

Related

Cannot access memory at address using gdb in CLION IDE. How do I setup gdb?

I get the error "Cannot access memory at address 0x100403055" when I try and set a memory value to 0x00 when stopped in the debugger.
Is there a special switch I need to set to enable the set operation?
Here is my complete C code file "main.c"
#include <stdio.h>
#include <string.h>
/*
separator - consume all non-token characters until next token. This includes:
comments: '#'
nesting: '{'
unnesting: '}'
whitespace: ' ','\t','\n'
*nest is changed according to nesting/unnesting processed
*/
static void separator(int *nest, char **tokens) {
char c, *s;
s = *tokens;
while ((c = *s)) {
/* #->eol = comment */
if (c == '#') {
s++;
while ((c = *s)) {
s++;
if (c == '\n')
break;
}
continue;
}
if (c == '{') {
(*nest)++;
s++;
continue;
}
if (c == '}') {
(*nest)--;
s++;
continue;
}
if (c == ' ' || c == '\n' || c == '\t') {
s++;
continue;
}
break;
}
*tokens = s;
}
/*
token - capture all characters until next separator, then consume separator,
return captured token, leave **tokens pointing to next token.
*/
static char *token(int *nest, char **tokens) {
char c, *s, *t;
char terminator = '\0';
s = t = *tokens;
while ((c = *s)) {
if (c == '#'
|| c == ' ' || c == '\t' || c == '\n' || c == '{' || c == '}')
break;
s++;
}
*tokens = s;
separator(nest, tokens);
/* Breakpoint here to examine and manipulate memory */
*s = '\0';
return t;
}
struct test_case {
char *input;
int nest;
char *expected_output;
};
int main() {
int nest = 0;
int TESTSEP = 0;
if (TESTSEP>0) {
char *tokens = "# this is a comment\n{nesting {example} unnesting}\n \t end";
separator(&nest, &tokens);
printf("nest: %d\n", nest);
printf("tokens: %s\n", tokens);
return 0;
} else {
struct test_case test_cases[] = {
{"hello world", 0, "hello"},
{"hello#world", 0, "hello"},
{"hello{world}", 0, "hello"},
{"hello world", 0, "hello"},
{"hello\tworld", 0, "hello"},
{"hello\nworld", 0, "hello"},
};
for (int i = 0; i < sizeof(test_cases) / sizeof(test_cases[0]); i++) {
struct test_case test_case = test_cases[i];
char *tokens = test_case.input;
char *output = token(&test_case.nest, &tokens);
if (strcmp(output, test_case.expected_output) != 0) {
printf("Test case %d failed: expected %s, got %s\n", i, test_case.expected_output, output);
}
}
return 0;
}
}
In the token function there is a comment line where I place a breakpoint and drop into the gdb debugger. The code is supposed to write a '\0' at the address of the pointer *s to truncate the string.
When I'm in the debugger and I examine the 's' variable I get the following:
(gdb) x s
0x100403055: 0x726f7720
When I try and set the variable I get:
(gdb) [![set *0x0000000100403055 = 0x726f7700][1]][1]
Cannot access memory at address 0x100403055
I'm using the CLION IDE and am a novice. I'm not sure if its an IDE problem, a user problem or some external memory protection mechanism that is preventing this.
Does anyone know how to make this work?
Here is a screenshot of the IDE:
When I run the code (without the debugger) I get this output:
./explore.exe
Test case 0 failed: expected hello, got hello world
Test case 1 failed: expected hello, got hello#world
Test case 2 failed: expected hello, got hello{world}
Test case 3 failed: expected hello, got hello world
Test case 4 failed: expected hello, got hello world
Test case 5 failed: expected hello, got hello world
Process finished with exit code 0
I this case I believe I was passing in a pointer to memory in the read only space. The struct test_case is built into the code and is read only. So that when I pass that into the token function it was trying to write to read only.
Here is the code that seems to work.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
separator - consume all non-token characters until next token.
This includes:
comments: '#' ... '\n'
nesting: '{'
unnesting: '}'
whitespace: ' ','\t','\n'
*nest is changed according to nesting/unnesting processed
*/
static void separator(int *nest, char **tokens) {
char c, *s;
s = *tokens;
while ((c = *s)) {
/* #->eol = comment */
if (c == '#') {
s++;
while ((c = *s)) {
s++;
if (c == '\n')
break;
}
continue;
}
if (c == '{') {
(*nest)++;
s++;
continue;
}
if (c == '}') {
(*nest)--;
s++;
continue;
}
if (c == ' ' || c == '\n' || c == '\t') {
s++;
continue;
}
break;
}
*tokens = s;
}
/*
token - capture all characters until next separator, then consume
separator,
return captured token, leave **tokens pointing to next token.
*/
static char *token(int *nest, char **tokens) {
char c, *s, *t;
char terminator = '\0';
s = t = *tokens;
while ((c = *s)) {
if (c == '#'
|| c == ' ' || c == '\t' || c == '\n' || c == '{' || c == '}')
break;
s++;
}
*tokens = s;
separator(nest, tokens);
*s = '\0';
return t;
}
struct test_case {
char *input;
int nest;
char *expected_output;
};
int main() {
int nest = 0;
int TESTSEP = 0;
char *temp_malloc_string;
if (TESTSEP>0) {
char *tokens = "# this is a comment\n{nesting {example}
unnesting}\n \t end";
temp_malloc_string = malloc(strlen(tokens)*sizeof(char));
strcpy(temp_malloc_string, tokens);
char * t = token(&nest, &temp_malloc_string);
printf("nest: %d\n", nest);
printf("tokens: %s\n", t);
separator(&nest, &temp_malloc_string);
printf("nest: %d\n", nest);
printf("tokens: %s\n", temp_malloc_string);
return 0;
} else {
struct test_case test_cases[] = {
{"hello world", 0, "hello"},
{"hello#world", 0, "hello"},
{"hello{world}", 0, "hello"},
{"hello world", 0, "hello"},
{"hello\tworld", 0, "hello"},
{"hello\nworld", 0, "hello"},
};
for (int i = 0; i < sizeof(test_cases) / sizeof(test_cases[0]); i++) {
struct test_case test_case = test_cases[i];
char *tokens = test_case.input;
printf("len of string is %d\n", strlen(tokens));
temp_malloc_string = malloc((strlen(tokens)+1)*sizeof(char));
char * tt = temp_malloc_string;
if ( temp_malloc_string==NULL ) {
printf("error!\n");
}
strcpy(temp_malloc_string, tokens);
printf("tm going in: %s\n", temp_malloc_string);
char *output = token(&test_case.nest, &temp_malloc_string);
printf("Test case %d: expected %s, got %s\n\t\ttm is now: %s\n",
i, test_case.expected_output, output, temp_malloc_string);
if (strcmp(output, test_case.expected_output) != 0) {
printf("Test case %d failed: expected %s, got %s\n",
i, test_case.expected_output, output);
}
free(tt);
temp_malloc_string = NULL;
}
return 0;
}
}
Now when I run the code I get:
./explore.exe
len of string is 11
tm going in: hello world
Test case 0: expected hello, got hello
tm is now: world
len of string is 11
tm going in: hello#world
Test case 1: expected hello, got hello
tm is now:
len of string is 12
tm going in: hello{world}
Test case 2: expected hello, got hello
tm is now: world}
len of string is 12
tm going in: hello world
Test case 3: expected hello, got hello
tm is now: world
len of string is 11
tm going in: hello world
Test case 4: expected hello, got hello
tm is now: world
len of string is 11
tm going in: hello
world
Test case 5: expected hello, got hello
tm is now: world
Process finished with exit code 0
And when I stop at the breakpoint I can write to memory.
In this modified code I malloc a char* object and copy the string from the struct into that then pass that into the token function.
I'm guess that gdb is protecting me from writing to the .text block in code.
Like I said: I'm a newbie :(

Any faster way to replace substring in AWK

I have a long string of about 50,000,000 long... , and I am substituting it part by part
cat FILE | tail -n+2 | awk -v k=100 '{
i = 1
while (i<length($0)-k+1) {
x = substr($0, i, k)
if (CONDITION) {
x changed sth
$0 = substr($0,1,i-1) x substr($0,i+k)
}
i += 1
}
gsub(sth,sth,$0)
printf("%s",$0) >> FILE
}'
Are there any ways to replace $0 at position i with x of length k without using this method?
The string is too long and the commands runs extremely slow
sample input:
NNNNNNNNNNggcaaacagaatccagcagcacatcaaaaagcttatccacAGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggcttatgcc
sample output:
NNNNNNNNNNggcnnncngnnnccngcngcncnncnnnnngcnnnnccncNGNNNNNCNNNNNNNCNNNNNGCNCCNggccnggcgnggnggcnnnngcc
If substring with length k=10 contains >50% of A || a || T || t
(so there are length($0)-k+1 substrings)
substitute A and T with N, a and t with n
The $0 string must maintain it size and sequence (Case sensitive)
EDIT:
I misunderstood the requirement of this problem, and repost the question at here.
Basically:
read a window of characters to two buffers - scratch buffer and output buffer
if in the scratch buffer there are more then some count of characters ATat
then replace all characters ATat in the output buffer buffer to Nn respectively
output one character from the output buffer
flush one character in both buffers
and go to step 1 to repeat reading the characters into buffers
when the end of line is encountered, just flush output buffer and reset it all
A small C program for sure is going to be the fastest:
// The window size
#define N 10
// The percent of the window that has to be equal to one of [AaTt]
#define PERCENT 50
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
// output a string
static void output(char *outme, size_t n) {
fwrite(outme, n, 1, stdout);
}
// is one of [AaTt]
static bool is_one_of_them(char c) {
switch(c) {
case 'A':
case 'a':
case 'T':
case 't':
return true;
}
return false;
}
// Convert one of characters to n/N depending on case
static char convert_them_to_n(char c) {
// switch(c){ case 'T': case 'A': return true; } return false;
// ASCII is assumed
const char m = ~0x1f;
const char w = 'n' & ~m;
return (c & m) | w;
}
static const unsigned threshold = N * PERCENT / 100;
// Store the input in buf
static char buf[N];
// Store the output to-be-outputted in out
static char out[N];
// The current position in buf and out
// The count of readed characters
static size_t pos;
// The count of one of searched characters in buf
static unsigned count_them;
static void buf_reset(void) {
pos = 0;
count_them = 0;
}
static void buf_flush(void) {
output(out, pos);
buf_reset();
}
static void buf_replace_them(void) {
// TODO: this could keep count of characters alrady replaced in out to save CPU
for (size_t i = 0; i < N; ++i) {
if (is_one_of_them(out[i])) {
out[i] = convert_them_to_n(out[i]);
}
}
}
static void buf_flush_one(void) {
assert(pos > 0);
assert(pos == N);
output(out, 1);
count_them -= is_one_of_them(buf[0]);
memmove(buf, buf + 1, pos - 1);
memmove(out, out + 1, pos - 1);
pos--;
}
static void buf_add(char c) {
buf[pos] = out[pos] = c;
pos++;
count_them += is_one_of_them(c);
// if we reached the substring length
if (pos == N) {
// if the count reached the threshold
if (count_them >= threshold) {
// convert the characters to n
buf_replace_them();
}
// flush one character only at a time
buf_flush_one();
}
}
int main() {
int c;
buf_reset();
while ((c = getchar()) != EOF) {
if (c == '\n') {
// If its a newline, just flush what we have buffered
buf_flush();
output("\n", 1);
continue;
}
buf_add(c);
}
buf_flush();
}
Such a C program is easily transferable to for example an awk script, just one need to read one character at a time. Below I split the characters with split, like:
awk -v N=10 -v percent=50 '
BEGIN{ threshold = N * percent / 100; pos=0 }
function is_one_of_them(c) {
return c ~ /^[aAtT]$/;
}
function buf_flush(i) {
for (i = 0; i < pos; ++i) {
printf "%s", out[i]
}
pos = 0
count_them = 0
}
function buf_replace_them(i) {
for (i = 0; i < pos; ++i) {
if (is_one_of_them(out[i])) {
out[i] = out[i] ~ /[AT]/ ? "N" : "n";
}
}
}
function buf_flush_one(i) {
printf "%s", out[0]
count_them -= is_one_of_them(buf[0])
if(0 && debug) {
printf(" count_them %s ", count_them)
for (i = 0; i < pos-1; ++i) {
printf("%s", buf[i+1])
} printf(" ");
for (i = 0; i < pos-1; ++i) {
printf("%s", out[i+1])
}
printf("\n");
}
for (i = 0; i < pos-1; ++i) {
buf[i] = buf[i+1]
out[i] = out[i+1]
}
pos--
}
function buf_add(c) {
buf[pos]=c; out[pos]=c; pos++
count_them += is_one_of_them(c)
if (pos == N) {
if (count_them >= threshold) {
buf_replace_them()
}
buf_flush_one()
}
}
{
split($0, chars, "")
for (idx = 0; idx <= length($0); idx++) {
buf_add(chars[idx])
}
buf_flush();
printf "\n";
}
'
Both programs when run with the input presented in the first line produce the output presented in the second line (note that lone a near the end is not replaced, because there are no 5 charactets ATat in a window of 10 characters from it):
NNNNNNNNNNggcaaacagaatccagcagcacatcaaaaagcttatccacAGTAATTCATTATATCAAAATGCTCCAggccaggcgtggtggcttatgcc
NNNNNNNNNNggcnnncngnnnccngcngcncnncnnnnngcnnnnccncNGNNNNNCNNNNNNNCNNNNNGCNCCNggccaggcgnggnggcnnnngcc
Both solutions were tested on repl.
You need to be careful with how you address this problem. You cannot work on the substituted string. You need to keep track of the original string. Here is a simple example. Assume we have a string consisting of x and y and we want to replace all y with z if there are 8 y in a substring of 10. Imagine your input looks like:
yyyyyyyyxxy
The first substring of 10 reads yyyyyyyyxx and would be translated into zzzzzzzzxx. If you perform the substitution directly into the original string, you get zzzzzzzzxxy. The second substring now reads zzzzzzzxxy, and does not contain 8 times y, while in the original string it does. So according to the solution of the OP, this would lead into inconsistent results, depending on if you start from the front or the back. So a quick solution would be:
awk -v N=10 -v p=50 '
BEGIN { n = N*p/100 }
{ s = $0 }
{ for(i=1;i<=length-N;++i) {
str=substr($0,i,N)
c=gsub(/[AT]/,"N",str) + gsub(/[at]/,"n",str)
if(c >= n) s = substr(s,1,i-1) str substr(s,i+N)
}
}
{ print s }' file
There is ofcourse quite some work you do double here. Imagine you have a string of the form xxyyyyyyyyxx, you would perform 4 concatinations while you only need to do one. So the best idea is to minimalise the work and only check the substrings which end with the respective character:
awk -v N=10 -v p=50 '
BEGIN { n = N*p/100 }
{ s = $0 }
{ i=N; while (match(substr($0,i),/[ATat]/)) {
str=substr($0,i+RSTART-N,N)
c=gsub(/[AT]/,"N",str) + gsub(/[at]/,"n",str)
if(c >= n) { s = substr(s,1,i+RSTART-N-1) str substr(s,i+RSTART)}
i=i+RSTART
}
}
{ print s }' file
To replace $0 at position i with x do:
awk 'BEGIN{i=12345;x="blubber"}
{
printf("%s",substr($0,1,i));
printf("%s",x);
printf("%s",substr($0,i+length(x)));
}'
I don't think there is any faster method.
To replace AGCT with N and agct with n use tr. To replace them only within a range and using awk you should do:
awk 'BEGIN{i=12345;n=123}
{
printf("%s",substr($0,1,i-1));
printf(gsub(/[atgc]/,"n",gsub(/[ATGC]/,"N",substr($0,i,i+n-1))));
printf("%s",substr($0,i+n));
}'
To do more advanced and faster processing you should consider c/c++.

Is there any cli program that can proxy a pipe and throw a failure exit code if the throughput is under a certain threshold?

Many times I have typical ETL code that looks like this
./call_some_api.py | ./extract_transform | ./load_to_some_sql
What if either of the first two scripts stop sending bytes because of some internal error that causes them to stall. I wish there was another program I could put before ./load_to_some_sql that will detect that 0 bytes has been sent in 5 minutes, and throw an exit code.
Curl has --speed-limit and --speed-time which do exactly this, but it's not generalized for other cli apps.
Is there a way to do this? Have a way to crash if the throughput hits a certain level on a pipe?
I know state machines and other orchestration tools can do this, but in general if there's a way to do it from bash it would be helpful!
If you are interested in inactivity timeout, these can do it.
One liner (mentioned in comments). Adjust $t for inactivity seconds:
perl -e'$t=300;$SIG{ALRM}=sub{die"Timeout\n"};alarm$t;while(<>){print;alarm$t}'
More robust:
#include <sys/epoll.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <signal.h>
long secs = 300;
int stdin_mode, stdout_mode;
void my_exit(int code) {
fcntl(STDIN_FILENO, F_SETFL, stdin_mode);
fcntl(STDOUT_FILENO, F_SETFL, stdout_mode);
exit(code);
}
char* progname = "";
void perror_die(char* msg) {
dprintf(STDERR_FILENO, "%s: %s: %m\n", progname, msg);
my_exit(EXIT_FAILURE);
}
int check(int ret, char *msg) {
if (ret == -1) perror_die(msg);
return ret;
}
void usage() {
dprintf(STDERR_FILENO, "Usage:\n-s <secs> inactivity timeout in seconds (default: %ld seconds)\n", secs);
exit(EXIT_SUCCESS);
}
int main(int argc, char* argv[]) {
progname = argv[0];
signal(SIGHUP, SIG_IGN);
stdin_mode = fcntl(STDIN_FILENO, F_GETFL);
stdout_mode = fcntl(STDOUT_FILENO, F_GETFL);
int opt;
while((opt = getopt(argc, argv, "s:h")) != -1)
switch(opt) {
case 's':
secs = strtol(optarg, NULL, 10);
break;
case 'h':
case '?':
usage();
};
if (optind < argc) usage();
int epfd = check(epoll_create(3), "epoll_create");
struct epoll_event ev = {
.events = EPOLLIN | EPOLLET,
.data.fd = STDIN_FILENO
};
check(epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev), "epoll_ctl");
check(fcntl(STDIN_FILENO, F_SETFL, stdin_mode | O_NONBLOCK), "fcntl stdin");
ev.events = 0;
ev.data.fd = STDOUT_FILENO;
int epout = !epoll_ctl(epfd, EPOLL_CTL_ADD, STDOUT_FILENO, &ev);
if (epout)
check(fcntl(STDOUT_FILENO, F_SETFL, stdout_mode | O_NONBLOCK), "fcntl stdout");
char buf[4096];
ssize_t b_read = 0, b_wrote = 0, bytes = 0;
secs *= 1000;
int epres;
while((epres = epoll_wait(epfd, &ev, 1, secs)) == 1) {
if (ev.events & EPOLLERR) {
errno = EPIPE;
break;
}
for(;;) {
if (b_read == 0) {
b_read = read(STDIN_FILENO, buf, sizeof(buf));
if (b_read > 0) bytes += b_read;
else if (b_read == -1) {
if (errno == EAGAIN) {
b_read = 0;
break;
} else perror_die("read");
} else my_exit(EXIT_SUCCESS);
}
if (b_read) {
int w = write(STDOUT_FILENO, buf + b_wrote, b_read - b_wrote);
if (w != -1) b_wrote += w;
else {
if (errno == EAGAIN && epout) {
ev.events = EPOLLOUT | EPOLLONESHOT;
ev.data.fd = STDOUT_FILENO;
check(epoll_ctl(epfd, EPOLL_CTL_MOD, STDOUT_FILENO, &ev), "epoll_ctl");
break;
} else perror_die("write");
}
if (b_wrote == b_read) b_read = b_wrote = 0;
}
}
}
if (epres) perror_die("event loop");
dprintf(STDERR_FILENO, "%s: Timeout reached\n", progname);
my_exit(EXIT_FAILURE);
}
If you are interested in measuring bytes/secs this can do it, on an interval: (calculation could be improved to keep last n secs bandwidth instead of an interval)
#include <sys/epoll.h>
#include <fcntl.h>
#include <sys/timerfd.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <signal.h>
ssize_t min_bw = 1000;
long secs = 5;
int stdin_mode, stdout_mode;
void my_exit(int code) {
fcntl(STDIN_FILENO, F_SETFL, stdin_mode);
fcntl(STDOUT_FILENO, F_SETFL, stdout_mode);
exit(code);
}
char* progname = "";
void perror_die(char* msg) {
dprintf(STDERR_FILENO, "%s: %s: %m\n", progname, msg);
my_exit(EXIT_FAILURE);
}
int check(int ret, char *msg) {
if (ret == -1) perror_die(msg);
return ret;
}
void usage() {
dprintf(STDERR_FILENO, "Usage:\n-b <bytes> minimum bytes per (default: %ld bytes)\n-s <secs> seconds (default: %ld seconds)\n", min_bw, secs);
exit(EXIT_SUCCESS);
}
int main(int argc, char* argv[]) {
progname = argv[0];
signal(SIGHUP, SIG_IGN);
stdin_mode = fcntl(STDIN_FILENO, F_GETFL);
stdout_mode = fcntl(STDOUT_FILENO, F_GETFL);
int opt;
while((opt = getopt(argc, argv, "b:s:h")) != -1)
switch(opt) {
case 'b':
min_bw = strtol(optarg, NULL, 10);
break;
case 's':
secs = strtol(optarg, NULL, 10);
break;
case 'h':
case '?':
usage();
};
if (optind < argc) usage();
int epfd = check(epoll_create(3), "epoll_create");
int tmfd = check(timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK), "timerfd_create");
struct itimerspec when = {{secs, 0}, {secs, 0}};
check(timerfd_settime(tmfd, 0, &when, NULL), "timerfd_settime");
struct epoll_event ev = {
.events = EPOLLIN | EPOLLET,
.data.fd = tmfd
};
check(epoll_ctl(epfd, EPOLL_CTL_ADD, tmfd, &ev), "epoll_ctl");
ev.data.fd = STDIN_FILENO;
check(epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev), "epoll_ctl");
check(fcntl(STDIN_FILENO, F_SETFL, stdin_mode | O_NONBLOCK), "fcntl stdin");
ev.events = 0;
ev.data.fd = STDOUT_FILENO;
int epout = !epoll_ctl(epfd, EPOLL_CTL_ADD, STDOUT_FILENO, &ev);
if (epout)
check(fcntl(STDOUT_FILENO, F_SETFL, stdout_mode | O_NONBLOCK), "fcntl stdout");
char tmbuf[64];
char buf[4096];
ssize_t b_read = 0, b_wrote = 0, bytes = 0;
int epres;
while((epres = epoll_wait(epfd, &ev, 1, -1)) == 1) {
if (ev.events & EPOLLERR) {
errno = EPIPE;
break;
}
if (ev.data.fd == tmfd) {
if (bytes < min_bw) {
dprintf(STDERR_FILENO, "Too slow\n");
my_exit(EXIT_FAILURE);
}
check(read(tmfd, tmbuf, sizeof(tmbuf)), "read timerfd");
bytes = 0;
continue;
}
for(;;) {
if (b_read == 0) {
b_read = read(STDIN_FILENO, buf, sizeof(buf));
if (b_read > 0) bytes += b_read;
else if (b_read == -1) {
if (errno == EAGAIN) {
b_read = 0;
break;
} else perror_die("read");
} else my_exit(EXIT_SUCCESS);
}
if (b_read) {
int w = write(STDOUT_FILENO, buf + b_wrote, b_read - b_wrote);
if (w != -1) b_wrote += w;
else {
if (errno == EAGAIN && epout) {
ev.events = EPOLLOUT | EPOLLONESHOT;
ev.data.fd = STDOUT_FILENO;
check(epoll_ctl(epfd, EPOLL_CTL_MOD, STDOUT_FILENO, &ev), "epoll_ctl");
break;
} else perror_die("write");
}
if (b_wrote == b_read) b_read = b_wrote = 0;
}
}
}
perror_die("event loop");
}
Both programs are for linux only.
Provided you have a newish version of Bash, the read builtin can optionally enforce a timeout [docs], which means that it's actually not hard to write a bit of pure Bash code that does exactly what you describe:
... upstream command ... \
| {
while true ; do
read -r -N 1 -t 300 char
result=$?
if (( result > 128 )) ; then
echo 'Read timed out.' >&2
exit 1
elif (( result > 0 )) ; then
exit 0
elif [[ "$char" == '' ]] ; then
printf '\0'
else
printf %s "$char"
fi
done
} \
| ... downstream command ...
(Note: the above reads one character at a time for simplicity's sake; if you expect a high volume of data to pass through this, then you may need to adjust it for better performance.)
But even though the above does exactly what you describe, I'm not sure whether it will really accomplish your goal, because the upstream command won't actually die until it tries to write something and gets SIGPIPE. So even after the above prints Read timed out, Bash will just keep hanging indefinitely, probably until the user hits Ctrl-C. (The same goes for niry's answer, of course.) That's a bit harder to fix; I guess when your command detects a timeout, it will need to find the process-IDs of the upstream commands, and use kill to proactively send them SIGPIPE (or some other signal of your choosing)?

CUDA string search in large file, wrong result

I am working on simple naive string search in CUDA.
I am new in CUDA. It works fine fol smaller files ( aprox. ~1MB ). After I make these files bigger ( ctrl+a ctrl+c several times in notepad++ ), my program's results are higher ( about +1% ) than a
grep -o text file_name | wc -l
It is very simple function, so I don't know what could cause this. I need it to work with larger files ( ~500MB ).
Kernel code ( gpuCount is a __device__ int global variable ):
__global__ void stringSearchGpu(char *data, int dataLength, char *input, int inputLength){
int id = blockDim.x*blockIdx.x + threadIdx.x;
if (id < dataLength)
{
int fMatch = 1;
for (int j = 0; j < inputLength; j++)
{
if (data[id + j] != input[j]) fMatch = 0;
}
if (fMatch)
{
atomicAdd(&gpuCount, 1);
}
}
}
This is calling the kernel in main function:
int blocks = 1, threads = fileSize;
if (fileSize > 1024)
{
blocks = (fileSize / 1024) + 1;
threads = 1024;
}
clock_t cpu_start = clock();
// kernel call
stringSearchGpu<<<blocks, threads>>>(cudaBuffer, strlen(buffer), cudaInput, strlen(input));
cudaDeviceSynchronize();
After this I just copy the result to Host and print it.
Can anyone please help me with this?
First of all, you should always check return values of CUDA functions to check for errors. Best way to do so would be the following:
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
Wrap your CUDA calls, such as:
gpuErrchk(cudaDeviceSynchronize());
Second, your kernel accesses out of bounds memory. Suppose, dataLength=100, inputLength=7 and id=98. In your kernel code:
if (id < dataLength) // 98 is less than 100, so condition true
{
int fMatch = 1;
for (int j = 0; j < inputLength; j++) // j runs from [0 - 6]
{
// if j>1 then id+j>=100, which is out of bounds, illegal operation
if (data[id + j] != input[j]) fMatch = 0;
}
Change the condition to something like:
if (id < dataLength - inputLength)

Main function with arguments

I'm trying to understand the main function with the arguments argc and argv. In the command line I am trying to copy the contents of multiple txt files on the screen (concatenation). When I write in the command line appname.exe something f1.txt, the content from the f1.txt prints in a loop. If f1.txt had the text "abcda" the output in console would be "abcdaabcdaabcda...". Sorry for my english; can someone help me understand what I did wrong?
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
int main(int argc, char *argv[])
{
int i;
for (i = 2; i <= argc - 1;i+2)
{
FILE *f = fopen(argv[i], "r");
if (f == 0)
{
printf("Error\n");
}
else
{
int x;
while ((x = fgetc(f)) != EOF)
{
printf("%c", x);
}
}
fclose(f);
}
}
Here's one big problem:
for (i = 2; i <= argc - 1;i+2)
I think you mean to do:
for (i = 2; i <= argc - 1; i++)

Resources