I have a written a small utility in D to convert the output of find -print0 to printf %b format. Such a utility already exists (nul2pfb from http://www.dwheeler.com/essays/filenames-in-shell.html), but the link is dead and I could not find the program, so I decided to implement it myself in D. I am using the following zsh command for testing:
diff <(find) <(for i in "$(find -print0 | dd | char2code)"; do printf '%b\n' "$i"; done)
The expected output is empty, but I find that some filenames that include a hyphen are handled wrong.
My source code is this:
import std.stdio;
import std.conv;
import std.ascii;
import std.c.stdlib;
void main()
{
foreach (ubyte[] mybuff; chunks(stdin, 4096)) {
encodeline (mybuff);
}
}
#safe void encodeline (ubyte[] mybuff) {
// char[] outstring;
foreach (ubyte i; mybuff) {
char b = to!char(i);
switch (i) {
case 'a': .. case 'z':
case 'A': .. case 'Z':
case '0': .. case '9':
case '/':
case '.':
case '_':
case ':': writeChar(b); break;
default: writeOctal(b); break;
case 0: writeChar ('\n'); break;
case '\\': writeString(`\\`); break;
case '\t': writeString(`\t`); break;
case '\n': writeString(`\n`); break;
case '\r': writeString(`\r`); break;
case '\f': writeString(`\f`); break;
case '\v': writeString(`\v`); break;
case '\a': writeString(`\a`); break;
case '\b': writeString(`\b`); break;
}
}
// writeString (outstring);
}
#trusted void writeString (string a)
{
write (a);
}
#trusted void writeOctal (int a)
{
try
{
writef ("\\%.#o", a); // leading 0 needed for for zsh printf '%b'
}
catch (std.format.FormatException b)
{
write ("Format exception in function writeOctal");
throw b;
}
}
#trusted void writeChar (char a)
{
try
{
write (a);
}
catch (std.format.FormatException b)
{
write ("Format exception in function writeChar");
throw b;
}
}
#trusted void writeNewline ()
{
writeln;
}
Here is a part of the diff output:
Correct filenames:
./.ibam/profile-004-battery
./.ibam/profile-034-charge
./.ibam/profile-054-charge
./.ibam/profile-045-battery
(a bunch of lines skipped)
---
Wrong filenames:
./.ibam/profileh04-battery
./.ibam/profileh34-charge
./.ibam/profileh54-charge
./.ibam/profileh45-battery
It seems like -0 is being replaced by h.
UPDATE: Replacing .# with .4 in the conversion specifier for writef fixed the problem but I still see differences in output of searches through /proc. However, this is also true when I do (as root)
diff <(find / print) <(find / print) > (myhomedirectory)/log.txt
so I have chosen to ignore it.
As it turned out, the problem was that whenever an octal escape sequence was followed by a number between 0 and 7 inclusive, and the original escape sequence was not already 4 octal digits long, the number became part of the octal escape sequence, causing printf %b to produce incorrect output. This was fixed by replacing the # by .4 in the conversion specifier in the D program.
This is the corrected source code, with some unneeded functions deleted:
import std.stdio;
import std.conv;
import std.ascii;
import std.c.stdlib;
void main()
{
foreach (ubyte[] mybuff; chunks(stdin, 4096)) {
encodeline (mybuff);
}
}
#safe void encodeline (ubyte[] mybuff) {
// char[] outstring;
foreach (ubyte i; mybuff) {
char b = to!char(i);
switch (i) {
case 'a': .. case 'z':
case 'A': .. case 'Z':
case '0': .. case '9':
case '/':
case '.':
case '_':
case ':': writeChar(b); break;
default: writeOctal(b); break;
case 0: writeChar ('\n'); break;
case '\\': writeString(`\\`); break;
case '\t': writeString(`\t`); break;
case '\n': writeString(`\n`); break;
case '\r': writeString(`\r`); break;
case '\f': writeString(`\f`); break;
case '\v': writeString(`\v`); break;
case '\a': writeString(`\a`); break;
case '\b': writeString(`\b`); break;
}
}
}
#trusted void writeString (string a)
{
write (a);
}
#trusted void writeOctal (int a)
{
writef ("\\%.4o", a); // leading 0 needed for for zsh printf '%b'
}
#trusted void writeChar (char a)
{
write (a);
}
Related
This is a program to convert infix to postfix in stack data structures.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int F(char symbol)
{
switch(symbol)
{
case '+':
case '-':return 2;
case '*':
case '/':return 4;
case '^':
case '$':return 5;
case '(':return 0;
case '#':return -1;
default:return 8;
}
}
int G(char symbol)
{
switch(symbol)
{
case '+':
case '-':return 1;
case '*':
case '/':return 3;
case '^':
case '$':return 6;
case '(':return 9;
case ')':return 0;
default:return 7;
}
}
void infixtopostfix(char infix[],char postfix[])
{
int top,i,j=0;
top = -1;
char s[30],symbol;
s[++top]= '#';
for(i=0;i<strlen(infix);i++)
{
symbol = infix[i];
while(F(s[top]) > G(symbol))
{
postfix[j]=s[top--];
j++;
}
if(F(s[top]) != G(symbol))
s[++top]=symbol;
else
top--;
}
while(s[top] != '#')
{
postfix[j++]=s[top--];
}
postfix[j] = '\0';
}
int main()
{
char infix[20],postfix[20];
printf("Enter the infix expression:\n");
scanf("%s",infix);
infixtopostfix(infix,postfix);
printf("Postfix Expression is %s",postfix);
return 0;
}
In this code, what is going on with the following lines?
if(F(s[top]) != G(symbol))
s[++top]=symbol;
else
top--;
}
while(s[top] != '#')
{
postfix[j++]=s[top--];
}
I don’t understand how f(s[top]) != g(symbol) is different than f(s[top]) > g(symbol), because if it is greater, it means automatically it is not equal. And what is f(s[top]) and g(symbol)?
The conditions
f(s[top]) != g(symbol)
and
f(s[top]) > g(symbol)
are both different.
The first one gives false if f(s[top]) and g(symbol) are equal. But in the second condition if f(s[top]) is less than g(symbol) then it generates false. But according to the first condition, true should be generated.
I created a function that compares strings, and as I was frustrated about it always missing the last character in the second string and always returning "identical strings" as a result, I noticed that I was messing around and used gets() instead of fgets() for the second string. I changed that and the function works as expected.
My question is, why does the gets() function subtract that last character? Shouldn't it subtract the null and leave it at that?
Does that mean that as a newcomer to C, I should avoid using gets() and focus on fgets() instead? I'm starting to think of them in the same way I think of strcmp() vs strncmp()
Thanks for your time everyone!
Note: I'm aware that I don't really need the (i==j) at the end, I just left it there (extra security, maybe?).
bool compare_string(const char *string1, const char *string2) {
int i = 0, j = 0, result = 0;
while (string1[i] != '\0') {
i++;
}
while (string2[j] != '\0') {
j++;
}
i = 0;
j = 0;
while ((string1[i] != '\0') && (string2[j] != '\0')) {
if (string1[i] < string2[j]) {
result = -1;
break;
} else if (string1[i] > string2[j]) {
result = 1;
break;
} else if (string1[i] == string2[j]) {
result = 0;
}
i++;
j++;
}
if ((result == 0) && (i==j)) {
printf("identical strings \n");
} else if (result == -1) {
printf("not identical, -1 \n");
} else if (result == 1) {
printf("not identical, 1 \n");
}
}
//in main
char str_compare1[STRING_LIMIT];
char str_compare2[STRING_LIMIT];
printf("enter 1st string to compare, (100) characters or less: \n");
fgets(str_compare1, STRING_LIMIT, stdin);
printf("enter 2nd string to compare, (100) characters or less \n");
fgets(str_compare2, STRING_LIMIT, stdin);
result = compare_string(str_compare1, str_compare2);
I am trying to solve ONP - Transform the Expression in spoj. The question is to transform infix expression into postfix expression. I have used std::stack as my data structure and shunting-yard algorithm for solving it. The code runs fine on my computer using g++. But on spoj, it gives SIGABRT error. Even on ideone, it gives run time error free() invalid pointer.
I have tried several test cases. At first, I thought that my program was taking too much memory, but upon testing with time -v (ubuntu), I found that the maximum space taken was in KB.
// ------------------- infix to postfix conversion ---------------
#include <iostream>
#include <string>
#include <stack>
#include <algorithm>
#include <utility>
using std::stack;
using std::pair;
using std::cout;
using std::cin;
using std::endl;
using std::string;
stack< pair<char, short> > op_st; // operator stack
short op_precedence(char op) {
// return operator precedence
// input: operator; output: its precedence value
switch (op) {
case '+': return 0;
case '-': return 1;
case '*': return 2;
case '/': return 3;
case '^': return 4;
case '(': return 6;
}
}
inline bool is_operator(char sym) {
// is sym an operator?
return (sym == '+' || sym == '-' || sym == '*' || sym == '/' || sym == '^' || sym == '(');
}
inline bool is_operand(char sym) {
// is sym an operand?
return (sym >= 'a' && sym <= 'z');
}
void in_to_post(string & expr) {
// infix to postfix converter
// input: infix expression
for (int i = 0; i < expr.length(); ++i) {
if (is_operator(expr[i])) { // operator
// pop op_stack until the
// top of the stack has less precedence
// than curr operator or stack is empty
while(1) {
if (op_st.empty()) { // stack is empty; straight away push
op_st.push(std::make_pair(expr[i], op_precedence(expr[i])));
break;
}
pair <char, short> & top_op = op_st.top();
if (op_precedence(top_op.second) >= op_precedence(expr[i])) {
cout << top_op.first;
op_st.pop();
}
else {
op_st.push(std::make_pair(expr[i], op_precedence(expr[i])));
break;
}
}
}
else if (is_operand(expr[i])) { // operand; push it to output queue immediately
cout << expr[i];
}
else if (expr[i] == ')') { // right paranthesis
while (1) {
if (op_st.empty()) { // invalid expression; ')' reached before matching '('
//cout << "No matching '(' found\n";
abort();
}
pair <char, short> & top_op = op_st.top();
if (top_op.first == '(') { // matching '(' found; stop
op_st.pop();
break;
}
else {
cout << top_op.first;
op_st.pop();
}
}
}
}
// pop out the whole op_st (if any)
while (!(op_st.empty())) {
pair <char, short> & top_op = op_st.top();
cout << top_op.first;
op_st.pop();
}
}
int main() {
int t;
cin >> t;
for (int i = 0; i < t; ++i) {
string expr;
cin >> expr;
//cout << expr.length() << endl;
in_to_post(expr);
cout << "\n";
}
return 0;
}
Input to program given on my system:
((a+b)-c*(d+e))^((a+b)-c*(d+e))+((a+b)-c*(d+e))-((a+b)-c*(d+e))+((a+b)-c*(d+e))^((a+b)-c*(d+e))+((a+b)-c*(d+e))-((a+b)-c*(d+e))-((a+b)-c*(d+e))^((a+b)-c*(d+e))+((a+b)-c*(d+e))-((a+b)-c*(d+e))*((a+b)-c*(d+e))^((a+b)-c*(d+e))+((a+b)-c*(d+e))-((a+b)-c*(d+e))^((a+b)-c*(d+e))^((a+b)-c*(d+e))+((a+b)-c*(d+e))-((a+b)-c*(d+e))+((a+b)-c*(d+e))^((a+b)-c*(d+e))+((a+b)-c*(d+e))-((a+b)-c*(d+e))^((a+b)-c*(d+e))^((a+b)-c*(d+e))+((a+b)-c*(d+e))-((a+b)-c*(d+e))
Successfully gives the output:
ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*-ab+cde+*--+^^-+^+-+^^-+^*-+^--+^+-+^.
But, the same code gives free() invalid pointer error in ideone. Why is that?
op_precedence(top_op.second) calls op_precedence with the number returned by earlier op_precedence call - not with the operator character.
When op_precedence is passed an argument that doesn't match one of the recognized operators, the program exhibits undefined behavior, by way of reaching the end of a non-void function without encountering a return statement.
So, after Igor Tandetnik kindly pointed the mistake to me, in line 58, I changed op_precedence(top_op.second) to top_op.second. I also added default case to op_precedence function to correct the warning. After compiling and running, this code did actually abort() on line 75 for simple input ((a+b)). It turns out, that my implementation of the algorithm was wrong. My code didn't take into consideration associativity of operator '('. According to the question, we don't need to take into consideration associativity for other operators but for '(' we need its precedence to be higher than all other operators when outside of the stack, and lower than all other operators when inside of the stack. This is to ensure that when any operator other than '(' is the input and the top of the stack is '(', we could push the input operator into the stack. The corrected code is below:
// ------------------- infix to postfix conversion ---------------
#include <iostream>
#include <string>
#include <stack>
#include <algorithm>
#include <utility>
using std::stack;
using std::pair;
using std::cout;
using std::cin;
using std::endl;
using std::string;
short op_out_precedence(char op) {
// return operator precedence (when outside of stack)
// in and out precedence is to take care of associativity
// Here we don't require the associativity of any other operator except '('
// input: operator; output: its precedence value
switch (op) {
case '+': return 1;
case '-': return 2;
case '*': return 3;
case '/': return 4;
case '^': return 5;
case '(': return 6;
default : abort(); // not supposed to happen
// operator can't be other than the mentioned cases
}
}
short op_in_precedence(char op) {
// return operator precedence (when inside of stack)
// in and out precedence is to take care of associativity
// Here we don't require the associativity of any other operator except '('
// input: operator; output: its precedence value
switch (op) {
case '+': return 1;
case '-': return 2;
case '*': return 3;
case '/': return 4;
case '^': return 5;
case '(': return 0;
default : abort(); // not supposed to happen
// operator can't be other than the mentioned cases
}
}
inline bool is_operator(char sym) {
// is sym an operator?
return (sym == '+' || sym == '-' || sym == '*' || sym == '/' || sym == '^' || sym == '(');
}
inline bool is_operand(char sym) {
// is sym an operand?
return (sym >= 'a' && sym <= 'z');
}
void in_to_post(string & expr) {
// infix to postfix converter
// input: infix expression
stack< pair<char, short> > op_st; // operator stack
int len = expr.length();
for (int i = 0; i < len; ++i) {
if (is_operator(expr[i])) { // operator
// pop op_stack until the
// top of the stack has less or equal precedence
// than curr operator or stack is empty
while(1) {
if (op_st.empty()) { // stack is empty; straight away push
op_st.push(std::make_pair(expr[i], op_in_precedence(expr[i])));
break;
}
pair <char, short> & top_op = op_st.top();
if (top_op.second > op_out_precedence(expr[i])) {
cout << top_op.first;
op_st.pop();
}
else {
op_st.push(std::make_pair(expr[i], op_in_precedence(expr[i])));
break;
}
}
}
else if (is_operand(expr[i])) { // operand; push it to output queue immediately
cout << expr[i];
}
else if (expr[i] == ')') { // right paranthesis
while (1) {
if (op_st.empty()) { // invalid expression; ')' reached before matching '('
cout << "No matching '(' found\n";
abort();
}
pair <char, short> & top_op = op_st.top();
if (top_op.first == '(') { // matching '(' found; stop
op_st.pop();
break;
}
else {
cout << top_op.first;
op_st.pop();
}
}
}
}
// pop out the whole op_st (if any)
while (!(op_st.empty())) {
pair <char, short> & top_op = op_st.top();
cout << top_op.first;
op_st.pop();
}
}
int main() {
int t;
cin >> t;
for (int i = 0; i < t; ++i) {
string expr;
cin >> expr;
//cout << expr.length() << endl;
in_to_post(expr);
cout << "\n";
}
return 0;
}
I am using the following one liner to list the occurrences of combinations of ATCG, forming string of length 6. It works fine aside from not printing the occurrence of 0 matches. Is there a way to change the regex, or another part, to where it will print something like "0 ATTTAG"?
#!/bin/bash
for file in e_coli.fa
do
base=$(basename $file .fa)
cat $file | perl -nE 'say for /(?<=([ATCG]{6}))/g' \
| sort | uniq -c >> ${base}_hexhits_6mer.txt
done
stdout:
465 AAAAAA
607 AAAAAC
661 AAAAAG
581 AAAAAT
563 AAAACA
807 AAAACC
770 AAAACG
373 AAAACT
663 AAAAGA
1213 AAAAGC
Since uniq -c counts the number of times a line occurs, it can't possibly return 0. The requested change requires a complete rewrite.
perl -e'
while (<>) {
++$counts{$_} for /(?=([ATCG]{6}))/g;
}
for my $seq (glob("{A,C,G,T}" x 6)) {
printf("%7d %s\n", $counts{$seq}, $seq);
}
' "$file" >"${base}_hexhits_6mer.txt"
The simplest way is to build a hash of occurrence counts for each pattern, and then print the count of all possible patterns
This program uses the glob trick to generate a list of all possible six-character strings formed from A, T, C and G
use strict;
use warnings 'all';
my #files = qw/ e_coli.fa /;
my %counts;
for my $file ( #files ) {
open my $fh, '<', $file or die qq{Unable to open "$file" for input: $!};
while ( <$fh> ) {
++$counts{$1} while /(?= ( [ATCG]{6} ) ) /gx;
}
}
for my $pattern ( glob '{A,T,C,G}' x 6 ) {
printf "%4d %s\n", $counts{$pattern} // 0, $pattern;
}
In case you have a lot of data and you need something a little faster, here's a C solution:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void reader(FILE* in, unsigned long hist[4096]) {
for (unsigned long key=0, count=0;;) {
switch(getc(in)) {
case EOF: return;
case 'A': key <<= 2; break;
case 'C': key <<= 2; key += 1; break;
case 'G': key <<= 2; key += 2; break;
case 'T': key <<= 2; key += 3; break;
default: count=0; continue;
}
if (count == 5) ++hist[key & 0xFFF];
else ++count;
}
}
int putkey(FILE* out, unsigned long key) {
char s[6];
for (int j=6; j--; key >>= 2) s[j] = "ACGT"[key&3];
return fprintf(out, "%.6s", s);
}
void writer(FILE* out, unsigned long hist[4096]) {
for (unsigned long key = 0; key < 4096; ++key) {
fprintf(stdout, "%7lu ", hist[key]);
putkey(out, key);
putchar('\n');
}
}
int main(int argc, char** argv) {
FILE* in = stdin;
if (argc > 1) in = fopen(argv[1], "r");
if (!in) { perror(argv[1]); exit(1); }
unsigned long hist[4096] = {0};
reader(in, hist);
writer(stdout, hist);
return 0;
}
It took a bit less than half a second to process a 31MB fastq sample (which, as it happens, includes all 4096 possible six-character sequences); the Perl solutions took 12 seconds (fugu) and 18 seconds (ikegami/borodin), respectively.
What you're asking to do is much more complicated. To know what you haven't seen requires that you first know all the possible combinations of characters, that you can then filter against.
Here, I use a sliding window approach using substr in Perl to find tally all "seen" ATCG characters in a string of As, Ts, Cs and Gs (as read from __DATA__) in a hash. These are then sorted, so that the most commonly observed 6-mer is shown first, and printed out.
use strict;
use warnings;
my #bases = qw/ A G C T /;
my %data;
for my $a1(#bases){
for my $a2(#bases){
for my $a3(#bases){
for my $a4(#bases){
for my $a5(#bases){
for my $a6(#bases){
$data{"$a1$a2$a3$a4$a5$a6"} = 0;
}
}
}
}
}
}
my $nucs = <DATA>;
my $len = length($nucs);
for (my $i = 0; $i <= $len - 6; $i++) {
my $kmer = substr($nucs, $i, 6);
next if $kmer =~ tr/ACGT//c;
$data{$kmer}++; # populate hash with "seen" 6-mers
}
# print out sorted hash
foreach my $seq (sort { $data{$b} <=> $data{$a} } keys %data ){
print "$seq,$data{$seq}\n";
}
__DATA__
ATGCCCGTCGTAGTCATGCATGCATCGATCGATGCATGCTACGTGTTGT
There is clearly going to be a better/prettier way of calculating all permutations of characters in a string than what I've done, but it works.
As Borodin says, this prints out mostly variations of "unseen" strings.
#!/bin/bash
for i in {10..11}
do
./duffing -a 1 -b -1 -u 0.25 -w -1 -A 0.4 -t $i | ./stroboscopic > $i.data
done
The $i doesn't seem to work in the program parameter line but I get the data files. This is the error
error converting i to a double for max-time:
leftover characters: i
Bad input data
error converting i to a double for max-time:
leftover characters: i
Bad input data
This is where I parse arguments in duffing program :
void parse_args(int argc, char **argv, state_t *state, system_t *system,
simulation_t *simulation, int *read_initial, int *print_final)
{
int ch;
duffing_state_t *duffing = (duffing_state_t *)system->dx_dt_state;
double dtemp;
size_t i;
while (1) {
ch = getopt_long(argc, argv, short_options, long_options, NULL);
if (ch == -1)
break;
switch(ch) {
case 'd':
simulation->dt = safe_strtod(optarg, "time-step");
break;
case 't':
simulation->t_max = safe_strtod(optarg, "max-time");
break;
case 'T':
duffing->T = safe_strtod(optarg, "transient-time");
break;
case 'x':
state->x[0] = safe_strtod(optarg, "x0");
break;
case 'v':
state->x[1] = safe_strtod(optarg, "v0");
break;
case 'm':
system->m = safe_strtod(optarg, "mass");
break;
case 'a':
duffing->a = safe_strtod(optarg, "alpha");
break;
case 'b':
duffing->b = safe_strtod(optarg, "beta");
break;
case 'u':
duffing->u = safe_strtod(optarg, "mu");
break;
case 'w':
duffing->w = safe_strtod(optarg, "omega");
break;
case 'A':
duffing->A = safe_strtod(optarg, "amplitude");
break;
case 'E':
simulation->step_fn = &euler_step;
break;
case 'M':
simulation->step_fn = &midpoint_step;
break;
case 'R':
simulation->step_fn = &rk4_step;
break;
case 'i':
*read_initial = 1;
break;
case 'f':
*print_final = 1;
break;
case '?':
exit(EXIT_FAILURE);
default:
fprintf(stderr, "?? getopt returned character code 0%o ??\n", ch);
}
}
/* convert input from periods to seconds */
simulation->t_max *= 2*M_PI / duffing->w;
duffing->T *= 2*M_PI / duffing->w;
return;
}
I have ran the program straight from the terminal with the -t 10 so I'm pretty confused why the program is refusing to accept the input from the script.