How to reformat a text file - awk - bash

I have a text file containing 3 columns:
broke banana 192
broke apple 175
broke avocado 20
fixed banana 117
fixed apple 89
I need the output below:
Issue,banana,apple,avocado
broke,192,175,20
fixed,117,90,0
I am new to this and have no idea how to get this result.
I appreciate any help,
Thanks

Input
$ cat file
broke banana 192
broke apple 175
fixed banana 117
fixed apple 89
I don't understand where you got fixed,117,90 from in the expected output.
Output
$ awk -v OFS=, '{
    is_fr[$1,$2] = $3
    if (!($1 in i_tmp)) issue[++i] = $1   # remember first-seen order of issues
    if (!($2 in f_tmp)) fruit[++f] = $2   # remember first-seen order of fruits
    i_tmp[$1]; f_tmp[$2]
}
END{
    printf("%s", "issue")
    for (i=1; i in fruit; i++)
        printf("%s%s", OFS, fruit[i])
    print ""
    for (i=1; i in issue; i++) {
        printf("%s", issue[i])
        for (j=1; j in fruit; j++)
            printf("%s%s", OFS, (issue[i],fruit[j]) in is_fr ? is_fr[issue[i],fruit[j]] : "")
        print ""
    }
}' file
issue,banana,apple
broke,192,175
fixed,117,89
If order doesn't matter, then you may try the awk below:
$ awk -v OFS=, '{
    issue[$1]
    fruit[$2]
    is_fr[$1,$2] = $3
}
END{
    printf("%s", "issue")
    for (i in fruit)
        printf("%s%s", OFS, i)
    print ""
    for (i in issue) {
        printf("%s", i)
        for (j in fruit)
            printf("%s%s", OFS, (i,j) in is_fr ? is_fr[i,j] : "")
        print ""
    }
}' file
issue,apple,banana
fixed,89,117
broke,175,192
For the new input edited into the question by the OP:
akshay@db-3325:/tmp$ cat f
broke banana 192
broke apple 175
broke avocado 20
fixed banana 117
fixed apple 89
akshay@db-3325:/tmp$ awk -v OFS=, '{
    is_fr[$1,$2] = $3
    if (!($1 in i_tmp)) issue[++i] = $1
    if (!($2 in f_tmp)) fruit[++f] = $2
    i_tmp[$1]; f_tmp[$2]
}
END{
    printf("%s", "issue")
    for (i=1; i in fruit; i++)
        printf("%s%s", OFS, fruit[i])
    print ""
    for (i=1; i in issue; i++) {
        printf("%s", issue[i])
        for (j=1; j in fruit; j++)
            printf("%s%s", OFS, (issue[i],fruit[j]) in is_fr ? is_fr[issue[i],fruit[j]]+0 : 0)
        print ""
    }
}' f
issue,banana,apple,avocado
broke,192,175,20
fixed,117,89,0
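The +0 in that final printf is what fills the gaps: referencing a missing awk array element yields the empty string, and adding 0 coerces it to the number 0. A quick illustration (a made-up one-liner, not part of the original answers):
$ awk 'BEGIN { printf "missing: [%s]  coerced: [%s]\n", miss["x"], miss["y"]+0 }'
missing: []  coerced: [0]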

$ cat tst.awk
BEGIN { OFS="," }
{
    states[$1]
    fruits[$2]
    count[$1,$2] += $NF
}
END {
    printf "%s", "Issue"
    for (fruit in fruits) {
        printf "%s%s", OFS, fruit
    }
    print ""
    for (state in states) {
        printf "%s", state
        for (fruit in fruits) {
            printf "%s%s", OFS, count[state,fruit]+0
        }
        print ""
    }
}
$ awk -f tst.awk file
Issue,apple,banana,avocado
fixed,89,117,0
broke,175,192,20

Related

Conditional vlookup in bash with awk or sed?

I have two files (both with headers); each line of both files starts with a date in the first column, in the same format. The separator is a semicolon.
In the 9th column of the first file, I can only have one of these ids: UK, JPN, or EUR.
I need to enrich file1 with the intel from file2 for the corresponding date.
I can try to do it with a bash script and a "for" loop of course, but I'm sure that resource-wise it will be better with an awk or other shell command... if possible!
Thanks in advance for any hint.
ps: I tried unsuccessfully to adapt this method: https://unix.stackexchange.com/questions/428861/vlookup-equivalent-in-awk-scripting
The first file:
Date;$2;$3;$4;$5;$6;$7;$8;Id
2018-01-01; ;UK
2018-01-02; ;JPN
2018-01-03; ;EUR
2018-01-04; ;JPN
The second file:
Date;UKDIR;JPNDIR;EURDIR
2018-01-01;1;2;3
2018-01-02;4;5;6
2018-01-03;7;8;9
2018-01-04;11;10;12
Expected return
Date;$2;$3;$4;$5;$6;$7;$8;Id ;Intel
2018-01-01; ;UK ;1
2018-01-02; ;JPN ;5
2018-01-03; ;EUR ;9
2018-01-04; ;JPN ;10
You may use this awk:
awk -F';' -v OFS='; ' 'NR==1 { for (i=2; i<=NF; i++) h[i]=$i; next }
FNR==NR { for (i=2; i<=NF; i++) a[$1,h[i]]=$i; next }
FNR==1 { print $0, "Intel"; next }
{ print $0, a[$1,$NF "DIR"] }' file2 file1
Date;$2;$3;$4;$5;$6;$7;$8;Id; Intel
2018-01-01; ;UK; 1
2018-01-02; ;JPN; 5
2018-01-03; ;EUR; 9
2018-01-04; ;JPN; 10
Could you please try the following.
awk '
BEGIN{
    count=count1=1
    FS=OFS=";"
}
FNR!=NR && FNR==1{
    print $0 OFS "Intel"
}
FNR==NR && /^[0-9]/{
    a[$1]=$(++count)
    count=count==4?1:count
    next
}
NF && /^[0-9]/{
    print $0 OFS a[$1]
    count1=count1==4?1:count1
}
' second_file first_file
Output will be as follows. (Note the last row: this solution assigns the second file's columns cyclically rather than looking them up by id, so it matches the expected output only while the ids happen to appear in column order; that is why the last row shows 11 where the question expects 10.)
Date;$2;$3;$4;$5;$6;$7;$8;Id;Intel
2018-01-01; ;UK;1
2018-01-02; ;JPN;5
2018-01-03; ;EUR;9
2018-01-04; ;JPN;11
$ cat tst.awk
BEGIN { FS=OFS=";" }
NR==FNR {
    if (NR == 1) {
        for (fldNr=2; fldNr<=NF; fldNr++) {
            fldName = $fldNr
            sub(/DIR/,"",fldName)
            fldNr2name[fldNr] = fldName
        }
    }
    else {
        for (fldNr=2; fldNr<=NF; fldNr++) {
            fldName = fldNr2name[fldNr]
            dateFldName2val[$1,fldName] = $fldNr
        }
    }
    next
}
{
    print $0, (FNR>1 ? dateFldName2val[$1,$NF] : "Intel")
}
$ awk -f tst.awk file2 file1
Date;$2;$3;$4;$5;$6;$7;$8;Id;Intel
2018-01-01; ;UK;1
2018-01-02; ;JPN;5
2018-01-03; ;EUR;9
2018-01-04; ;JPN;10

match pattern and print corresponding columns from a file using awk or grep

I have a input file with repetitive headers (below):
A1BG A1BG A1CF A1CF A2ML1
aa bb cc dd ee
1 2 3 4 5
I want to print all columns with the same header in one file. E.g. for the above file there should be three output files: one for A1BG with 2 columns, a second for A1CF with 2 columns, and a third for A2ML1 with 1 column. Is there any way to do it using one-liners with awk or grep?
I tried the following one-liner:
awk -v f="A1BG" '!o{for(x=1;x<=NF;x++)if($x==f){o=1;next}}o{print $x}' trial.txt
but this searches for the pattern in only one column (column 1 in this case). I want to look through all the header names and print all the corresponding columns which have A1BG in their header.
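For what it's worth, a sketch of one way to repair that attempt for a single header (all names here are illustrative): collect every column index whose header matches on line 1, then print those columns for every record.
$ awk -v f="A1BG" '
NR==1 { for (i=1; i<=NF; i++) if ($i==f) cols[i] }   # remember every matching column
{
    sep = ""
    for (i=1; i<=NF; i++)
        if (i in cols) { printf "%s%s", sep, $i; sep = "\t" }
    print ""
}' trial.txt
A1BG A1BG
aa bb
1 2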
This awk solution takes the same approach as Lars's but uses gawk 4.0's true 2D arrays:
awk '
# fill cols map of header to its list of columns
NR==1 {
    for (i=1; i<=NF; ++i) {
        if (!($i in cols))
            j = 0
        cols[$i][j++] = i
    }
}
{
    # write tab-delimited columns for each header to its cols.header file
    for (h in cols) {
        of = "cols." h
        for (i=0; i < length(cols[h]); ++i) {
            if (i > 0) printf("\t") > of
            printf("%s", $cols[h][i]) > of
        }
        printf("\n") > of
    }
}
'
This awk solution should be pretty fast; output files are tab-delimited and named cols.A1BG, cols.A1CF, etc.
awk '
# fill cols columns map to header and tab map to track tab state per header
NR==1 {
    for (i=1; i<=NF; ++i) {
        cols[i] = $i
        tab[$i] = 0
    }
}
{
    # reset tab state for every header
    for (h in tab) tab[h] = 0
    # write tab-delimited column to its cols.header file
    for (i=1; i<=NF; ++i) {
        hdr = cols[i]
        of = "cols." hdr
        if (tab[hdr]) {
            printf("\t") > of
        } else
            tab[hdr] = 1
        printf("%s", $i) > of
    }
    # newline for every header file
    for (h in tab) {
        of = "cols." h
        printf("\n") > of
    }
}
'
This is the output from both of my awk solutions:
$ ./scr.sh <in.txt; head cols.*
==> cols.A1BG <==
A1BG A1BG
aa bb
1 2
==> cols.A1CF <==
A1CF A1CF
cc dd
3 4
==> cols.A2ML1 <==
A2ML1
ee
5
I cannot help you with a 1-liner but here is a 10-liner for GNU awk:
script.awk
NR == 1 { PROCINFO["sorted_in"] = "@ind_num_asc"
          for( i=1; i<=NF; i++ ) { f2c[$i] = (i==1)? i : f2c[$i] " " i } }
{ for( n in f2c ) {
      split( f2c[n], fls, " ")
      tmp = ""
      for( f in fls ) tmp = (f == 1) ? $fls[f] : tmp "\t" $fls[f]
      print tmp > n
  }
}
Use it like this: awk -f script.awk your_file
In the first action, it determines the filenames from the columns in the first record (NR == 1).
In the second action, for each record and for each output file, the file's columns (as defined in the first record) are collected into tmp and written to the output file.
The use of PROCINFO requires GNU awk; see Ed Morton's comments for alternatives.
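If you cannot use GNU awk, a portable sketch of the same idea (illustrative, not from the original answer) is to record the first-seen header order in a numbered array and loop with a counter instead of PROCINFO:
NR == 1 { for (i=1; i<=NF; i++) {
              if (!($i in f2c)) order[++nh] = $i   # remember first-seen header order
              f2c[$i] = f2c[$i] " " i              # leading blank is harmless to split()
          } }
{ for (k=1; k<=nh; k++) {
      n = order[k]
      m = split(f2c[n], fls, " ")
      tmp = ""
      for (f=1; f<=m; f++) tmp = (f==1) ? $fls[f] : tmp "\t" $fls[f]
      print tmp > n
  } }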
Example run and output:
> awk -f mpapccfaf.awk mpapccfaf.csv
> cat A1BG
A1BG A1BG
aa bb
1 2
Here y'go, a one-liner as requested:
awk 'NR==1{for(i=1;i<=NF;i++)a[$i][i]}{PROCINFO["sorted_in"]="@ind_num_asc";for(n in a){c=0;for(f in a[n])printf"%s%s",(c++?OFS:""),$f>n;print"">n}}' file
The above uses GNU awk 4.* for true multi-dimensional arrays and sorted_in.
For anyone else reading this who prefers clarity over the brevity the OP needs, here it is as a more natural multi-line script:
$ cat tst.awk
NR==1 {
    for (i=1; i<=NF; i++) {
        names2fldNrs[$i][i]
    }
}
{
    PROCINFO["sorted_in"] = "@ind_num_asc"
    for (name in names2fldNrs) {
        c = 0
        for (fldNr in names2fldNrs[name]) {
            printf "%s%s", (c++ ? OFS : ""), $fldNr > name
        }
        print "" > name
    }
}
$ awk -f tst.awk file
$ cat A1BG
A1BG A1BG
aa bb
1 2
$ cat A1CF
A1CF A1CF
cc dd
3 4
$ cat A2ML1
A2ML1
ee
5
Since you wrote in one of the comments to my other answer that you have 20000 columns, let's consider a two-step approach, to ease debugging and to find out which of the steps breaks.
step1.awk
NR == 1 { PROCINFO["sorted_in"] = "@ind_num_asc"
          for( i=1; i<=NF; i++ ) { f2c[$i] = (f2c[$i]=="")? "$" i : (f2c[$i] ", $" i) } }
NR == 2 { for( fn in f2c ) printf("%s:%s\n", fn, f2c[fn])
          exit
        }
Step1 should give us a list of files together with their columns:
> awk -f step1.awk yourfile
Mpap_1:$1, $2, $3, $5, $13, $19, $25
Mpap_2:$4, $6, $8, $12, $14, $16, $20, $22, $26, $28
Mpap_3:$7, $9, $10, $11, $15, $17, $18, $21, $23, $24, $27, $29, $30
In my test data, Mpap_1 is the header of columns 1, 2, 3, 5, 13, 19, 25. Let's hope that this first step works with your large set of columns. (To be frank: I don't know if awk can deal with $20000.)
Step 2: let's create one of those famous one-liners:
> awk -f step1.awk yourfile | awk -F : 'BEGIN {print "{"}; {print " print " $2, "> \"" $1 "\"" }; END { print "}" }' | awk -v "OFS=\t" -f - yourfile
The first part is our step 1; the second part builds a second awk script on the fly, with lines like this: print $1, $2, $3, $5, $13, $19, $25 > "Mpap_1". This second awk script is piped to the third part, which reads the script from stdin (-f -) and applies it to your input file.
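With the sample headers above, the generated middle script would look something like this (illustrative):
{
 print $1, $2, $3, $5, $13, $19, $25 > "Mpap_1"
 print $4, $6, $8, $12, $14, $16, $20, $22, $26, $28 > "Mpap_2"
 print $7, $9, $10, $11, $15, $17, $18, $21, $23, $24, $27, $29, $30 > "Mpap_3"
}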
In case something does not work, watch the output of each part of step 2: you can execute the parts from the left up to (but not including) each of the | symbols and see what is going on, e.g.:
awk -f step1.awk yourfile
awk -f step1.awk yourfile | awk -F : 'BEGIN {print "{"}; {print " print " $2, "> \"" $1 "\"" }; END { print "}" }'
The following worked for me:
code for step1.awk:
NR == 1 { PROCINFO["sorted_in"] = "@ind_num_asc"
          for( i=1; i<=NF; i++ ) { f2c[$i] = (f2c[$i]=="")? "$" i : (f2c[$i] " \"\t\" $" i) } }
NR == 2 { for( fn in f2c ) printf("%s:%s\n", fn, f2c[fn])
          exit
        }
Then run this one-liner, which uses the above awk script:
awk -f step1.awk file.txt | awk -F : 'BEGIN {print "{"}; {print " print " $2, "> \"" $1".txt" "\"" }; END { print "}" }'| awk -f - file.txt
This outputs tab-delimited .txt files, with all the columns having the same header in one file (a separate file for each distinct header).
Thanks Lars Fischer and others.
Cheers

Awk script within shell script

I wrote an awk script to be executed while looping over the {a..z}.txt files. I've been staring at this code for 30 minutes, but I just can't find what's wrong. The terminal complains that there is a syntax error around >, but I don't think that's where the bug is.
Basically, what I'm trying to do is this:
Each line contains a string and a following set of numbers. I want to re-print the numbers so that the first number is the smallest one of them.
input: a 1125159 2554 290 47364290 47392510 48629708 68 60771
output: a 290 1125159 2554 47364290 47392510 48629708 68 60771
Could anyone help me find what is wrong with the below code?
for alphabet in {a..z}
do
awk -F$'\t' "NF>2{maxId=\$2;maxIndex=2;
for(i=2; i<=NF; i++){
if(maxId>\$i){maxId=\$i; maxIndex=i}
};
printf \"%s \t %s \t\",\$1, maxId;
for(i=2; i<=NF; i++){
if(i!=maxIndex)
printf \"%d \t\", \$i};
printf \"\n\";
}" $alphabet.merged > $alphabet.out
done
Here's how your script should really be written:
awk 'BEGIN { FS=OFS="\t" }
NF>2 {
minIndex = 2
for (i=3; i<=NF; i++) {
if ( $minIndex > $i ) {
minIndex = i
}
}
printf "%s%s%s", $1, OFS, $minIndex
for (i=2; i<=NF; i++) {
if ( i != minIndex ) {
printf "%s%s", OFS, $i
}
}
print ""
}' file
a 68 1125159 2554 290 47364290 47392510 48629708 60771
Don't shy away from white space and brackets, as they help readability. I don't understand the purpose of the surrounding shell loop in your question though; I suspect all you really need is:
awk 'BEGIN { FS=OFS="\t" }
FNR==1 { close(out); out=FILENAME; sub(/merged/,"out",out) }   # a.merged -> a.out, one output file per input file
NF>2 {
minIndex = 2
for (i=3; i<=NF; i++) {
if ( $minIndex > $i ) {
minIndex = i
}
}
printf "%s%s%s", $1, OFS, $minIndex > out
for (i=2; i<=NF; i++) {
if ( i != minIndex ) {
printf "%s%s", OFS, $i > out
}
}
print "" > out
}' *.merged

Rearranging a csv file

I have a file with contents similar to the below
Boy,Football
Boy,Football
Boy,Football
Boy,Squash
Boy,Tennis
Boy,Football
Girl,Tennis
Girl,Squash
Girl,Tennis
Girl,Tennis
Boy,Football
How can I use 'awk' or similar to rearrange this to the below:
Football Tennis Squash
Boy 5 1 1
Girl 0 3 1
I'm not even sure if this is possible, but any help would be great.
$ cat tst.awk
BEGIN{ FS=","; OFS="\t" }
{
    genders[$1]
    sports[$2]
    count[$1,$2]++
}
END {
    printf ""                 # empty corner cell; each sport below is preceded by OFS
    for (sport in sports) {
        printf "%s%s", OFS, sport
    }
    print ""
    for (gender in genders) {
        printf "%s", gender
        for (sport in sports) {
            printf "%s%s", OFS, count[gender,sport]+0
        }
        print ""
    }
}
$ awk -f tst.awk file
Squash Tennis Football
Boy 1 1 5
Girl 1 3 0
In general, when you know the end point of the loop, you put the OFS or ORS after each field:
for (i=1; i<=n; i++) {
printf "%s%s", $i, (i<n?OFS:ORS)
}
but if you don't, then you put the OFS before the second and subsequent fields and print the ORS after the loop:
for (x in array) {
printf "%s%s", (++i>1?OFS:""), array[x]
}
print ""
I do like the:
n = length(array)
for (x in array) {
printf "%s%s", array[x], (++i<n?OFS:ORS)
}
idea to get the end of the loop too, but length(array) is gawk-specific.
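A portable replacement is to count the elements yourself before the printing loop, for example:
n = 0
for (x in array) n++
i = 0
for (x in array) {
    printf "%s%s", array[x], (++i < n ? OFS : ORS)
}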
Another approach to consider:
$ cat tst.awk
BEGIN{ FS=","; OFS="\t" }
{
    for (i=1; i<=NF; i++) {
        if (!seen[i,$i]++) {
            map[i,++num[i]] = $i
        }
    }
    count[$1,$2]++
}
END {
    for (i=0; i<=num[2]; i++) {
        printf "%s%s", map[2,i], (i<num[2] ? OFS : ORS)
    }
    for (i=1; i<=num[1]; i++) {
        printf "%s%s", map[1,i], OFS
        for (j=1; j<=num[2]; j++) {
            printf "%s%s", count[map[1,i],map[2,j]]+0, (j<num[2] ? OFS : ORS)
        }
    }
}
$ awk -f tst.awk file
Football Squash Tennis
Boy 5 1 1
Girl 0 1 3
That last one will print the rows and columns in the order they were read. Not quite as obvious how it works, though :-).
I would just loop normally:
awk -F, -v OFS="\t" '
{ names[$1]; sport[$2]; count[$1,$2]++ }
END {
    printf "%s", OFS
    for (i in sport)
        printf "%s%s", i, OFS
    print ""
    for (n in names) {
        printf "%s%s", n, OFS
        for (s in sport)
            printf "%s%s", count[n,s] ? count[n,s] : 0, OFS
        print ""
    }
}' file
This keeps track of three arrays: names[] for the first column, sport[] for the second column and count[name,sport] to count the occurrences of every combination.
Then, it is a matter of looping through the results and printing them in a fancy way and making sure 0 is printed if the count[a,b] does not exist.
Test
$ awk -F, -v OFS="\t" '{names[$1]; sport[$2]; count[$1,$2]++} END{printf "%s", OFS; for (i in sport) printf "%s%s", i, OFS; print ""; for (n in names) {printf "%s%s", n, OFS; for (s in sport) printf "%s%s", count[n,s]?count[n,s]:0, OFS; print ""}}' a
Squash Tennis Football
Boy 1 1 5
Girl 1 3 0
The format is a bit ugly; there are some trailing OFS characters.
To get rid of the trailing OFS:
awk -F, -v OFS="\t" '{names[$1]; sport[$2]; count[$1,$2]++} END{printf "%s", OFS; for (i in sport) {cn++; printf "%s%s", i, (cn<length(sport)?OFS:ORS)} for (n in names) {cs=0; printf "%s%s", n, OFS; for (s in sport) {cs++; printf "%s%s", count[n,s]?count[n,s]:0, (cs<length(sport)?OFS:ORS)}}}' a
You can always pipe to column -t for a nice output.
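For example (made-up data; column -t pads each cell to its column width):
$ printf 'x\ty\tz\n1\t22\t333\n' | column -t
x  y   z
1  22  333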

How to Convert two columns of CSV files to consecutive integers?

Hello, say I have a file file1.csv which has 2 columns, a and b, both of which are 22-character strings. It looks something like this:
hWcYwgRKOD77hfm1oKE0IA,5HleiJXMsFkGEsr8Jqr3Ug
hWcYwgRKOD77hfm1oKE0IA,rCDlYd2WHJuiT05sYGxaVA
65q0c2Iw03B8eSuHHTETHw,G40NUD0/op+13yjzBw+hrw
65q0c2Iw03B8eSuHHTETHw,1u8UW/cQ4i1vbSF9wvzu3w
...
And I would like to convert the a, b columns into consecutive integers like:
1,1
1,2
2,3
2,4
Does anyone know how I can do it? I am using Ubuntu 12.04, by the way.
And what if I have another file file2.csv with columns a' and b'? Is there any way to do the same thing to file2 so that, if "hWcYwgRKOD77hfm1oKE0IA" is 1 in file1, then "hWcYwgRKOD77hfm1oKE0IA" is also 1 in file2 wherever it appears? The same goes for columns b and b'. And I would like to have a single output for each of those two files: result1.csv and result2.csv.
awk -F, -v OFS=, '{ if ($1 in a) { $1 = a[$1] } else { $1 = a[$1] = ++x }   # first sighting in column 1 gets the next integer
                    if ($2 in b) { $2 = b[$2] } else { $2 = b[$2] = ++y } } 1' file
Or perhaps simpler, but possibly less efficient:
awk -F, -v OFS=, '!($1 in a) { a[$1] = ++x } { $1 = a[$1] }
!($2 in b) { b[$2] = ++y } { $2 = b[$2] } 1' file
Or dynamic to any number of columns:
awk -F, -v OFS=, '{ for (i = 1; i <= NF; ++i)
if ((i, $i) in a) { $i = a[i, $i] }
else { $i = a[i, $i] = ++x[i] } } 1' file
Which is also similar to:
awk -F, -v OFS=, '{ for (i = 1; i <= NF; ++i) {
if (!((i, $i) in a)) a[i, $i] = ++x[i]
$i = a[i, $i] } } 1' file
Output:
1,1
1,2
2,3
2,4
UPDATE
To apply it to two files, try:
awk -F, -v OFS=, '{ if ($1 in a) { $1 = a[$1] } else { $1 = a[$1] = ++x }
if ($2 in b) { $2 = b[$2] } else { $2 = b[$2] = ++y }
print > "result_" FILENAME }' file1 file2
UPDATE 02
awk -F, -v OFS=, '!($1 in a) { a[$1] = ++x } !($2 in b) { b[$2] = ++y }
{ print $1, $2, a[$1], b[$2] }' file
Output:
hWcYwgRKOD77hfm1oKE0IA,5HleiJXMsFkGEsr8Jqr3Ug,1,1
hWcYwgRKOD77hfm1oKE0IA,rCDlYd2WHJuiT05sYGxaVA,1,2
65q0c2Iw03B8eSuHHTETHw,G40NUD0/op+13yjzBw+hrw,2,3
65q0c2Iw03B8eSuHHTETHw,1u8UW/cQ4i1vbSF9wvzu3w,2,4
File-by-file version:
awk -F, -v OFS=, '!($1 in a) { a[$1] = ++x } !($2 in b) { b[$2] = ++y }
{ print $1, $2, a[$1], b[$2] > "result_" FILENAME }' file1 file2
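Assuming the inputs are really named file1.csv and file2.csv, the run would look like this (a sketch; the redirection target is parenthesized, which is safer across awk implementations):
$ awk -F, -v OFS=, '!($1 in a) { a[$1] = ++x } !($2 in b) { b[$2] = ++y }
      { print $1, $2, a[$1], b[$2] > ("result_" FILENAME) }' file1.csv file2.csv
$ ls result_*
result_file1.csv  result_file2.csv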
