find differences and similarities between two text files using awk - bash

I have two files:
file 1
1
2
34:rt
4
file 2
1
2
34:rt
7
I want to display rows that are in file 2 but not in file 1, vice versa, and the values present in both text files. So the expected result should look like:
1 in both
2 in both
34:rt in both
4 in file 1
7 in file 2
This is what I have so far but I am not sure if this is the right structure:
awk '
FNR == NR {
    a[$0]++
    next
}
!($0 in a) {
    # print not in file 1
}
($0 in a) {
    for (i = 0; i <= NR; i++) {
        if (a[i] == $0) {
            # print same in both
        }
    }
    delete a[$0]  # delete entries which have been processed
}
END {
    for (rest in a) {
        # print not in file 2
    }
}' $PWD/file1 $PWD/file2
Any suggestions?

If the order is not relevant then you can do:
awk '
NR==FNR { a[$0]++; next }
{
    print $0, ($0 in a ? "in both" : "in file2")
    delete a[$0]
}
END {
    for(x in a) print x, "in file1"
}' file1 file2
1 in both
2 in both
34:rt in both
7 in file2
4 in file1
Or using comm as suggested by choroba in comments:
comm --output-delimiter="|" file1 file2 |
awk -F'|' '{print (NF==3 ? $NF " in both" : NF==2 ? $NF " in file2" : $NF " in file1")}'
1 in both
2 in both
34:rt in both
4 in file1
7 in file2
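Note that comm requires both inputs to be sorted (the sample files here happen to be); for unsorted files, a sketch using bash process substitution:
comm --output-delimiter="|" <(sort file1) <(sort file2) |
awk -F'|' '{print (NF==3 ? $NF " in both" : NF==2 ? $NF " in file2" : $NF " in file1")}'
The output order then follows the sort order rather than the original file order.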

Related

AWK: Concatenate and process three or more files with a method similar to FNR==NR approach

Since I am learning awk, I found out that the FNR==NR approach is a very common method to process two files: if FNR==NR, we are in the first file; once FNR resets to 1 while reading lines from the concatenated input, !(FNR==NR) holds and we are obviously in the second file.
When it comes to three or more files I can't see a way to tell the second and third file apart, as both satisfy the same !(FNR==NR) condition. This made me try to figure out whether there can be something like FNR2 and FNR3.
So I implemented a method to process three files in one awk, assuming there were FNR1, FNR2, FNR3 for each file. For every file I made a for loop that runs separately; the condition is the same for every loop, NR==FNR#, and I actually get what I expected.
So I wonder if there are more sober, concise methods that deliver similar results to the awk code below.
Sample File Contents
$ cat file1
X|A1|Z
X|A2|Z
X|A3|Z
X|A4|Z
$ cat file2
X|Y|A3
X|Y|A4
X|Y|A5
$ cat file3
A1|Y|Z
A4|Y|Z
AWK for loop
$ cat fnrarray.sh
awk -v FS='[|]' '{ for(i=FNR ; i<=NR && i<=FNR && NR==FNR; i++) {x++; print "NR:",NR,"FNR1:",i,"FNR:",FNR,"\tfirst file\t"}
for(i=FNR ; i+x<=NR && i<=FNR && NR==FNR+x; i++) {y++; print "NR:",NR,"FNR2:",i+x,"FNR:",FNR,"\tsecond file\t"}
for(i=FNR ; i+x+y<=NR && i<=FNR && NR==FNR+x+y; i++) {print "NR:",NR,"FNR3:",i+x+y,"FNR:",FNR,"\tthird file\t"}
}' file1 file2 file3
Current and desired output
$ sh fnrarray.sh
NR: 1 FNR1: 1 FNR: 1 first file
NR: 2 FNR1: 2 FNR: 2 first file
NR: 3 FNR1: 3 FNR: 3 first file
NR: 4 FNR1: 4 FNR: 4 first file
NR: 5 FNR2: 5 FNR: 1 second file
NR: 6 FNR2: 6 FNR: 2 second file
NR: 7 FNR2: 7 FNR: 3 second file
NR: 8 FNR3: 8 FNR: 1 third file
NR: 9 FNR3: 9 FNR: 2 third file
You can see that NR aligns with FNR#, and it is easy to read which NR belongs to which file.
Another Method
I found the method FNR==1{++f} f==1 {} here: Handling 3 Files using awk.
But that method replaces arr1[1] every time a new line is read:
Failed attempt 1
$ awk -v FS='[|]' 'FNR==1{++f} f==1 {split($2,arr1); print arr1[1]}' file1 file2 file3
A1
A2
A3
A4
Success with for loop (arr1[1] is not changed)
$ awk -v FS='[|]' '{for(i=FNR ; i<=NR && i<=FNR && NR==FNR; i++) {arr1[++k]=$2; print arr1[1]}}' file1 file2 file3
A1
A1
A1
A1
When it comes to three or more files I can't see a way to tell the
second and third file apart, as both satisfy the same !(FNR==NR)
condition. This made me try to figure out whether there can be
something like FNR2 and FNR3.
Here is an example:
$ cat f1
X|A1|Z
X|A2|Z
X|A3|Z
X|A4|Z
$ cat f2
X|Y|A3
X|Y|A4
X|Y|A5
$ cat f3
A1|Y|Z
A4|Y|Z
Sample output:
$ awk -F '|' 'FNR==1{file++}{array[file, FNR]=$0; max=max>FNR?max:FNR}END{for(f=1; f<=file; f++){ for(row=1; row<=max; row++){ key=f SUBSEP row; if(key in array)print "file: "f,"row :"row,"record: "array[key] } }}' f1 f2 f3
file: 1 row :1 record: X|A1|Z
file: 1 row :2 record: X|A2|Z
file: 1 row :3 record: X|A3|Z
file: 1 row :4 record: X|A4|Z
file: 2 row :1 record: X|Y|A3
file: 2 row :2 record: X|Y|A4
file: 2 row :3 record: X|Y|A5
file: 3 row :1 record: A1|Y|Z
file: 3 row :2 record: A4|Y|Z
Explanation:
awk -F '|' 'FNR==1{ # FNR will reset for every file
file++ # so whenever FNR==1 increment variable file
}
{
# array name : array
# array key being : file, FNR
# array value : $0, the current record/row
array[file, FNR] = $0;
# track the maximum row count across all files
max = max > FNR ? max : FNR
}
END{ # end block when all files are read
# start iterating over files,
# as variable file now holds the total number of files read
for(f=1; f<=file; f++)
{
# now iterate over the records of this file
# variable max holds max row count
for(row=1; row<=max; row++)
{
# build the key as
# key = file-number SUBSEP row-number
key=f SUBSEP row;
# if key exists in array
# print array value
if(key in array)
print "file: "f,"row :"row,"record: "array[key]
}
}
}' f1 f2 f3
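Since every record is now addressable by a (file, row) key, a single cell can also be fetched directly instead of looping over everything; a minimal sketch against the same sample files:
$ awk -F '|' 'FNR==1{file++} {array[file, FNR]=$0}
       END{print "file 2, row 3:", array[2, 3]}' f1 f2 f3
file 2, row 3: X|Y|A5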
Another option would be to use true multi-dimensional arrays as below; gawk specific, of course.
This assumes filenames are unique; otherwise use FNR==1{ file++ } and use file in place of FILENAME.
$ awk --version
GNU Awk 4.2.1, API: 2.0 (GNU MPFR 3.1.6-p2, GNU MP 6.1.2)
Copyright (C) 1989, 1991-2018 Free Software Foundation.
$ awk -F '|' '{
true_multi_array[FILENAME][FNR] = $0
}
END{
for(file in true_multi_array)
for(row in true_multi_array[file])
print "file:",file, "row :" row, "record:" true_multi_array[file][row]
}' f1 f2 f3
file: f1 row :1 record:X|A1|Z
file: f1 row :2 record:X|A2|Z
file: f1 row :3 record:X|A3|Z
file: f1 row :4 record:X|A4|Z
file: f2 row :1 record:X|Y|A3
file: f2 row :2 record:X|Y|A4
file: f2 row :3 record:X|Y|A5
file: f3 row :1 record:A1|Y|Z
file: f3 row :2 record:A4|Y|Z
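One caveat: for (... in ...) traversal order is unspecified in awk, so on other inputs the files and rows may print out of order. With GNU awk you can pin the order down via PROCINFO["sorted_in"]; a sketch:
awk -F '|' '
BEGIN { PROCINFO["sorted_in"] = "@ind_num_asc" }  # traverse indices in ascending order
{
    true_multi_array[FILENAME][FNR] = $0
}
END{
    for(file in true_multi_array)
        for(row in true_multi_array[file])
            print "file:", file, "row :" row, "record:" true_multi_array[file][row]
}' f1 f2 f3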
To identify files in order using GNU awk no matter what:
awk '
ARGIND == 1 { do 1st file stuff }
ARGIND == 2 { do 2nd file stuff }
ARGIND == 3 { do 3rd file stuff }
' file1 file2 file3
e.g. to get the text under "output" in your question from the 3 sample input files you provided:
awk '
ARGIND == 1 { pos = "first" }
ARGIND == 2 { pos = "second" }
ARGIND == 3 { pos = "third" }
{ print "NR:", NR, "FNR" ARGIND ":", NR, "FNR:", FNR, pos " file" }
' file1 file2 file3
NR: 1 FNR1: 1 FNR: 1 first file
NR: 2 FNR1: 2 FNR: 2 first file
NR: 3 FNR1: 3 FNR: 3 first file
NR: 4 FNR1: 4 FNR: 4 first file
NR: 5 FNR2: 5 FNR: 1 second file
NR: 6 FNR2: 6 FNR: 2 second file
NR: 7 FNR2: 7 FNR: 3 second file
NR: 8 FNR3: 8 FNR: 1 third file
NR: 9 FNR3: 9 FNR: 2 third file
or using any awk if all file names are unique whether any of them are empty or not:
awk '
FILENAME == ARGV[1] { do 1st file stuff }
FILENAME == ARGV[2] { do 2nd file stuff }
FILENAME == ARGV[3] { do 3rd file stuff }
' file1 file2 file3
or if the files aren't empty then whether unique or not (note file1 twice in the arg list):
awk '
FNR == 1 { argind++ }
argind == 1 { do 1st file stuff }
argind == 2 { do 2nd file stuff }
argind == 3 { do 3rd file stuff }
' file1 file2 file1
If file names can appear multiple times in the arg list and some of the files could be empty, then it becomes trickier with a non-GNU awk, which is why GNU awk has ARGIND. E.g. something like (untested):
awk '
BEGIN {
for (i=1; i<ARGC; i++) {
fname = ARGV[i]
if ( (getline line < fname) > 0 ) {
# file is not empty so save its position in the args
# list in an array indexed by its name and the number
# of times that name has been seen so far
arginds[fname,++tmpcnt[fname]] = i
}
close(fname)
}
}
FNR == 1 { argind = arginds[FILENAME,++cnt[FILENAME]] }
argind == 1 { do 1st file stuff }
argind == 2 { do 2nd file stuff }
argind == 3 { do 3rd file stuff }
' file1 file2 file1
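To make the empty-file failure mode concrete, a sketch using the sample files from this question plus an empty scratch file in the middle of the arg list:
$ touch empty
$ awk 'FNR==1{argind++} {print FILENAME, argind}' file1 empty file3
file1 1
file1 1
file1 1
file1 1
file3 2
file3 2
No rule ever fires for the zero-line file, so file3 ends up with argind 2 instead of 3.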

merging 2 csv files using awk

I have 3 CSV files:
Base File (values initialised with 0):
steve tignor ash michael jose sam joshua
0 0 0 0 0 0 0
File 1:
tignor michael jose
888 9 -2
File 2:
ash joshua
77 66
Output I need:
steve tignor ash michael jose sam joshua
File1 0 888 0 9 -2 0 0
File2 0 0 77 0 0 0 66
I tried sorting the files first with awk and then merging with paste, but as I have 1000+ columns and 30 files it just did not work.
Code:
awk -F"," 'NR==1{
split($0,a,FS);asort(a);
for(i=1;i<=NF;i++)b[$i]=i
} {
for(i=1;i<=NF;i++)printf("%s,",$(b[a[i]]));
print x
}' File1 > 1.csv
awk -F"," 'NR==1{
split($0,a,FS);asort(a);
for(i=1;i<=NF;i++)b[$i]=i
} {
for(i=1;i<=NF;i++)printf("%s,",$(b[a[i]]));
print x
}' File2 > 2.csv
paste -d"\n" 1.csv 2.csv > merge.csv
Need some assistance here. Thanks in advance.
I assumed that you omitted the commas in the files; if they are really space-separated you can just change the separator used in the split calls.
awk '
ARGIND==1 && FNR==1{
    nbase = split($0, base, ",")
    printf("file,%s\n", $0)
}
ARGIND > 1 && FNR==1{
    nnames = split($0, names, ",")
    printf("%s", ARGV[ARGIND])
}
ARGIND > 1 && FNR==2{
    split($0, values, ",")
    for(i=1; i<=nnames; i++)
        line[names[i]] = values[i]
    # loop over base columns numerically: for (i in base) iterates in
    # unspecified order, which could scramble the output columns
    for(i=1; i<=nbase; i++){
        if(base[i] in line)
            printf(",%s", line[base[i]])
        else
            printf(",0")
    }
    delete line
    print ""
}
' base.csv file1.csv file2.csv
Example:
file1.csv:
tignor,michael,jose
888,9,-2
file2.csv:
ash,joshua
77,66
and base.csv:
steve,tignor,ash,michael,jose,sam,joshua
0,0,0,0,0,0,0
the output is:
file,steve,tignor,ash,michael,jose,sam,joshua
file1.csv,0,888,0,9,-2,0,0
file2.csv,0,0,77,0,0,0,66
Basically, the script runs in two steps:
First we read the names from base.csv and store them in an array.
Then, for each file, we store the names appearing in its header and try to print one value for each column of the base CSV. If we don't have a value for a column in a particular file, we just print 0 instead.
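Since the base file comes first and any number of data files may follow, scaling this up to the 30 files mentioned in the question is just a matter of globbing, e.g. with the script saved under the (hypothetical) name merge.awk:
awk -f merge.awk base.csv file*.csv > merged.csv   # merge.awk = either version of the script above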
P.S. Here is a POSIX-awk-compatible version of the script:
awk --posix '
NR==FNR && FNR==1{
    nbase = split($0, base, ",")
    printf("file,%s\n", $0)
}
NR>FNR && FNR==1{
    nnames = split($0, names, ",")
    printf("%s", FILENAME)
}
NR>FNR && FNR==2{
    split($0, values, ",")
    for(i=1; i<=nnames; i++)
        line[names[i]] = values[i]
    # loop over base columns numerically to preserve their order
    for(i=1; i<=nbase; i++){
        if(base[i] in line)
            printf(",%s", line[base[i]])
        else
            printf(",0")
    }
    for(k in line) delete line[k]  # portable way to clear the array
    print ""
}
' base.csv file1.csv file2.csv

3 file string matching pattern awk in tab separated file

I've got 3 files:
FILE 1
NODE_2020 Cancer
NODE_2029 Thug
NODE_0902 Snap
FILE 2
NODE_2020 Mikro
NODE_2029 Bold
NODE_0902 Mini
FILE 3
NODE_2020 Gold
NODE_2080 Damn
NODE_0900 Gueo
I need to look up the first column of file 1 in the other two files: if the value matches, then column 2 of file 2 and column 2 of file 3 are printed next to it in a single output file; if not, the string "NO MATCH" is printed instead. The output file will look like this:
Query File1 File2 File3
NODE_2020 Cancer Mikro Gold
NODE_2029 Thug Bold NO MATCH
NODE_0902 Snap Mini NO MATCH
Awk/sed/perl solutions are really appreciated. What I'm stuck on is how to use the first column of file 1 as a key to look up, with just an if statement, in the other two files.
Here's what I've tried, to use column from file 1 and match into file 2:
awk 'NR==FNR{a[NR]=$1;next} { print a[FNR],"\t", $2 }' file1 file2
It actually works for two files. I have no idea how to extend it to three files, or how to add the "NO MATCH" string.
With GNU awk for true multi-dimensional arrays and ARGIND:
$ cat tst.awk
BEGIN { OFS="\t" }
(NR==FNR) || ($1 in vals) {
    vals[$1][ARGIND] = $2
}
END {
    printf "%s%s", "Query", OFS
    for (fileNr=1; fileNr<=ARGIND; fileNr++) {
        printf "%s%s", ARGV[fileNr], (fileNr<ARGIND ? OFS : ORS)
    }
    for (key in vals) {
        printf "%s%s", key, OFS
        for (fileNr=1; fileNr<=ARGIND; fileNr++) {
            val = (fileNr in vals[key] ? vals[key][fileNr] : "NO MATCH")
            printf "%s%s", val, (fileNr<ARGIND ? OFS : ORS)
        }
    }
}
$ awk -f tst.awk file1 file2 file3
Query file1 file2 file3
NODE_2020 Cancer Mikro Gold
NODE_0902 Snap Mini NO MATCH
NODE_2029 Thug Bold NO MATCH
You may use this awk:
awk -v OFS='\t' '
function bval(p,q) {
    return ((p,q) in b ? b[p,q] : "NO MATCH")
}
FNR == NR {
    a[$1] = $2
    next
}
{
    b[FILENAME,$1] = $2
}
END {
    print "Query", ARGV[1], ARGV[2], ARGV[3]
    for (i in a)
        print i, a[i], bval(ARGV[2],i), bval(ARGV[3],i)
}' file{1,2,3}
Query file1 file2 file3
NODE_2020 Cancer Mikro Gold
NODE_0902 Snap Mini NO MATCH
NODE_2029 Thug Bold NO MATCH
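Both answers traverse the stored keys with for (... in ...), so the rows come out in unspecified order (note NODE_0902 printing before NODE_2029). If the original row order of file 1 matters, a sketch that additionally records insertion order:
awk -v OFS='\t' '
FNR == NR { keys[++n] = $1; a[$1] = $2; next }   # remember file1 keys in order
{ b[FILENAME, $1] = $2 }
END {
    print "Query", ARGV[1], ARGV[2], ARGV[3]
    for (j = 1; j <= n; j++) {
        k = keys[j]
        v2 = ((ARGV[2], k) in b) ? b[ARGV[2], k] : "NO MATCH"
        v3 = ((ARGV[3], k) in b) ? b[ARGV[3], k] : "NO MATCH"
        print k, a[k], v2, v3
    }
}' file1 file2 file3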

match pattern and print corresponding columns from a file using awk or grep

I have an input file with repetitive headers (below):
A1BG A1BG A1CF A1CF A2ML1
aa bb cc dd ee
1 2 3 4 5
I want to print all columns with the same header into one file. E.g. for the above file there should be three output files: one for A1BG with 2 columns, a second for A1CF with 2 columns, and a third for A2ML1 with 1 column. Is there any way to do it with awk or grep one-liners?
I tried the following one-liner:
awk -v f="A1BG" '!o{for(x=1;x<=NF;x++)if($x==f){o=1;next}}o{print $x}' trial.txt
but this searches for the pattern in only one column (1 in this case). I want to look through all the header names and print every column whose header is A1BG.
This awk solution takes the same approach as Lars's but uses gawk 4.0 arrays of arrays:
awk '
# map each header to the list of column numbers that carry it
NR==1 {
    for(i=1; i<=NF; ++i)
        cols[$i][++cnt[$i]] = i
}
{
    # write tab-delimited columns for each header to its cols.header file
    for(h in cols) {
        of = "cols." h
        for(i=1; i<=cnt[h]; ++i) {
            if(i > 1) printf("\t") > of
            printf("%s", $cols[h][i]) > of
        }
        printf("\n") > of
    }
}
'
This second awk solution should be pretty fast and uses no gawk-specific features; output files are tab-delimited and named cols.A1BG, cols.A1CF, etc.:
awk '
# cols maps each column number to its header; tab tracks per-header tab state
NR==1 {
    for(i=1; i<=NF; ++i) {
        cols[i] = $i
        tab[$i] = 0
    }
}
{
    # reset tab state for every header
    for(h in tab) tab[h] = 0
    # write each tab-delimited column to its cols.header file
    for(i=1; i<=NF; ++i) {
        hdr = cols[i]
        of = "cols." hdr
        if(tab[hdr])
            printf("\t") > of
        else
            tab[hdr] = 1
        printf("%s", $i) > of
    }
    # newline for every header file
    for(h in tab) {
        of = "cols." h
        printf("\n") > of
    }
}
'
This is the output from both of my awk solutions:
$ ./scr.sh <in.txt; head cols.*
==> cols.A1BG <==
A1BG A1BG
aa bb
1 2
==> cols.A1CF <==
A1CF A1CF
cc dd
3 4
==> cols.A2ML1 <==
A2ML1
ee
5
I cannot help you with a 1-liner but here is a 10-liner for GNU awk:
script.awk
NR == 1 { PROCINFO["sorted_in"] = "@ind_num_asc"
          for( i=1; i<=NF; i++ ) { f2c[$i] = (i==1)? i : f2c[$i] " " i } }
{ for( n in f2c ) {
      split( f2c[n], fls, " " )
      tmp = ""
      for( f in fls ) tmp = (f == 1) ? $fls[f] : tmp "\t" $fls[f]
      print tmp > n
  }
}
Use it like this: awk -f script.awk your_file
The first action determines the output filenames from the columns of the first record (NR == 1).
The second action, for each record and for each output file, collects that file's columns (as defined in the first record) into tmp and writes tmp to the output file.
The use of PROCINFO["sorted_in"] requires GNU awk; see Ed Morton's comments for alternatives.
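For instance, a sketch of a portable replacement for the inner loop: split() returns the number of elements, so a plain numeric for loop visits the columns in ascending order without needing sorted_in:
{ for( n in f2c ) {
      m = split( f2c[n], fls, " " )
      tmp = $fls[1]
      for( f = 2; f <= m; f++ ) tmp = tmp "\t" $fls[f]
      print tmp > n
  }
}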
Example run and output:
> awk -f mpapccfaf.awk mpapccfaf.csv
> cat A1BG
A1BG A1BG
aa bb
1 2
Here y'go, a one-liner as requested:
awk 'NR==1{for(i=1;i<=NF;i++)a[$i][i]}{PROCINFO["sorted_in"]="@ind_num_asc";for(n in a){c=0;for(f in a[n])printf"%s%s",(c++?OFS:""),$f>n;print"">n}}' file
The above uses GNU awk 4.* for true multi-dimensional arrays and sorted_in.
For anyone else reading this who prefers clarity over the brevity the OP needs, here it is as a more natural multi-line script:
$ cat tst.awk
NR==1 {
    for (i=1; i<=NF; i++) {
        names2fldNrs[$i][i]
    }
}
{
    PROCINFO["sorted_in"] = "@ind_num_asc"
    for (name in names2fldNrs) {
        c = 0
        for (fldNr in names2fldNrs[name]) {
            printf "%s%s", (c++ ? OFS : ""), $fldNr > name
        }
        print "" > name
    }
}
$ awk -f tst.awk file
$ cat A1BG
A1BG A1BG
aa bb
1 2
$ cat A1CF
A1CF A1CF
cc dd
3 4
$ cat A2ML1
A2ML1
ee
Since you wrote in a comment on my other answer that you have 20000 columns, let's consider a two-step approach that makes it easier to find out which step breaks.
step1.awk
NR == 1 { PROCINFO["sorted_in"] = "@ind_num_asc"
for( i=1; i<=NF; i++ ) { f2c[$i] = (f2c[$i]=="")? "$" i : (f2c[$i] ", $" i) } }
NR== 2 { for( fn in f2c) printf("%s:%s\n", fn,f2c[fn])
exit
}
Step1 should give us a list of files together with their columns:
> awk -f step1.awk yourfile
Mpap_1:$1, $2, $3, $5, $13, $19, $25
Mpap_2:$4, $6, $8, $12, $14, $16, $20, $22, $26, $28
Mpap_3:$7, $9, $10, $11, $15, $17, $18, $21, $23, $24, $27, $29, $30
In my test data Mpap_1 is the header of columns 1,2,3,5,13,19,25. Let's hope that this first step works with your large set of columns. (To be frank: I don't know whether awk can deal with $20000.)
Step 2: let's create one of those famous one-liners:
> awk -f step1.awk yourfile | awk -F : 'BEGIN {print "{"}; {print " print " $2, "> \"" $1 "\"" }; END { print "}" }' | awk -v "OFS=\t" -f - yourfile
The first part is our step 1; the second part builds, on the fly, a second awk script with lines like print $1, $2, $3, $5, $13, $19, $25 > "Mpap_1". This second awk script is piped to the third part, which reads the script from stdin (-f -) and applies it to your input file.
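Reconstructed from the step-1 output above, the generated script for the sample data would look like this:
{
 print $1, $2, $3, $5, $13, $19, $25 > "Mpap_1"
 print $4, $6, $8, $12, $14, $16, $20, $22, $26, $28 > "Mpap_2"
 print $7, $9, $10, $11, $15, $17, $18, $21, $23, $24, $27, $29, $30 > "Mpap_3"
}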
If something does not work, inspect the output of each part of step 2: you can execute the parts from the left up to (but not including) each of the | symbols and see what is going on, e.g.:
awk -f step1.awk yourfile
awk -f step1.awk yourfile | awk -F : 'BEGIN {print "{"}; {print " print " $2, "> \"" $1 "\"" }; END { print "}" }'
The following worked for me. Code for step1.awk:
NR == 1 { PROCINFO["sorted_in"] = "@ind_num_asc"
for( i=1; i<=NF; i++ ) { f2c[$i] = (f2c[$i]=="")? "$" i : (f2c[$i] " \"\t\" $" i) } }
NR== 2 { for( fn in f2c) printf("%s:%s\n", fn,f2c[fn])
exit
}
Then run a one-liner which uses the above awk script:
awk -f step1.awk file.txt | awk -F : 'BEGIN {print "{"}; {print " print " $2, "> \"" $1".txt" "\"" }; END { print "}" }'| awk -f - file.txt
This outputs tab-delimited .txt files, each gathering all columns that share a header (a separate file for each distinct header).
Thanks Lars Fischer and others.
Cheers

Min and max coordinates for same values in different column

I have a question: I am trying to think of a script for my data and I am totally lost.
INPUT:
1 BR.100.200
2 BR.100.200
3 BR.100.200
4 BR.100.200
1 BAL.11.235
2 BAL.11.235
3 BAL.11.235
1 JOJ.21.354
2 JOJ.21.354
OUTPUT :
BR.100.200 1 4
BAL.11.235 1 3
JOJ.21.354 1 2
What I want: where $2 is the same across rows, write the minimal and maximal values of $1 for that value of $2. I'd prefer awk, but bash or sed are fine too.
Thank you
Filip
Could probably be made better, but this works:
awk '!x[$2]{x[$2]=$1}y[$2]<$1{y[$2]=$1}x[$2]>$1{x[$2]=$1}END{for(i in y)print i,x[i],y[i]}' file
More readable:
awk '!min[$2]{min[$2]=$1} max[$2]<$1{max[$2]=$1} min[$2]>$1{min[$2]=$1} END{for(i in max)print i, min[i], max[i]}' file
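One pitfall in both one-liners: the !min[$2] test is also true when a stored minimum is legitimately 0 (or empty), so such a minimum can be overwritten. A sketch that keys on existence instead:
awk '!($2 in min) { min[$2] = $1; max[$2] = $1; next }
     $1 < min[$2] { min[$2] = $1 }
     $1 > max[$2] { max[$2] = $1 }
     END { for (k in max) print k, min[k], max[k] }' file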
#!/usr/bin/awk -f
NF == 0 { next }
$2 in min {
if ($1 < min[$2]) {
min[$2] = $1
} else if ($1 > max[$2]) {
max[$2] = $1
}
next
}
{
min[$2] = max[$2] = $1
keys[i++] = $2
}
END {
for (i = 0; i in keys; ++i) {
key = keys[i]
if (i) {
print ""
}
printf "%s\t%s\t%s\n", key, min[key], max[key]
}
}
Run with:
awk -f script.awk your_file.txt
Output:
BR.100.200 1 4
BAL.11.235 1 3
JOJ.21.354 1 2
awk '{
    if (NR == 1) {
        temp1 = $2; min = $1; max = $1
    } else if ((NR % 2) != 0) {
        temp2 = $2
        if (temp1 == temp2) {
            max = $1
        } else {
            print (temp1, min, max)
            temp1 = $2; min = $1; max = $1
        }
    }
}
END {
    if ((NR % 2) != 0) {
        temp2 = $2
        if (temp1 == temp2) {
            max = $1
        } else {
            print (temp1, min, max)
        }
        print (temp2, min, max)
    }
}' inputfile
