awk with duplicate values - bash

File:
22 Hello
22 Hi
1 What
34 Where
21 is
44 How
44 are
44 you
Desired Output:
22 HelloHi
1 What
34 Where
21 is
44 Howareyou
If there are duplicate values in the first field ($1), the second field should have the text appended.
How can I achieve this using awk?
Thanks

$ awk '
!seen[$1]++ { keys[++numKeys] = $1 }
{ str[$1] = str[$1] $2 }
END {
    for (keyNr=1; keyNr<=numKeys; keyNr++) {
        key = keys[keyNr]
        print key, str[key]
    }
}
' file
22 HelloHi
1 What
34 Where
21 is
44 Howareyou

Using awk:
awk '!($1 in a){a[$1]=$2;next} $1 in a{a[$1]=a[$1] $2} END{for (i in a) print i, a[i]}' file
22 HelloHi
44 Howareyou
34 Where
21 is
1 What
EDIT: To preserve the order:
awk '!($1 in a){b[++n]=$1; a[$1]=$2;next} $1 in a{a[$1] = a[$1] $2}
END{for (i=1; i<=n; i++) print b[i], a[b[i]]}' file
22 HelloHi
1 What
34 Where
21 is
44 Howareyou

To maintain the order, you need to keep track of it:
awk '
! seen[$1]++ {order[++n] = $1}
{value[$1] = value[$1] $2}
END {for (i=1; i<=n; i++) print order[i], value[order[i]]}
' <<END
22 Hello
22 Hi
1 What
34 Where
21 is
44 How
44 are
44 you
END
22 HelloHi
1 What
34 Where
21 is
44 Howareyou
If you know the values in the 1st column are contiguous, as in your sample text, then:
awk '
prev != $1 {printf "%s%s ", sep, $1; sep=RS}
{printf "%s", $2; prev = $1}
END {print ""}
'
A couple of other approaches:
perl -lane '
push @keys, $F[0] unless grep {$_ eq $F[0]} @keys;
$val{$F[0]} .= $F[1]
} END {
print "$_ $val{$_}" for #keys
' file
and, reaching way into the niche zone:
#!/usr/bin/env tclsh
while {[gets stdin line] != -1} {dict append val {*}$line}
dict for {k v} $val {puts "$k $v"}
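To run it, assuming the snippet above is saved as dedup.tcl (the filename is just an example), feed the file on stdin:
tclsh dedup.tcl < file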

Here is an alternate solution in Python, as requested by @shellter:
from collections import defaultdict

with open("file") as infile:
    d = defaultdict(str)
    # Build dictionary of values
    for line in infile:
        line = line.strip()
        k, _, v = line.partition(" ")
        d[k] += v
    # Print everything
    for k, v in d.items():
        print(k, v)
Note that the ordering is not preserved in this solution. Here is an alternate solution that provides exactly the desired output:
from collections import defaultdict

with open("file") as infile:
    d = defaultdict(str)
    orig_order = []
    # Build dictionary of values
    for line in infile:
        line = line.strip()
        k, _, v = line.partition(" ")
        d[k] += v
        # Add to original order if not seen yet
        if k not in orig_order:
            orig_order.append(k)
    # Print everything
    for k in orig_order:
        print(k, d[k])
Note that these are quickly-crafted solutions; I am sure it would not take much effort to make them shorter or more flexible.

If the order is not important, this will work:
awk '{a[$1]=a[$1] $2} END {for (i in a) print i, a[i]}' file
.. and if the order is important:
awk '{if (!a[$1]) b[++i]=$1; a[$1]=a[$1] $2} END {for (j=1;j<=i;j++) print b[j], a[b[j]]}' file

Related

Merging multiple files with two common columns, and replacing blanks with 0

I would greatly appreciate it if anyone could help me merge multiple files (up to 8) with two common columns ($1 and $2). I want to get all values of $3 and replace any blanks with 0. Here are samples from 4 files:
File1:
chr1 111001 234
chr2 22099 108
File2:
chr1 111001 42
chr1 430229 267
File3:
chr1 111001 92
chr5 663800 311
File4:
chr1 111001 129
chr2 22099 442
Desired output
chr1 111001 234 42 92 129
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr5 663800 0 0 311 0
I tried
awk '{ a[$1 OFS $2 FS] = a[$1 OFS $2 FS] ( a[$1 OFS $2 FS] == "" ? "" : OFS) $3 }END{ for (i in a){print i,"0",a[i]} }' OFS="\t" file1.txt file2.txt file3.txt file4.txt | sort -k1
output
chr1 111001 0 234 42 92 129
chr1 430229 0 267
chr2 22099 0 108 442
chr5 663800 0 311
Thanks very much in advance.
One more variant; could you please try the following, written and tested with the shown samples.
awk '
{
  if(!a[FILENAME]++){
    file[++count]=FILENAME
  }
  b[$1 OFS $2 OFS FILENAME]=$NF
  c[$1 OFS $2]++
  if(!d[$1 OFS $2]++){
    e[++count1]=$1 OFS $2
  }
}
END{
  for(i=1;i<=length(c);i++){
    printf("%s ",e[i])
    for(j=1;j<=count;j++){
      printf("%s %s",(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:OFS)
    }
  }
}
' file{1..4} | sort -k1
Output will be as follows.
chr1 111001 234 42 92 129
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr5 663800 0 0 311 0
Explanation: a detailed explanation of the above.
awk ' ##Starting awk program from here.
{
  if(!a[FILENAME]++){                  ##If FILENAME has NOT been seen before, do the following.
    file[++count]=FILENAME             ##Store the current file name in file[] with index count.
  }
  b[$1 OFS $2 OFS FILENAME]=$NF        ##Store the last field in b[], indexed by the 1st field, 2nd field and file name.
  c[$1 OFS $2]++                       ##Count how many times each 1st+2nd field combination occurs.
  if(!d[$1 OFS $2]++){                 ##If the 1st and 2nd fields have NOT been seen before, do the following.
    e[++count1]=$1 OFS $2              ##Store the 1st and 2nd fields in e[] with index count1, preserving input order.
  }
}
END{                                   ##Starting END block of this awk program from here.
  for(i=1;i<=length(c);i++){           ##Loop over all unique 1st+2nd field keys.
    printf("%s ",e[i])                 ##Print the key stored in e[i].
    for(j=1;j<=count;j++){             ##Loop over the files.
      printf("%s %s",(b[e[i] OFS file[j]]!=""?b[e[i] OFS file[j]]:0),j==count?ORS:OFS)  ##Print the stored value for this key and file, or 0 if missing; end the line after the last file.
    }
  }
}
' file{1..4} | sort -k1                ##Pass Input_files 1 to 4 and sort the output on the 1st field.
EDIT: As per the great regex guru @anubhava sir's comments, adding a solution with ARGC and ARGV in GNU awk.
awk '
{
  b[$1 OFS $2 OFS FILENAME]=$NF
  c[$1 OFS $2]++
  if(!d[$1 OFS $2]++){
    e[++count1]=$1 OFS $2
  }
}
END{
  count=(ARGC-1)
  for(i=1;i<=length(c);i++){
    printf("%s ",e[i])
    for(j=1;j<=(ARGC-1);j++){
      printf("%s %s",(b[e[i] OFS ARGV[j]]!=""?b[e[i] OFS ARGV[j]]:0),j==count?ORS:OFS)
    }
  }
}
' file{1..4} | sort -k1
You may use this gnu-awk solution:
awk 'BEGIN {
  for (k=1; k<ARGC; ++k)
    s = s " " 0
}
{
  key = $1 OFS $2
  if (!(key in map))
    map[key] = s
  map[key] = gensub("^( ([0-9]+ ){" ARGIND-1 "})[0-9]+", "\\1" $3, "1", map[key])
}
END {
  PROCINFO["sorted_in"] = "@ind_str_asc"
  for (k in map)
    print k map[k]
}' file{1..4} | column -t
chr1 111001 234 42 92 129
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr5 663800 0 0 311 0
Explanation:
We build a string of zeroes, one for each file passed in the arguments
Using gensub we build a regex using ARGIND (the current argument index); see the small sketch after this list
This regex replaces the 0 at the current (ARGINDth) position with $3
The END block just prints out the associative array content stored in map
column -t is used for tabular display of the data
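As a small sketch of that positional replacement in isolation (assuming gawk; the values here are made up for illustration):
gawk 'BEGIN {
  s = " 0 0 0 0"; n = 3   # pretend this is the 3rd file, i.e. ARGIND == 3
  print gensub("^( ([0-9]+ ){" n-1 "})[0-9]+", "\\1" 99, "1", s)   # prints " 0 0 99 0"
}'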
Here is an equivalent command to make it work in POSIX awk (non-gnu):
awk 'BEGIN {
  for (k=1; k<ARGC; ++k)
    s = s " " 0
}
FNR == 1 {
  ++ARGIND
}
{
  key = $1 OFS $2
  if (!(key in map))
    map[key] = s
  split(map[key], a)
  a[ARGIND] = $3
  v = ""
  for (k=1; k<ARGC; ++k)
    v = v " " a[k]
  map[key] = v
}
END {
  for (k in map)
    print k map[k]
}' file{1..4}
These files look like they are derived from BED or VCF files. If so, do not reinvent the wheel. Use any of the specialized bioinformatics tools to manipulate these files, for example: bedtools, bcftools, Picard MergeVcfs, etc.
Find more by searching for merge bed files or merge vcf files. Most of these bioinformatics tools/packages can be installed using conda from the bioconda channel.
After the BED/VCF files are merged/joined/intersected/etc., use common *NIX utilities and scripting languages to extract and manipulate the files when they are not in any of the common bioinformatics formats.
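For instance, bedtools can typically be installed like this (assuming conda is already set up; a sketch, adjust to your environment):
conda install -c bioconda bedtools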

Extract desired column with values

Please help me with this small script I am making. I am trying to extract some columns with their values from a big tab-separated file (mainFileWithValues.txt), which has this format:
A B C ......... (total 700 columns)
80 2.08 23
14 1.88 30
12 1.81 40
Column names are in columnnam.nam:
cat columnnam.nam
A
B
.
.
.
till 20 names
I am first taking the column number from the big file using:
sed -n "1 s/${i}.*//p" mainFileWithValues.txt | sed 's/[^\t*]//g' |wc -c
Then I extract the values using cut.
I have made a for loop:
#!/bin/bash
for i in `cat columnnam.nam`
do
cut -f`sed -n "1 s/${i}.*//p" mainFileWithValues.txt | sed 's/[^\t*]//g' |wc -c` mainFileWithValues.txt > test.txt
done
cat test.txt
A
80
14
12
B
2.08
1.88
1.81
My problem is that I want the output test.txt to be in columns, like the main file,
i.e.
A B
80 2.08
How can I fix this in this script?
Here is a one-liner:
awk 'FNR==NR{h[NR]=$1;next}{for(i=1; i in h; i++){if(FNR==1){for(j=1; j<=NF; j++){if(tolower(h[i])==tolower($j)){d[i]=j; break }}}printf("%s%s",i>1 ? OFS:"", i in d ?$(d[i]):"")}print ""}' columns.nam mainfile
Explanation:
[ Note: the header match is case-insensitive; remove tolower() if you want a strict match ]
awk '
FNR==NR{                       # Here we read the columns.nam file
  h[NR]=$1;                    # h -> array, NR -> array key, $1 -> array value
  next                         # go to the next line
}
{                              # Here we read the second file
  for(i=1; i in h; i++)        # iterate over array h
  {
    if(FNR==1)                 # if we are reading the 1st row of the second file, parse the header
    {
      for(j=1; j<=NF; j++)     # iterate over the fields of the 1st row
      {
        # if it is the field we are looking for
        if(tolower(h[i])==tolower($j))
        {
          # then
          # d -> array, i -> array key, which is the column order number
          # j -> array value, which is the column number
          d[i]=j;
          break
        }
      }
    }
    # for all records,
    # if the field we searched for was found, print that field;
    # d[i] gives us the column number
    printf("%s%s",i>1 ? OFS:"", i in d ? $(d[i]): "");
  }
  # print newline char
  print ""
}
' columns.nam mainfile
Test Results:
$ cat mainfile
A B C
80 2.08 23
14 1.88 30
12 1.81 40
$ cat columns.nam
A
C
$ awk 'FNR==NR{h[NR]=$1;next}{for(i=1; i in h; i++){if(FNR==1){for(j=1; j<=NF; j++){if(tolower(h[i])==tolower($j)){d[i]=j; break }}}printf("%s%s",i>1 ? OFS:"", i in d ?$(d[i]):"")}print ""}' columns.nam mainfile
A C
80 23
14 30
12 40
You can also put it in a script and run it:
akshay@db-3325:/tmp$ cat col_parser.awk
FNR == NR {
    h[NR] = $1;
    next
}
{
    for (i = 1; i in h; i++) {
        if (FNR == 1) {
            for (j = 1; j <= NF; j++) {
                if (tolower(h[i]) == tolower($j)) {
                    d[i] = j;
                    break
                }
            }
        }
        printf("%s%s", i > 1 ? OFS : "", i in d ? $(d[i]) : "");
    }
    print ""
}
akshay@db-3325:/tmp$ awk -v OFS="\t" -f col_parser.awk columns.nam mainfile
A C
80 23
14 30
12 40
Similar Answer
AWK to display a column based on Column name and remove header and last delimiter
Another awk approach:
awk 'NR == FNR {
hdr[$1]
next
}
FNR == 1 {
for (i=1; i<=NF; i++)
if ($i in hdr)
h[i]
}
{
s=""
for (i in h)
s = s (s == "" ? "" : OFS) $i
print s
}' column.nam mainFileWithValues.txt
A B
80 2.08
14 1.88
12 1.81
To get formatted output, pipe the above command to column -t, as shown below.
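For example (a sketch; the awk body is unchanged from the answer above):
awk 'NR == FNR { hdr[$1]; next }
     FNR == 1 { for (i=1; i<=NF; i++) if ($i in hdr) h[i] }
     { s=""; for (i in h) s = s (s == "" ? "" : OFS) $i; print s }
' column.nam mainFileWithValues.txt | column -t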

Find positions of all occurrences of a pattern in a string when every line has a different pattern defined in another column (UNIX)

I have this tabulated file as shown:
1 MGNVFEKLFKSLFGKKEMRILMVGLDAAGKTTILYKLKLGEIVTTIPTIGFNVETVEYKNISFTVWDVGGQDKIRPLWRHYFQNTQGLIFVVDSNDRERVNEAREELTRMLAEDELRDAVLLVFVNKQDLPNAMNAAEITDKLGLHSLRQRNWYIQATCATSGDGLYEGLDWLSNQLKNQK V
2 MGNVFEKLFKSLFGKKEMRILMVGLDAAGKTTILYKLKLGEIVTTIPTIGFNVETVEYKNISFTVWDVGGQDKIRPLWRHYFQNTQGLIFVVDSNDRERVNEAREELTRMLAEDELRDAVLLVFVNKQDLPNAMNAAEITDKLGLHSLRQRNWYIQATCATSGDGLYEGLDWLSNQLKNQK M
.
.
And so on...
The first column is the number, the second column is the protein sequence, and the third column is a single character: the pattern to find in the corresponding sequence for each case.
Thus, the desired output will be something like this:
1:positions:4 23 43 53 56 65 68 91 92 100 120 123 125
2:positions:1 18 22 110 134
I have tried awk with the index function.
nawk -F'\t' -v p=$3 'index($2,p) {printf "%s:positions:", NR; s=$2; m=0; while((n=index(s, p))>0) {m+=n; printf "%s ", m; s=substr(s, n+1)} print ""}' "file.tsv"
However, it only works when the -v variable is given a literal character or string, not $3. How can I do this in a Unix environment? Thanks in advance.
You can do:
awk -F'\t' '{ len=split($2,arr,""); printf "%s:positions:",$1; for(i=1;i<=len;i++) { if(arr[i] == $3) { printf "%s ",i } }; print "" }' file.tsv
First split the subject $2 entirely into an array, then loop over it, check for occurrences of $3 and print the array index when one is found.
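As a side note, splitting on the empty string is what turns the sequence into one character per element; this is a gawk extension, illustrated by this small sketch (not part of the original answer):
gawk 'BEGIN { n = split("ABCA", arr, ""); for (i = 1; i <= n; i++) print i, arr[i] }'
This prints 1 A, 2 B, 3 C, 4 A, one pair per line.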
Perl to the rescue:
perl -wane '
print "$F[0]:positions:";
$i = 0;
print " ", $i while ($i = 1 + index $F[1], $F[2], $i) > 0;
print "\n";
' -- file
If the space after : is a problem, you can complicate it to
$i = $f = 0;
$f = print " " x $f, $i while ($i = 1 + index $F[1], $F[2], $i) > 0;
gawk solution:
awk -v FPAT="[[:digit:]]+|[[:alpha:]]" '{
r=$1":positions:"; for(i=2;i<NF;i++) { if($i==$NF) r=r" "i-1 } print r
}' file.tsv
FPAT="[[:digit:]]+|[[:alpha:]]" - regex pattern defining field value
for(i=2;i<NF;i++) - iterating though the fields (letters of the 2nd column)
The output:
1:positions: 4 23 43 53 56 65 68 91 92 100 120 123 125
2:positions: 1 18 22 110 134
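For reference, a quick way to see how that FPAT value tokenises a record (gawk only; a small sketch, not part of the original answer):
echo "1 MGNV V" | gawk -v FPAT="[[:digit:]]+|[[:alpha:]]" '{ for (i = 1; i <= NF; i++) printf "%s|", $i; print "" }'
which prints: 1|M|G|N|V|V|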
awk '{
  str=$1":positions:";
  n=0; split($2,a,$3);          # use $3 as the delimiter to split $2 into array a
  for(i=1;i<length(a);i++){     # each boundary between pieces of a is an occurrence of $3
    n+=length(a[i])+1; str=str" "n   # locate the delimiter $3 by adding length(a[i])+1 to the running position
  }
  print str
}' file.tsv
$ awk '{out=$1 ":positions:"; for (i=1;i<=length($2);i++) { c=substr($2,i,1); if (c == $3) out = out " " i}; print out}' file
1:positions: 4 23 43 53 56 65 68 91 92 100 120 123 125
2:positions: 1 18 22 110 134
A simple Perl solution:
use strict;
use warnings;

while( <DATA> ) {
    chomp;
    next if /^\s*$/;           # just in case you have an empty line
    my @data = split "\t";     # the record is tab-separated
    my %result;                # hash to store the result
    my $c = 0;                 # position in the string
    map { $c++; push @{$result{$data[0]}}, $c if $_ eq $data[2] } split '', $data[1];
    print "$data[0]:position:"
        . join(' ', @{$result{$data[0]}})   # assemble the result into the desired form
        . "\n";
}
__DATA__
1 MGNVFEKLFKSLFGKKEMRILMVGLDAAGKTTILYKLKLGEIVTTIPTIGFNVETVEYKNISFTVWDVGGQDKIRPLWRHYFQNTQGLIFVVDSNDRERVNEAREELTRMLAEDELRDAVLLVFVNKQDLPNAMNAAEITDKLGLHSLRQRNWYIQATCATSGDGLYEGLDWLSNQLKNQK V
2 MGNVFEKLFKSLFGKKEMRILMVGLDAAGKTTILYKLKLGEIVTTIPTIGFNVETVEYKNISFTVWDVGGQDKIRPLWRHYFQNTQGLIFVVDSNDRERVNEAREELTRMLAEDELRDAVLLVFVNKQDLPNAMNAAEITDKLGLHSLRQRNWYIQATCATSGDGLYEGLDWLSNQLKNQK M
I would use a small script which goes through every line of your file, takes the last field as search_string and then uses grep to get the positions of the search_string. All you have to do then is shift the result, since you have an offset of 1. The sed command removes newlines from the grep output.
while read p; do
    search_string=`echo $p | awk '{print $NF}'`
    echo $p | grep -aob $search_string | sed ':a;N;$!ba;s/\n/ /g'
done < file.tsv

How to parse through a csv file by awk?

I have a CSV file with, say, 20 headers, and the corresponding values for those headers are in the next row for a particular record.
Example : Source file
Age,Name,Salary
25,Anand,32000
I want my output file to be in this format.
Example : Output file
Age
25
Name
Anand
Salary
32000
Which awk/grep/sed command should be used to do this?
I'd say
awk -F, 'NR == 1 { split($0, headers); next } { for(i = 1; i <= NF; ++i) { print headers[i]; print $i } }' filename
That is
NR == 1 {                    # in the first line
    split($0, headers)       # remember the headers
    next                     # do nothing else
}
{                            # after that:
    for(i = 1; i <= NF; ++i) {   # for all fields:
        print headers[i]     # print the corresponding header
        print $i             # followed by the field
    }
}
Addendum: Obligatory, crazy sed solution (not recommended for production use; written for fun, not profit):
sed 's/$/,/; 1 { h; d; }; G; :a s/\([^,]*\),\([^\n]*\n\)\([^,]*\),\(.*\)/\2\4\n\3\n\1/; ta; s/^\n\n//' filename
That works as follows:
s/$/,/ # Add a comma to all lines for more convenient processing
1 { h; d; } # first line: Just put it in the hold buffer
G # all other lines: Append hold buffer (header fields) to the
# pattern space
:a # jump label for looping
# isolate the first fields from the data and header lines,
# move them to the end of the pattern space
s/\([^,]*\),\([^\n]*\n\)\([^,]*\),\(.*\)/\2\4\n\3\n\1/
ta # do this until we got them all
s/^\n\n// # then remove the two newlines that are left as an artifact of
# the algorithm.
Here is one awk solution:
awk -F, 'NR==1{for (i=1;i<=NF;i++) a[i]=$i;next} {for (i=1;i<=NF;i++) print a[i] RS $i}' file
Age
25
Name
Anand
Salary
32000
The first for loop stores the headers in array a.
The second for loop prints each header from array a with the corresponding data.
Using GNU awk 4.* for 2D arrays:
$ awk -F, '{a[NR][1];split($0,a[NR])} END{for (i=1;i<=NF;i++) for (j=1;j<=NR;j++) print a[j][i]}' file
Age
25
Name
Anand
Salary
32000
In general to transpose rows and columns:
$ cat file
11 12 13
21 22 23
31 32 33
41 42 43
with GNU awk:
$ awk '{a[NR][1];split($0,a[NR])} END{for (i=1;i<=NF;i++) for (j=1;j<=NR;j++) printf "%s%s", a[j][i], (j<NR?OFS:ORS)}' file
11 21 31 41
12 22 32 42
13 23 33 43
or with any awk:
$ awk '{for (i=1;i<=NF;i++) a[NR][i]=$i} END{for (i=1;i<=NF;i++) for (j=1;j<=NR;j++) printf "%s%s", a[j][i], (j<NR?OFS:ORS)}' file
11 21 31 41
12 22 32 42
13 23 33 43

awk compare fields from two different files

Here is the awk script I have tried to compare fields from two different files:
awk 'NR == FNR {if (NF >= 4) a[$1] b[$4]; next} {for (i in a) for (j in b) if (i >= $2 && i <=$3 && j>=$2 && j<=$3 ) {print $1, $2, $3, i, j; next}}' file1 file2
Input files:
File1:
24926 17 206 25189 5.23674 5.71882 4.04165 14.99721 c
50760 17 48 50874 3.49903 4.25043 7.66602 15.41548 c
104318 15 269 104643 2.94218 5.18301 5.97225 14.09744 c
126088 17 70 126224 3.12993 5.32649 6.14936 14.60578 c
174113 16 136 174305 4.32339 2.36452 8.60971 15.29762 c
196474 14 89 196626 2.24367 5.16966 7.33723 14.75056 c
......
......
File2:
GT_004279 1 280
GT_003663 19891 20217
GT_003416 22299 23004
GT_003151 24916 25391
GT_001715 39470 39714
GT_001585 40896 41380
....
....
The output I got is:
GT_004279 1 280 2465483 2639576
GT_003663 19891 20217 2005645 2005798
GT_003416 22299 23004 2291204 2269898
GT_003151 24916 25391 2501183 25189
GT_001715 39470 39714 3964440 3950417
......
......
The desired output should be the 1st and 4th field values from file1 that lie between the 2nd and 3rd field values from file2. For example, taking the above lines as input files, the output must be:
GT_003151 24916 25391 24926 25189
If I guess correctly, the problem is within the if condition. Could someone help rectify this problem?
Thanks
You need to make composite keys and iterate through them. When you create such composite keys, their parts are separated by the SUBSEP variable, so you just split on that and do the check (a small illustration of SUBSEP follows the output below).
awk '
NR==FNR{ flds[$1,$4]; next }
{
  for (key in flds) {
    split(key, fld, SUBSEP)
    if ($2<=fld[1] && $3>=fld[2])
      print $0, fld[1], fld[2]
  }
}' file1 file2
GT_003151 24916 25391 24926 25189
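For reference, a minimal sketch of the SUBSEP mechanics the answer relies on (any POSIX awk; the key values are just examples):
awk 'BEGIN {
  flds["24926", "25189"]        # a composite key is stored as one string, with SUBSEP between the parts
  for (key in flds) {
    split(key, fld, SUBSEP)
    print fld[1], fld[2]        # prints: 24926 25189
  }
}'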

Resources