Manipulate a CSV file with awk (shell)

I want to use awk to read a CSV file. The file contains 5 columns: c1, c2, c3, c4, c5. I want to check that c1, c2, and c3 together are unique, like a database constraint.
Here is a sample CSV file:
c1,c2,c3,c4,c5
1886,5141,11-2011,62242.57,52.71
1886,5140,11-2011,63763.75,52.22
23157666,4747,11-2011,71.07,83.33
1886,5141,11-2011,4645.45,2135.45
In this case, rows 1 and 4 violate the unique constraint, so an error message should be printed.
How can I implement this with awk? Thanks a lot in advance.

awk -F, 'line[$1,$2,$3] {printf "Error: lines %d and %d collide\n", line[$1,$2,$3], NR; next} {line[$1,$2,$3] = NR}'
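For instance, run against the sample file above (saved as, say, data.csv; the name is only for illustration), it reports the clash. Note that NR counts the header line, so the duplicate data rows 1 and 4 are reported as lines 2 and 5:
$ awk -F, 'line[$1,$2,$3] {printf "Error: lines %d and %d collide\n", line[$1,$2,$3], NR; next} {line[$1,$2,$3] = NR}' data.csv
Error: lines 2 and 5 collide
To skip the header entirely, an NR>1 guard can be added to both rules.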

This variant lists all the colliding line numbers for each duplicate set, and outputs the message only once per set.
awk -F, '{count[$1,$2,$3]++; line[$1,$2,$3] = line[$1,$2,$3] ", " NR} END {for (i in count) {if (count[i] > 1) {v=i; gsub(SUBSEP, FS, v); print "Error: lines", substr(line[i], 3), "collide on value:", v}}}'
Broken out on multiple lines:
awk -F, '
{
    count[$1,$2,$3]++;
    line[$1,$2,$3] = line[$1,$2,$3] ", " NR
}
END {
    for (i in count) {
        if (count[i] > 1) {
            v = i;
            gsub(SUBSEP, FS, v);
            print "Error: lines", substr(line[i], 3), "collide on value:", v
        }
    }
}'
This is a variation on Kevin's answer.
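For reference, running the variation on the same sample (again assuming data.csv, with the script saved in a hypothetical dups.awk):
$ awk -f dups.awk data.csv
Error: lines 2, 5 collide on value: 1886,5141,11-2011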

Related

Finding hits based on subgroups (A, B, C) in big data set

I have a huge amount of data to analyze.
From this example file I need to know all hits ("samples") that were found either
only in B,
only in C,
in A and C or
in A and B,
but not the ones that are found
in B and C or
in A, B and C.
[edit: to keep it simpler: there should be no co-occurrence of B and C]
These letters are found in column $8.
The first two columns together can be used as an identifier for each "sample".
Example: you can see that for "463;88" we find A and C in column $8, which makes "463;88" a hit that I need in a separate output file. "348;64" is found in A, B and C and would therefore be discarded/ignored.
File1.csv
463;88;1;193187729;280062;CDC73;IS;A;0.0
463;88;1;193188065;280062;CDC73;IS;A;0.0
463;88;1;193188527;280062;CDC73;IS;A;0.0
463;88;1;193188542;280062;CDC73;IS;C;0.0
348;64;1;155219446;384172;GBAP1;IS;B;0.0
348;64;1;155224629;384172;GBAP1;IS;C;0.0
348;64;1;155224965;384172;GBAP1;IS;A;0.0
71;35;2;27400461;145220;PPM1G;IS;A;0.0
71;35;2;27400930;145220;PPM1G;IS;A;0.0
71;35;2;27401162;145220;PPM1G;IS;A;0.0
71;35;2;27403518;145220;PPM1G;IS;B;0.0
71;35;2;27403545;145220;PPM1G;IS;B;0.0
71;35;2;27404353;145220;PPM1G;IS;B;0.0
71;35;2;27419156;145220;NRBP1;IS;B;0.0
7;14;20;2894103;92099;PTPRA;IS;B;0.0
7;14;20;2906211;92099;PTPRA;IS;C;0.0
7;14;20;2907301;92099;PTPRA;IS;C;0.0
...
Does anyone have a suggestion how to do this, e.g. with bash, awk, grep...?
It does not need to be very efficient or fast, it just needs to be reliable.
Edit:
In several steps, I generated a CSV table with columns 1 and 2 of the lines that contain <3 different entries in column $8. Roughly (the sed script that reshapes the counts into CSV is omitted):
awk -F';' '{print $1, $2, $8}' File1.csv | sort -n | uniq > file.tmp1
awk '{print $1, $2}' file.tmp1 | sort -n | uniq -c | sed '...' > file.tmp2
Finally, I used awk to print only the identifier columns from file.tmp2 where the count was <3 (= only one or two different letters in column $8 of the original file).
File2.csv
6;3;
12;9;
348;40;
463;88;
...
Then, I wanted to use
fgrep --file=File2.csv File1.csv
but this does not seem to work properly, and it still requires manual analysis since it also gives me false hits.
Another alternative
keeps only the keys of the lines to be deleted, but scans the file twice (the file{,} below is bash brace expansion, which simply passes the same file name twice). Also, the file doesn't need to be sorted.
$ awk -F';' '{k = $1 FS $2}
     NR==FNR {if ($8=="B") b[k];
              else if ($8=="C") c[k];
              if (k in b && k in c) d[k];
              next}
     !(k in d)' file{,}
463;88;1;193187729;280062;CDC73;IS;A;0.0
463;88;1;193188065;280062;CDC73;IS;A;0.0
463;88;1;193188527;280062;CDC73;IS;A;0.0
463;88;1;193188542;280062;CDC73;IS;C;0.0
71;35;2;27400461;145220;PPM1G;IS;A;0.0
71;35;2;27400930;145220;PPM1G;IS;A;0.0
71;35;2;27401162;145220;PPM1G;IS;A;0.0
71;35;2;27403518;145220;PPM1G;IS;B;0.0
71;35;2;27403545;145220;PPM1G;IS;B;0.0
71;35;2;27404353;145220;PPM1G;IS;B;0.0
71;35;2;27419156;145220;NRBP1;IS;B;0.0
With gawk's bitwise operations, this can be simplified further:
$ awk -F';' 'BEGIN {c["B"]=1; c["C"]=2}
     {k = $1 FS $2}
     NR==FNR {d[k] = or(d[k], c[$8]); next}
     d[k]!=3' file{,}
or() is idempotent: it sets the "B" or "C" bit for a key every time one of those letters is seen, and re-setting a bit changes nothing. If both have been seen, the value is 3; on the second pass, every line whose key didn't reach 3 is printed.
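A quick sanity check of that bit arithmetic (or() is gawk-specific):
$ gawk 'BEGIN { print or(0,1), or(1,2), or(3,1), or(3,2) }'
1 3 3 3
Once a key has accumulated both bits, further or() calls leave the value at 3, which is exactly what the d[k]!=3 filter relies on.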
You don't show the expected output in your question so it's a guess but is this what you're looking for?
$ cat tst.awk
BEGIN { FS=OFS=";" }
{ curr = $1 FS $2 }
curr != prev { prt(); prev=curr }
{ lines[++numLines]=$0; seen[$8]++ }
END { prt() }

function prt() {
    if ( !(seen["B"] && seen["C"]) ) {
        for ( lineNr=1; lineNr<=numLines; lineNr++ ) {
            print lines[lineNr]
        }
    }
    delete seen
    numLines = 0
}
$ awk -f tst.awk file
463;88;1;193187729;280062;CDC73;IS;A;0.0
463;88;1;193188065;280062;CDC73;IS;A;0.0
463;88;1;193188527;280062;CDC73;IS;A;0.0
463;88;1;193188542;280062;CDC73;IS;C;0.0
71;35;2;27400461;145220;PPM1G;IS;A;0.0
71;35;2;27400930;145220;PPM1G;IS;A;0.0
71;35;2;27401162;145220;PPM1G;IS;A;0.0
71;35;2;27403518;145220;PPM1G;IS;B;0.0
71;35;2;27403545;145220;PPM1G;IS;B;0.0
71;35;2;27404353;145220;PPM1G;IS;B;0.0
71;35;2;27419156;145220;NRBP1;IS;B;0.0
Something like this should work, unless you run out of memory:
BEGIN { FS=";" }
{
    keys[$1,$2] = 1
    # append, so every line is kept even when a key/letter pair repeats
    data[$1,$2,$8] = data[$1,$2,$8] ? data[$1,$2,$8] ORS $0 : $0
}
END {
    for (key in keys) {
        a = data[key,"A"]
        b = data[key,"B"]
        c = data[key,"C"]
        if (!(b && c)) {
            if (a) { print a }
            if (b) { print b }
            if (c) { print c }
        }
    }
}
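A possible invocation, assuming the script above is saved under a hypothetical name like filter.awk; since the END loop uses for (key in keys), the groups can come out in any order:
$ awk -f filter.awk File1.csv
This should print the same eleven lines as the two-pass solution above, though not necessarily in file order.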
Assuming that all lines with the same key are consecutive, this should work:
BEGIN { FS=";" }
$1";"$2 != key {
    if (key != "" && !(data["B"] && data["C"])) {
        print key
    }
    delete data
    key = $1";"$2
}
{
    data[$8] = 1
}
END {
    if (key != "" && !(data["B"] && data["C"])) {
        print key
    }
}
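Run against File1.csv (with the script in a hypothetical keys.awk), this prints just the identifiers of the groups that pass the filter:
$ awk -f keys.awk File1.csv
463;88
71;35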

Split column using awk or sed

I have a file containing the following text.
dog
aa 6469
bb 5946
cc 715
cat
aa 5692
Bird
aa 3056
bb 2893
cc 1399
dd 33
I need the following output:
A-Z,aa,bb,cc,dd
dog,6469,5946,715,0
cat,5692,0,0,0
Bird,3056,2893,1399,33
I tried:
awk '{$1=$1}1' OFS="," RS=
But it is not giving the format I need.
Thanks in advance for your help.
Cris
With Perl
perl -00 -nE'
    ($t, %p) = split /\s+/;    # Top line, then key/value Pairs on the rest
    $h{$t} = {%p};
    $o{$t} = ++$c;             # remember input Order
    $k{$_} = 1 for keys %p;    # accumulate the full set of subkeys
    }{                         # END block starts
    say join ",", "A-Z", sort keys %k;
    for $t (sort { $o{$a} <=> $o{$b} } keys %h) {
        say join ",", $t, map { $h{$t}{$_} // 0 } sort keys %k;
    }
' data.txt
This prints, in the original order:
A-Z,aa,bb,cc,dd
dog,6469,5946,715,0
cat,5692,0,0,0
Bird,3056,2893,1399,33
Here's a sed solution. It works on your input, but requires that you know the column names in advance, that the column names come as a sorted full range starting with the first column name (so nothing like aa, cc or bb, aa or bb, cc), and that every paragraph is followed by one empty line. You would also need to adjust the script if you don't have exactly four numeric columns:
echo 'A-Z, aa, bb, cc, dd';sed -e '/./{s/.* //;H;d};x;s/\n/, /g;s/, //;s/$/, 0, 0, 0, 0/;:a;s/,[^,]*//5;ta' file
If you need to look up the sed commands, you can look at info sed, especially 3.5 Less Frequently-Used Commands.
awk to the rescue!
awk -v OFS=, 'NF==1 {h[++c]=$1}
     NF==2 {v[c,$1]=$2; ks[$1]}
     END   {printf "%s", "A-Z";
            for (k in ks) printf "%s", OFS k;
            print "";
            for (i=1; i<=c; i++) {
                printf "%s", h[i];
                for (k in ks) printf "%s", OFS v[i,k]+0;
                print ""}}' file
The order of the columns will be random, since for-in traversal order is unspecified in awk.
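If you have gawk, the traversal order can be pinned down with PROCINFO["sorted_in"]. A sketch; sorting the subkeys as strings happens to match the aa, bb, cc, dd order wanted here:
awk -v OFS=, 'BEGIN {PROCINFO["sorted_in"]="@ind_str_asc"}   # gawk only
     NF==1 {h[++c]=$1}
     NF==2 {v[c,$1]=$2; ks[$1]}
     END   {printf "%s", "A-Z";
            for (k in ks) printf "%s", OFS k;
            print "";
            for (i=1; i<=c; i++) {
                printf "%s", h[i];
                for (k in ks) printf "%s", OFS v[i,k]+0;
                print ""}}' file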

How to concatenate corresponding columns from multiple files into one single file, using a specified column as ID?

If I have a number of files, such as
1.txt:
1;ab, bc
2;cd, de, ef
3;fgh
2.txt:
4;bc
1;cd, ef
5;ab
2;g
3.txt:
5;ef, hl
7;a, b, c
3;k, jk
1;b
6;x
Assuming that ; is the delimiter and the first column serves as the ID, how do I concatenate the corresponding second columns (separated by, e.g., commas), so that the output becomes
output.txt:
1;ab, bc, cd, ef, b
2;cd, de, ef, g
3;fgh, k, jk
4;bc
5;ab, ef, hl
7;a, b, c
6;x
awk to the rescue!
$ awk -F";" '{a[$1]=a[$1]?a[$1]","$2:$2}
END{for(k in a) print k";"a[k]}' file{1,2,3} | sort
1;ab, bc,cd, ef,b
2;cd, de, ef,g
3;fgh,k, jk
4;bc
5;ab,ef, hl
6;x
7;a, b, c
Because join(1) is designed to join two files and its input has to be sorted, why bother? The awk source:
#!/usr/bin/env awk -f
BEGIN { FS = ";" }
FNR==NR { a[$1] = $2; next }
{
    if ($1 in a) {
        a[$1] = a[$1] ", " $2
    } else {
        a[$1] = $2
    }
}
END {
    for (i in a) {
        printf("%s%s%s\n", i, FS, a[i])
    }
}
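Assuming the script is saved under a hypothetical name like merge.awk, piping through sort restores a stable order:
$ awk -f merge.awk 1.txt 2.txt 3.txt | sort -t';' -n
1;ab, bc, cd, ef, b
2;cd, de, ef, g
3;fgh, k, jk
4;bc
5;ab, ef, hl
6;x
7;a, b, c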

Unix/Bash: Uniq on a cell

I have a tab-separated fileA where the 12th column (counting from 1) contains several comma-separated identifiers. Some of them, however, can occur more than once in the same row:
GO:0042302, GO:0042302, GO:0042302
GO:0004386,GO:0005524,GO:0006281, GO:0004386,GO:0005524,GO:0006281
....
....
(some have a white-space after the comma, some don't).
I would like to only get the unique identifiers and remove the multiples for each row in the 12th column:
GO:0042302
GO:0004386,GO:0005524,GO:0006281
....
....
Here is what I have so far:
for row in `fileA`
do
cut -f12 $row | sed "s/,/\n/" | sort | uniq | paste fileA - | \
awk 'BEGIN {OFS=FS="\t"}{print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $13}'
done > out
The idea was to go over each row at a time, cut out the 12th column, replace all commas with newlines and then sort and take uniq to get rid of duplicates, paste it back and print the columns in the right order, skipping the original identifier column.
However, this does not seem to work. Any ideas?
Just for completeness, and because I personally prefer Perl over Awk for this sort of thing, here's a Perl one-liner solution:
perl -F'\t' -lane '%u=(); @k=split/,\s*/,$F[11]; @u{@k}=@k;
                   $F[11]=join",",sort keys %u; print join"\t",@F'
Explanation:
-F'\t' split each input line into @F at tabs (-a autosplit, -n loop over input lines)
-l automatically remove newlines from input and append on output
-e get code to execute from the next argument instead of standard input
%u = (); # clear out the hash variable %u
@k = split /,\s*/, $F[11]; # Split 12th field (1st is 0) on comma plus optional spaces into array @k
@u{@k} = @k; # Copy the contents of @k into %u as key/value pairs
Because hash keys are unique, that last step means that the keys of %u are now a deduplicated copy of @k.
$F[11] = join ",", sort keys %u; # replace the 12th field with the sorted unique list
print join "\t", @F; # and print out the modified line
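The hash-slice dedup trick is easy to test in isolation:
$ perl -E '@k = ("a","b","a"); @u{@k} = @k; say join ",", sort keys %u'
a,b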
If I understand you correctly, then with awk:
awk -F '\t' 'BEGIN { OFS = FS } { delete b; n = split($12, a, /, */); $12 = ""; for(i = 1; i <= n; ++i) { if(!(a[i] in b)) { b[a[i]]; $12 = $12 a[i] "," } } sub(/,$/, "", $12); print }' filename
This works as follows:
BEGIN { OFS = FS }             # output FS same as input FS
{
    delete b                   # clear dirty table from last pass
    n = split($12, a, /, */)   # split 12th field into tokens,
    $12 = ""                   # then clear it out for reassembly
    for (i = 1; i <= n; ++i) { # wade through those tokens
        if (!(a[i] in b)) {    # those that haven't been seen yet:
            b[a[i]]            # remember that they were seen
            $12 = $12 a[i] "," # append to result
        }
    }
    sub(/,$/, "", $12)         # remove trailing comma from resulting field
    print                      # print the transformed line
}
The whole-array delete b has been POSIX-conforming for only a short while, so if you're working with an old, old awk and it fails for you, see @MarkReed's comment for another way that ancient awks should accept.
Using field 2 instead of field 12:
$ cat tst.awk
BEGIN{ FS=OFS="\t" }
{
    split($2,f,/ *, */)
    $2 = ""
    delete seen
    for (i=1; i in f; i++) {
        if ( !seen[f[i]]++ ) {
            $2 = $2 (i>1?",":"") f[i]
        }
    }
    print
}
$ cat file
a,a,a GO:0042302, GO:0042302, GO:0042302 b,b,b
c,c,c GO:0004386,GO:0005524,GO:0006281, GO:0004386,GO:0005524,GO:0006281 d,d,d
$ awk -f tst.awk file
a,a,a GO:0042302 b,b,b
c,c,c GO:0004386,GO:0005524,GO:0006281 d,d,d
If your awk doesn't support delete seen you can use split("",seen).
Using this awk:
awk -F '\t' -v OFS='\t' '{
    delete seen;
    s = "";                       # reset the rebuilt field for every line
    n = split($12, a, /[,; ]+/);
    for (i=1; i<=n; i++) {
        if (!(a[i] in seen)) {
            seen[a[i]];
            s = sprintf("%s%s,", s, a[i])
        }
    }
    sub(/,$/, "", s);             # drop the trailing comma
    $12 = s} 1' file
GO:0042302
GO:0004386,GO:0005524,GO:0006281
In your example data, a comma followed by a space is the delimiter between the repeated groups inside the 12th field. Every subfield after that is merely a repeat of the first. The subfields appear to already be in sorted order.
GO:0042302, GO:0042302, GO:0042302
^^^dup1^^^ ^^^dup2^^^
GO:0004386,GO:0005524,GO:0006281, GO:0004386,GO:0005524,GO:0006281
^^^^^^^^^^^^^^^dup1^^^^^^^^^^^^^
Based on that, you could simply keep the first of the subfields and toss the rest:
awk -F"\t" '{sub(/, .*/, "", $12)} 1' fileA
If instead, you can have different sets of repeated subfields, where keys are not sorted like this:
GO:0042302, GO:0042302, GO:0042302, GO:0062122,GO:0055000, GO:0055001, GO:0062122,GO:0055000
GO:0004386,GO:0005524,GO:0006281, GO:0005525, GO:0004386,GO:0005524,GO:0006281
If you were stuck with the default macOS awk, you could add sort/uniq functions in an awk executable script:
#!/usr/bin/awk -f
BEGIN { FS = OFS = "\t" }
{
    c = uniq(a, split($12, a, /, |,/))
    sort(a, c)
    s = a[1]
    for (i = 2; i <= c; i++) { s = s "," a[i] }
    $12 = s
}
1   # print out the modified line

# take an indexed arr as from split and de-dup it
function uniq(arr, len,   i, uarr, k) {
    for (i = len; i >= 1; i--) { uarr[arr[i]] }
    delete arr
    for (k in uarr) { arr[++i] = k }
    return( i )
}
# slightly modified from
# http://rosettacode.org/wiki/Sorting_algorithms/Bubble_sort#AWK
function sort(arr, len,   haschanged, tmp, i)
{
    haschanged = 1
    while ( haschanged==1 ) {
        haschanged = 0
        for (i=1; i<=(len-1); i++) {
            if ( arr[i] > arr[i+1] ) {
                tmp = arr[i]
                arr[i] = arr[i + 1]
                arr[i + 1] = tmp
                haschanged = 1
            }
        }
    }
}
If you had GNU-awk, I think you could swap out the sort(a, c) call with asort(a), and drop the bubble-sort local function completely.
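That swap is untested here, but would look something like this (asort() sorts the values of a in place and returns the element count):
c = uniq(a, split($12, a, /, |,/))
asort(a)    # gawk built-in, replacing the sort(a, c) call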
I get the following for the 12th field:
GO:0042302,GO:0055000,GO:0055001,GO:0062122
GO:0004386,GO:0005524,GO:0005525,GO:0006281

Transpose CSV data with awk (pivot transformation)

My CSV data looks like this:
Indicator;Country;Value
no_of_people;USA;500
no_of_people;Germany;300
no_of_people;France;200
area_in_km;USA;18
area_in_km;Germany;16
area_in_km;France;17
proportion_males;USA;5.3
proportion_males;Germany;7.9
proportion_males;France;2.4
I want my data to look like this:
Country;no_of_people;area_in_km;proportion_males
USA;500;18;5.3
Germany;300;16;7.9
France;200;17;2.4
There are more Indicators and more countries than listed here.
The files are pretty large (row counts in the five digits).
I looked around for some transpose threads, but nothing matched my situation (also, I'm quite new to awk, so I couldn't adapt the code I found to fit my data).
Thanks for your help.
Regards
Ad
If the set of indicators is fixed and known in advance, you can hard-code them:
awk 'BEGIN{FS=OFS=";"}
     NR>1 {a[$2,$1]=$3; count[$2]}
     END  {for (i in count) print i, a[i,"no_of_people"], a[i,"area_in_km"], a[i,"proportion_males"]}' file
Explanation
BEGIN{FS=OFS=";"} sets the input and output field separators to semicolon.
NR>1 {a[$2,$1]=$3; count[$2]} skips the header, collects the list of countries in the count[] array and the value of each indicator in the a["country","indicator"] array.
END {for (i in count) print i, a[i,"no_of_people"], a[i,"area_in_km"], a[i,"proportion_males"]} prints the summary of the values.
Output
$ awk 'BEGIN{FS=OFS=";"} NR>1 {a[$2,$1]=$3; count[$2]} END {for (i in count) print i, a[i,"no_of_people"], a[i,"area_in_km"], a[i,"proportion_males"]}' file
France;200;17;2.4
Germany;300;16;7.9
USA;500;18;5.3
Update
unfortunately, the number of Indicators is not fixed. Also, they are not named like "Ind1", "Ind2" etc. but are just strings. I clarified my question.
$ awk -v FS=";" '{a[$2,$1]=$3; count[$2]; indic[$1]} END {for (j in indic) printf "%s ", j; printf "\n"; for (i in count) {printf "%s ", i; for (j in indic) printf "%s ", a[i,j]; printf "\n"}}' file
proportion_males no_of_people area_in_km
France 2.4 200 17
Germany 7.9 300 16
USA 5.3 500 18
To get ;-separated output, replace each space with ;:
$ awk -v FS=";" 'NR>1 {a[$2,$1]=$3; count[$2]; indic[$1]} END {for (j in indic) printf "%s ", j; printf "\n"; for (i in count) {printf "%s ", i; for (j in indic) printf "%s ", a[i,j]; printf "\n"}}' file | tr ' ' ';'
proportion_males;no_of_people;area_in_km;
France;2.4;200;17;
Germany;7.9;300;16;
USA;5.3;500;18;
Using awk and maintaining the order of output:
awk -F\; '
NR>1 {
if(!($1 in indicators)) { indicator[++types] = $1 }; indicators[$1]++
if(!($2 in countries)) { country[++num] = $2 }; countries[$2]++
map[$1,$2] = $3
}
END {
printf "%s;" ,"Country";
for(ind=1; ind<=types; ind++) {
printf "%s%s", sep, indicator[ind];
sep = ";"
}
print "";
for(coun=1; coun<=num; coun++) {
printf "%s", country[coun]
for(val=1; val<=types; val++) {
printf "%s%s", sep, map[indicator[val], country[coun]];
}
print ""
}
}' file
Country;no_of_people;area_in_km;proportion_males
USA;500;18;5.3
Germany;300;16;7.9
France;200;17;2.4
