How to Pivot Data Using AWK - shell

From:
DT X Y Z
10 75 0 3
20 100 1 6
30 125 2 9
To:
DT ID VALUE
10 X 75
20 Y 0
30 Z 3
10 X 100
20 Y 1
30 Z 6
10 X 125
20 Y 2
30 Z 9

Here is how it's done:
#my original dataset is separated by "," and have 280 cols
tempfile=dataset.csv;
# Count the data columns: number of commas in the header line, plus one.
col_count=`head -n1 $tempfile | tr -cd "," | wc -c`;
col_count=`expr $col_count + 1`;
# For every value column (4..N): extract its header name, pivot DT plus that
# column out with st.awk, and append "DT,value,colname" rows to New$tempfile.
for i in `seq 4 $col_count`; do
echo $i;
# Build the awk program text "{print $i}" as a shell string so $i expands here.
pt="{print \$"$i"}";
col_name=`head -n 1 $tempfile | sed s'/ //'g | awk -F"," "$pt"`;
# NOTE(review): substr($0,index($0,$1)) is intended to strip any leading
# separator from st.awk's output -- verify against the st.awk in use.
awk -F"," -v header="DT,ID,$col_name" -f st.awk $tempfile | awk 'NR>1 {print substr($0,index($0,$1))",'"$col_name"'"}' | sed 's/ //g' >> New$tempfile;
done;
# file st.awk:
# the code below was found on some stackoverflow page, with some minor changes
BEGIN {
    # Parse the wanted column names (comma-separated in `header`)
    # into the set h: `name in h` is true for every wanted column.
    split(header, a, ",")
    for(i in a) {
        h[a[i]]=2
    }
}
# On the first line of each input file, record which column numbers
# carry one of the wanted header names.
FNR==1{
    split("", cols) # re-init cols for every new input file
    for(i=1;i<=NF;i++) {
        if($i in h) {
            cols[i]=1
        }
    }
    next
}
# Print the selected columns of every data line, comma-separated.
{
    res = ""
    for(i=1;i<=NF;i++) {
        if(i in cols) {
            # Separator only BETWEEN fields, never before the first one.
            # (The original computed `s` but then appended "," un-
            # conditionally, leaving `s` unused and giving every output
            # line a spurious leading comma.)
            s = res ? "," : ""
            res = res s $i
        }
    }
    if (res) {
        print res
    }
}

You can try this awk (MAWK Version 1.2)
Your data can be 5x5 or more.
mawk -v OFS='\t' '
# Header line: remember the column names in ID[1..NF-1] and emit the
# output header "DT  ID  VALUE".
NR==1 {
nbfield=(NF-1)
for(i=1;i<NF;i++)
ID[i]=$(i+1)
print $1 OFS "ID" OFS "VALUE"
next
}
# Buffer each data row under its 1-based position within the current block
# of nbfield rows; ID[0] is unset, so the empty string keys column 1 values.
{
numrecord=((NR-1)%nbfield)
numrecord = numrecord ? numrecord : nbfield
for(i=0;i<=nbfield;i++)
val[ID[i],numrecord]=$(i+1)
}
# Once a full block of nbfield rows is buffered, emit the pivoted
# DT / ID / VALUE triples.
# NOTE(review): assumes the number of data columns equals the number of
# rows buffered per block -- confirm before using on non-square inputs.
numrecord==nbfield {
for(i=1;i<=nbfield;i++)
for(j=1;j<=nbfield;j++)
print val[ID[0],j] OFS ID[j] OFS val[ID[j],i]
}
' infile

Input:
-- ColOne ColTwo ColThr
RowOne A B C D E
RowTwo F G H I J
RowThr K L M N O
RowFor P Q R S T
RowFiv U V W X Y
Output:
RowNbr | ColNbr | RowColVal
------ | ------ | ---------
RowOne | ColOne | A
RowOne | ColTwo | B
RowOne | ColThr | C
RowTwo | ColOne | F
RowTwo | ColTwo | G
RowTwo | ColThr | H
RowThr | ColOne | K
RowThr | ColTwo | L
RowThr | ColThr | M
Pivot script:
# pivot a table
BEGIN { # before processing input lines, emit output header
OFS = " | " # set the output field-separator
fmtOutDtl = "%6s | %-6s | %-9s" "\n" # output format for all detail lines: InpRowHdr, InpColHdr, InpVal
fmtOutHdr = "%6s | ColNbr | RowColVal" "\n" # output format for the header line
strOutDiv = "------ | ------ | ---------" # the divider line
print "" # emit blank line before output
} # done with output header
NR == 1 { # when we are on the input header line / the first row
FldCnt = ( NF - 1 ) # number of columns to process: fields on this row, except the first val
for( idxCol = 1; idxCol < NF; idxCol++ ) # scan col numbers after the first, ignoring the first val
ColHds[ idxCol ] = $( idxCol + 1 ) # store the next col-val as this ColHdr
printf( fmtOutHdr, "RowNbr" ) # emit header line: RowNbr-header, input column headers
print strOutDiv # emit divider row after header line and before data lines
next # skip to the next input row
} # done with first input row
{ # for each body input row
RecNbr = ( ( NR - 1 ) % FldCnt ) # row position within the current block: zero-based / 0..[number_of_cols-1]
RecNbr = RecNbr ? RecNbr : FldCnt # promote from zero-based to one-based: 0 => [number of fields]
for( idxCol = 0; idxCol <= FldCnt; idxCol++ ) # scan col numbers including the first
Rws[ ColHds[ idxCol ], RecNbr ] = $( idxCol + 1 ) # buffer this row+col val in this Row position under this ColHdr
} # done with this body input row
RecNbr == FldCnt { # when we are on the last input row that we are processing (lines beyond FldCnt are not emitted)
for( idxCol = 1; idxCol <= FldCnt; idxCol++ ) { # scan col numbers after the first
for( idxRow = 1; idxRow <= FldCnt; idxRow++ ) { # scan row numbers after the first, up to number of cols
printf( fmtOutDtl \
,Rws[ ColHds[ 0 ] , idxCol ] \
, ColHds[ idxRow ] \
,Rws[ ColHds[ idxRow ] , idxCol ] ) # emit input rowHdr, colHdr, row+col val
} # done scanning row numbers
print "" # emit a blank line after each input row
} # done scanning col numbers
} # done with the last input row
END { # after processing input lines
} # do nothing

Related

How to normalize the values of specific columns of a csv with awk?

I have a csv with several variables and I would like to normalize only some specific columns using the standard deviation.
The value minus the mean of the variable divided by the standard deviation of the variable.
The file is comma separated and the transformations needs to be done only with awk to the variables months_loan_duration and amount.
The input would look like this but with a thousand rows:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,6,critical,radio/tv,1169.53
1 - 200 DM,48,repaid,radio/tv,5951.78
,12,critical,education,2096.23
And the output would be like this:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-1.236,critical,radio/tv,-0.745
1 - 200 DM,2.248,repaid,radio/tv,0.95
,-0.738,critical,education,-0.417
So far I have tried the following unsuccessfully:
#! /usr/bin/awk -f
# NOTE(review): kept verbatim as the non-working attempt; the reasons it
# fails are annotated below.
BEGIN{FS=","; OFS=",";numberColumn=NF}
# BUG: NF is 0 inside BEGIN (no record has been read yet), so
# numberColumn is 0 and every for-loop below runs zero times.
NR!=1
# BUG: `NR!=1` on its own line is a complete pattern with a default
# print action; the brace block below is a separate rule run for ALL lines.
{
for(i=1;i <= numberColumn;i++)
{
total[i]+=$i;
totalSquared[i]+=$i^2;
}
# BUG: mean/stddev here reflect only the rows seen SO FAR; a single
# pass cannot normalize earlier rows against the whole file's stats.
for (i=1;i <= numberColumn;i++)
{
avg[i]=total[i]/(NR-1);
std[i]=sqrt((totalSquared[i]/(NR-1))-avg[i]^2);
}
for (i=1;i <= numberColumn;i++)
{
norm[i]=(($i-avg[i])/std[i])
}
}
{
# BUG: $norm[2] is field indirection ($(norm[2])), and the bare 3,4
# print the literal numbers 3 and 4, not fields $3 and $4.
print $1,$norm[2],3,4,$norm[5]
}
It will be easier to read the file twice:
awk -F, -v OFS=, '
# Pass 1 (NR==FNR): accumulate sums and sums-of-squares for cols 2 and 5.
NR==FNR {
    if (FNR > 1) {
        sumDur += $2                  # running sum, months_loan_duration
        sqDur  += $2 * $2             # running sum of squares
        sumAmt += $5                  # running sum, amount
        sqAmt  += $5 * $5             # running sum of squares
        cnt++                         # number of data rows
    }
    next
}
# Pass 2, header line: derive mean and sample stdev for both columns,
# clamping tiny negative variances caused by floating-point rounding,
# then reprint the header unchanged.
FNR==1 {
    meanDur = sumDur / cnt
    varDur = sqDur / (cnt - 1) - meanDur * meanDur * cnt / (cnt - 1)
    if (varDur < 0) varDur = 0
    sdDur = sqrt(varDur)
    meanAmt = sumAmt / cnt
    varAmt = sqAmt / (cnt - 1) - meanAmt * meanAmt * cnt / (cnt - 1)
    if (varAmt < 0) varAmt = 0
    sdAmt = sqrt(varAmt)
    print
}
# Pass 2, data lines: replace cols 2 and 5 with their z-scores
# (skipped when the stdev is zero, to avoid dividing by zero).
FNR>1 {
    if (sdDur > 0) $2 = ($2 - meanDur) / sdDur
    if (sdAmt > 0) $5 = ($5 - meanAmt) / sdAmt
    print
}
' input_file.csv input_file.csv
Output:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-0.704361,critical,radio/tv,-0.750328
1 - 200 DM,1.14459,repaid,radio/tv,1.13527
,-0.440225,critical,education,-0.384939
Please note the calculated values differ from your expected result.
Thousands of rows isn't all that big a file for awk: you might as well load it all in at once. Here I created a 23.6 mn row synthetic version of it (tested on both gawk and mawk) -
while overall performance is similar to other solutions, this code avoids having to explicitly list the input file twice to perform its equivalent of 2-pass processing
INPUT
rows = 23,622,127. | UTF8 chars = 799192890. | bytes = 799192890.
1 checking_balance,months_loan_duration,credit_history,purpose,amount
2 < 0 DM,889,critical,luna,758.61
3 ,150,critical,terra,1823.93
4 1 - 200 DM,883,repaid,stablecoin,2525.55
5 1 - 200 DM,65,repaid,terra,2405.67
6 < 0 DM,9,critical,luna,4059.34
7 < 0 DM,201,critical,stablecoin,5043
8 1 - 200 DM,549,repaid,terra,471.92
9 < 0 DM,853,critical,stablecoin,422.78
10 < 0 DM,659,critical,luna,684.94
CODE
# gawk profile, created Tue May 24 04:11:02 2022
'function abs(_) {
return \
+_<-_?-_:_
} BEGIN {
split(_____=(_=length(FS = RS = "^$"))+_,____,"")
}
END {
1 gsub("\n", ",&")
1 FS = "["(OFS= ",")"]"
1 $!_ = $!( __ = _)
1 __+= --NF
23622126 while ((_____+_) < (__-=_)) {
23622126 ____[___=_____] += ($__)^_
23622126 ____[ --___ ] += ($__)
23622126 ____[___ * _] += -_^!_
23622126 ____[___-=+_] += ($(__-=_+_^!_))
23622126 ____[ ++___ ] += ($__)^_
}
1 ___ = (__=-____[_+_+_])-_^!_
1 RS = -(abs((____[(_)]/___-(((NR=____[+_^!+_]/__)^_)*__/___)))^_^(_/-_)
___ = -(abs((____[_+_]/___-(((RT=____[_+_^!_]/__)^_)*__/___)))^_^(_/-_)
1 ORS = "\n"
1 gsub(ORS, "")
1 OFS = ","
1 print $(_^=_<_), $(__=++_), $++_, $++_, $++_
1 OFMT = "%."(__*__+!(__=NF-__-__))"f"
23622126 while (++_ <= __) {
23622126 print $_, (NR-$++_)/RS, $++_, $++_, (RT-$++_)/___
}
}'
OUTPUT
out9: 837MiB 0:00:28 [29.2MiB/s] [29.2MiB/s] [ <=> ]
in0: 762MiB 0:00:00 [2.95GiB/s] [2.95GiB/s] [======>] 100%
( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )
26.98s user 1.58s system 99% cpu 28.681 total
23622127 878032266 878032266 testfile_stdnorm_test_004.txt_out.txt
1 checking_balance,months_loan_duration,credit_history,purpose,amount
2 < 0 DM,1.2000,critical,luna,-1.2939
3 ,-1.2949,critical,terra,-0.6788
4 1 - 200 DM,1.1798,repaid,stablecoin,-0.2737
5 1 - 200 DM,-1.5818,repaid,terra,-0.3429
6 < 0 DM,-1.7709,critical,luna,0.6119
7 < 0 DM,-1.1227,critical,stablecoin,1.1798
8 1 - 200 DM,0.0522,repaid,terra,-1.4594
9 < 0 DM,1.0785,critical,stablecoin,-1.4878
ALTERNATE SOLUTION OPTIMIZED FOR SMALLER INPUTS (e.g. up to 10^6 (1 mn) rows)
# gawk profile, created Tue May 24 06:19:24 2022
# BEGIN rule(s)
BEGIN {
1 __ = (FS = RS = "^$") * (ORS = "")
}
# END rule(s)
END {
1 _ = $__
1 gsub("[\n][,]","\n_,",_)
1 sub("^.+amount\n","",_)+gsub("[,][0-9.+-]+[,\n]", "\3&\1", _)
1 _____ = "[^0-9.+-]+"
1 gsub("^" (_____) "|\1[^\1\3]+\3","",_)
1 _____ = __ = split(_,___,_____)
1048575 while (-(--__) < +__) {
1048575 ___["_"] += _=___[(__)]
1048575 ___["="] += _*_
1048575 ___["~"] += _=___[--__]
1048575 ___["^"] += _*_
1048575 ___[":"]++
}
1 _ = (__=___[":"])-(____ ^= _<_)
1 ++____
1 ___["}"] = -(abs((___["^"]/_)-(((___["{"] = ___["~"] / __)^____)*__/_)))^____^(-(_^(!_)))
1 ___[")"] = -(abs((___["="]/_)-(((___["("] = ___["_"] / __)^____)*__/_)))^____^(-(_^(!_)))
1 if (_ < _) {
for (_ in ___) {
print "debug", _, ___[_]
}
}
1 ____ = split($(_ < _), ______, ORS = "\n")
1 _ = index(FS = "[" (OFS = ",") "]", OFS)
1 print ______[_ ^ (! _)]
1048574 for (__ += __ ^= _ < _; __ < ____; __++) {
1048574 print sprintf("%.*s%s,%+.*f,%s,%s,%+.*f", ! __, $! _ = ______[__], $(_ ~ _), _ + _, (___["{"] - $_) / ___["}"], $++_, $(--_ + _), _ + _, (___["("] - $NF) / ___[")"])
}
}
# Functions, listed alphabetically
2 function abs(_)
{
2 return (+_ < -_ ? -_ : _)
}
PERFORMANCE OF SOLUTION # 2 : End-to-End 2.57 secs for 2^20 rows
rows = 1048575. | UTF8 chars = 39912117. | bytes = 39912117.
( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )
2.46s user 0.13s system 100% cpu 2.573 total

How to outer-join two CSV files, using shell script?

I have two CSV files, like the following:
file1.csv
label,"Part-A"
"ABC mn","2.0"
"XYZ","3.0"
"PQR SN","6"
file2.csv
label,"Part-B"
"XYZ","4.0"
"LMN Wv","8"
"PQR SN","6"
"EFG","1.0"
Desired Output.csv
label,"Part-A","Part-B"
"ABC mn","2.0",NA
"EFG",NA,"1.0"
"LMN Wv",NA,"8"
"PQR SN","6","6"
"XYZ","3.0","4.0"
Currently with the below awk command i am able to combine the matching one's which have entries for label in both the files like PQR and XYZ but unable to append the ones that are not having label values present in both the files:
awk -F, 'NR==FNR{a[$1]=substr($0,length($1)+2);next} ($1 in a){print $0","a[$1]}' file1.csv file2.csv
This solution prints exactly the desired result with any AWK.
Please note that the sorting algorithm is taken from the mawk manual.
# SO71053039.awk
#-------------------------------------------------
# insertion sort of A[1..n]
# Copies the INDICES of A into A_SWAP[1..n], insertion-sorts them
# ascending as strings, and returns n. A itself is left untouched.
function isort( A,A_SWAP, n,i,j,hold ) {
n = 0
for (j in A)
A_SWAP[++n] = j
for( i = 2 ; i <= n ; i++)
{
hold = A_SWAP[j = i]
# The `"" > ""` concatenations force a STRING comparison; the loop
# stops at j==1 because the auto-created A_SWAP[0] is "" (sentinel).
while ( A_SWAP[j-1] "" > "" hold )
{ j-- ; A_SWAP[j+1] = A_SWAP[j] }
A_SWAP[j] = hold
}
# sentinel A_SWAP[0] = "" will be created if needed
return n
}
BEGIN {
FS = OFS = ","
out = "Output.csv"
# read file 1: map header names to column numbers, then record each
# label (as a key of LABEL_KEY) and its Part-A value
fnr = 0
while ((getline < ARGV[1]) > 0) {
++fnr
if (fnr == 1) {
for (i=1; i<=NF; i++)
FIELDBYNAME1[$i] = i # e.g. FIELDBYNAME1["label"] = 1
}
else {
LABEL_KEY[$FIELDBYNAME1["label"]] # bare reference creates the key; value unused
LABEL_KEY1[$FIELDBYNAME1["label"]] = $FIELDBYNAME1["\"Part-A\""]
}
}
close(ARGV[1])
# read file2: same again, collecting the Part-B values
fnr = 0
while ((getline < ARGV[2]) > 0) {
++fnr
if (fnr == 1) {
for (i=1; i<=NF; i++)
FIELDBYNAME2[$i] = i # e.g. FIELDBYNAME2["label"] = 1
}
else {
LABEL_KEY[$FIELDBYNAME2["label"]]
LABEL_KEY2[$FIELDBYNAME2["label"]] = $FIELDBYNAME2["\"Part-B\""]
}
}
close(ARGV[2])
# print the header
print "label" OFS "\"Part-A\"" OFS "\"Part-B\"" > out
# get the result: sort the union of labels, then emit each row,
# substituting "NA" for whichever side is missing
z = isort(LABEL_KEY, LABEL_KEY_SWAP)
for (i = 1; i <= z; i++) {
result_string = sprintf("%s", LABEL_KEY_SWAP[i])
if (LABEL_KEY_SWAP[i] in LABEL_KEY1)
result_string = sprintf("%s", result_string OFS LABEL_KEY1[LABEL_KEY_SWAP[i]] OFS (LABEL_KEY_SWAP[i] in LABEL_KEY2 ? LABEL_KEY2[LABEL_KEY_SWAP[i]] : "NA"))
else
result_string = sprintf("%s", result_string OFS "NA" OFS LABEL_KEY2[LABEL_KEY_SWAP[i]])
print result_string > out
}
}
Call:
awk -f SO71053039.awk file1.csv file2.csv
=> result file Output.csv with content:
label,"Part-A","Part-B"
"ABC mn","2.0",NA
"EFG",NA,"1.0"
"LMN Wv",NA,"8"
"PQR SN","6","6"
"XYZ","3.0","4.0"
I would like to introduce Miller to you. It is a tool that can do a few things with a few file formats and that is available as a stand-alone binary. You just have to download the archive, put the mlr executable somewhere (preferably in your PATH) and you're done with the installation.
mlr --csv \
join -f file1.csv -j 'label' --ul --ur \
then \
unsparsify --fill-with 'NA' \
then \
sort -f 'label' \
file2.csv
Command parts:
mlr --csv
means that you want to read CSV files and output a CSV format. As an other example, if you want to read CSV files and output a JSON format it would be mlr --icsv --ojson
join -f file1.csv -j 'label' --ul --ur ...... file2.csv
means to join file1.csv and file2.csv on the field label and emit the unmatching records of both files
then is Miller's way of chaining operations
unsparsify --fill-with 'NA'
means to create the fields that didn't exist in each file and fill them with NA. It's needed for the records that had a uniq label
then sort -f 'label'
means to sort the records on the field label
Regarding the updated question: mlr handles the CSV quoting on its own. The only difference with your new expected output is that it removes the superfluous quotes:
label,Part-A,Part-B
ABC mn,2.0,NA
EFG,NA,1.0
LMN Wv,NA,8
PQR SN,6,6
XYZ,3.0,4.0
awk -v OFS=, '{
# First sighting of a label: remember this value, default the other
# column to NA; second sighting: fill in the second value.
# NOTE(review): both files share one pass, so a label appearing only in
# file2 lands its value in the Part-A column (see LMN/EFG in the output
# below) -- columns are positional, not per-file. Also, `!o1[$1]` tests
# the VALUE, so a value of 0 or "" would be treated as unseen.
if(!o1[$1]) { o1[$1]=$NF; o2[$1]="NA" } else { o2[$1]=$NF }
}
END{
# for..in traversal order is unspecified, hence the unsorted output.
for(v in o1) { print v, o1[v], o2[v] }
}' file{1,2}
## output
LMN,8,NA
ABC,2,NA
PQR,6,6
EFG,1,NA
XYZ,3,4
I think this will do nicely.
We suggest a gawk script (gawk being the standard awk on Linux):
script.awk
NR == FNR { # file 1: seed each label with its value plus "na" for file 2
valsStr = sprintf("%s,%s", $2, "na");
rowsArr[$1] = valsStr;
}
NR != FNR && $1 in rowsArr { # file 2, label already seen: replace the "na"
split(rowsArr[$1],valsArr);
valsStr = sprintf("%s,%s", valsArr[1], $2);
rowsArr[$1] = valsStr;
next;
}
NR != FNR { # file 2 only: "na" goes in file 1's column
valsStr = sprintf("%s,%s", "na", $2);
rowsArr[$1] = valsStr;
}
END {
# The header row was captured under the key "label"; print it first,
# then the data rows (for..in order is unspecified, so output is unsorted).
printf("%s,%s\n", "label", rowsArr["label"]);
for (rowName in rowsArr) {
if (rowName == "label") continue;
printf("%s,%s\n", rowName, rowsArr[rowName]);
}
}
output:
awk -F, -f script.awk input.{1,2}.txt
label,Part-A,Part-B
LMN,na,8
ABC,2,na
PQR,6,6
EFG,na,1
XYZ,3,4
Since your question was titled with "how to do ... in a shell script?" and not necessarily with awk, I'm going to recommend GoCSV, a command-line tool with several sub-commands for processing CSVs (delimited files).
It doesn't have a single command that can accomplish what you need, but you can compose a number of commands to get the correct result.
The core of this solution is the join command which can perform inner (default), left, right, and outer joins; you want an outer join to keep the non-overlapping elements:
gocsv join -c 'label' -outer file1.csv file2.csv > joined.csv
echo 'Joined'
gocsv view joined.csv
Joined
+-------+--------+-------+--------+
| label | Part-A | label | Part-B |
+-------+--------+-------+--------+
| ABC | 2 | | |
+-------+--------+-------+--------+
| XYZ | 3 | XYZ | 4 |
+-------+--------+-------+--------+
| PQR | 6 | PQR | 6 |
+-------+--------+-------+--------+
| | | LMN | 8 |
+-------+--------+-------+--------+
| | | EFG | 1 |
+-------+--------+-------+--------+
The data-part is correct, but it'll take some work to get the columns correct, and to get the NA values in there.
Here's a complete pipeline:
gocsv join -c 'label' -outer file1.csv file2.csv \
| gocsv rename -c 1 -names 'Label_A' \
| gocsv rename -c 3 -names 'Label_B' \
| gocsv add -name 'label' -t '{{ list .Label_A .Label_B | compact | first }}' \
| gocsv select -c 'label','Part-A','Part-B' \
| gocsv replace -c 'Part-A','Part-B' -regex '^$' -repl 'NA' \
| gocsv sort -c 'label' \
> final.csv
echo 'Final'
gocsv view final.csv
which gets us the correct, final, file:
Final pipeline
+-------+--------+--------+
| label | Part-A | Part-B |
+-------+--------+--------+
| ABC | 2 | NA |
+-------+--------+--------+
| EFG | NA | 1 |
+-------+--------+--------+
| LMN | NA | 8 |
+-------+--------+--------+
| PQR | 6 | 6 |
+-------+--------+--------+
| XYZ | 3 | 4 |
+-------+--------+--------+
There's a lot going on in that pipeline, the high points are:
Merge the two label fields
| gocsv rename -c 1 -names 'Label_A' \
| gocsv rename -c 3 -names 'Label_B' \
| gocsv add -name 'label' -t '{{ list .Label_A .Label_B | compact | first }}' \
Pare-down to just the 3 columns you want
| gocsv select -c 'label','Part-A','Part-B' \
Add the NA values and sort by label
| gocsv replace -c 'Part-A','Part-B' -regex '^$' -repl 'NA' \
| gocsv sort -c 'label' \
I've made a step-by-step explanation at this Gist.
You mentioned join in the comment on my other answer, and I'd forgotten about this utility:
#!/bin/sh
rm -f *sorted.csv
# Outer-join the two files with join(1):
# - `-a 1 -a 2`: keep "unpaired lines" from file 1 and file 2
# - `-1 1 -2 1`: the first column of each file is the join column
# - `-o 0,1.2,2.2`: output the join column (0) plus field 2 of each file
join -a 1 -a 2 -1 1 -2 1 -o '0,1.2,2.2' -t, file1.csv file2.csv > joined.csv
# Fill the empty slots left by unpaired lines with NA
# (middle field empty => ",,"; trailing field empty => ",$")
sed -e 's/,,/,NA,/' -e 's/,$/,NA/' joined.csv > unsorted.csv
# Keep the header on top, then sort the remaining rows by label
head -n 1 unsorted.csv > sorted.csv
tail -n +2 unsorted.csv | sort -t, -k 1 >> sorted.csv
And, here's sorted.csv
+--------+--------+--------+
| label | Part-A | Part-B |
+--------+--------+--------+
| ABC mn | 2.0 | NA |
+--------+--------+--------+
| EFG | NA | 1.0 |
+--------+--------+--------+
| LMN Wv | NA | 8 |
+--------+--------+--------+
| PQR SN | 6 | 6 |
+--------+--------+--------+
| XYZ | 3.0 | 4.0 |
+--------+--------+--------+
As @Fravadona correctly stated in his comment, for CSV files that can contain the delimiter, a newline or double quotes inside a field, a proper CSV parser is needed.
Actually, only two functions are needed: One for unquoting CSV fields to normal AWK fields and one for quoting the AWK fields to write the data back to CSV fields.
I have written a variant of my previous answer (https://stackoverflow.com/a/71056926/18135892) that uses Ed Morton's CSV parser (https://stackoverflow.com/a/45420607/18135892 with the gsub variant which works with any AWK version) to give an example of proper CSV parsing:
This solution prints the desired result correctly sorted with any AWK.
Please note that the sorting algorithm is taken from the mawk manual.
# SO71053039_2.awk
# unquote CSV:
# Ed Morton's CSV parser: https://stackoverflow.com/a/45420607/18135892
# Accumulate $0 into CurrRec until it holds one complete CSV record (an
# even number of double quotes), then split that record into $1..$NF with
# each field unquoted. Returns 1 when a full record is ready, else 0.
function buildRec( fpat,fldNr,fldStr,done) {
CurrRec = CurrRec $0
if ( gsub(/"/,"&",CurrRec) % 2 ) {
# The string built so far in CurrRec has an odd number
# of "s and so is not yet a complete record.
CurrRec = CurrRec RS
done = 0
}
else {
# If CurrRec ended with a null field we would exit the
# loop below before handling it so ensure that cannot happen.
# We use a regexp comparison using a bracket expression here
# and in fpat so it will work even if FS is a regexp metachar
# or a multi-char string like "\\\\" for \-separated fields.
CurrRec = CurrRec ( CurrRec ~ ("[" FS "]$") ? "\"\"" : "" )
$0 = ""
fpat = "([^" FS "]*)|(\"([^\"]|\"\")+\")"
while ( (CurrRec != "") && match(CurrRec,fpat) ) {
fldStr = substr(CurrRec,RSTART,RLENGTH)
# Convert <"foo"> to <foo> and <"foo""bar"> to <foo"bar>
if ( sub(/^"/,"",fldStr) && sub(/"$/,"",fldStr) ) {
gsub(/""/, "\"", fldStr)
}
$(++fldNr) = fldStr
CurrRec = substr(CurrRec,RSTART+RLENGTH+1)
}
CurrRec = ""
done = 1
}
return done
}
# quote CSV:
# Quote according to https://datatracker.ietf.org/doc/html/rfc4180 rules
# RFC 4180 quoting: a field containing the separator, a double quote,
# CR or LF must be wrapped in quotes, with embedded quotes doubled.
function csvQuote(field, sep) {
    if (field !~ sep && field !~ /["\r\n]/)
        return field            # nothing special -- leave the field bare
    gsub(/"/, "\"\"", field)
    return "\"" field "\""
}
#-------------------------------------------------
# insertion sort of A[1..n]
# Copies the INDICES of A into A_SWAP[1..n], insertion-sorts them
# ascending as strings, and returns n. A itself is left untouched.
function isort( A,A_SWAP, n,i,j,hold ) {
n = 0
for (j in A)
A_SWAP[++n] = j
for( i = 2 ; i <= n ; i++)
{
hold = A_SWAP[j = i]
# `"" > ""` forces a string comparison; the auto-created empty
# A_SWAP[0] acts as the sentinel that terminates the loop at j==1.
while ( A_SWAP[j-1] "" > "" hold )
{ j-- ; A_SWAP[j+1] = A_SWAP[j] }
A_SWAP[j] = hold
}
# sentinel A_SWAP[0] = "" will be created if needed
return n
}
BEGIN {
FS = OFS = ","
# read file 1: buildRec() assembles complete (possibly multi-line) CSV
# records and unquotes the fields before we index them by header name
fnr = 0
while ((getline < ARGV[1]) > 0) {
if (! buildRec())
continue
++fnr
if (fnr == 1) {
for (i=1; i<=NF; i++)
FIELDBYNAME1[$i] = i # e.g. FIELDBYNAME1["label"] = 1
}
else {
LABEL_KEY[$FIELDBYNAME1["label"]] # bare reference creates the key
LABEL_KEY1[$FIELDBYNAME1["label"]] = $FIELDBYNAME1["Part-A"]
}
}
close(ARGV[1])
# read file2: same again, collecting the Part-B values
fnr = 0
while ((getline < ARGV[2]) > 0) {
if (! buildRec())
continue
++fnr
if (fnr == 1) {
for (i=1; i<=NF; i++)
FIELDBYNAME2[$i] = i # e.g. FIELDBYNAME2["label"] = 1
}
else {
LABEL_KEY[$FIELDBYNAME2["label"]]
LABEL_KEY2[$FIELDBYNAME2["label"]] = $FIELDBYNAME2["Part-B"]
}
}
close(ARGV[2])
# print the header
print "label" OFS "Part-A" OFS "Part-B"
# get the result: sort the union of labels; re-quote each emitted field
# per RFC 4180 and substitute "NA" for whichever side is missing
z = isort(LABEL_KEY, LABEL_KEY_SWAP)
for (i = 1; i <= z; i++) {
result_string = sprintf("%s", csvQuote(LABEL_KEY_SWAP[i], OFS))
if (LABEL_KEY_SWAP[i] in LABEL_KEY1)
result_string = sprintf("%s", result_string OFS csvQuote(LABEL_KEY1[LABEL_KEY_SWAP[i]], OFS) OFS (LABEL_KEY_SWAP[i] in LABEL_KEY2 ? csvQuote(LABEL_KEY2[LABEL_KEY_SWAP[i]], OFS) : "NA"))
else
result_string = sprintf("%s", result_string OFS "NA" OFS csvQuote(LABEL_KEY2[LABEL_KEY_SWAP[i]], OFS))
print result_string
}
}
Call:
awk -f SO71053039_2.awk file1.csv file2.csv
=> result (superfluous quotes according to CSV rules are omitted):
label,Part-A,Part-B
ABC mn,2.0,NA
EFG,NA,1.0
LMN Wv,NA,8
PQR SN,6,6
XYZ,3.0,4.0

use awk to match rows for each column

How can awk be used to find values that match in row 2 for each column?
I would like to take in a tab-delimited file and, for each column, if any row below row 2 matches what is in row 2, print the field with "match".
transforming this tab delimited file:
header1 | header2 | header3
1 | 1 | B
--------+---------+----------
3 | 1 | A
2 | A | B
1 | B | 1
To this:
header1 | header2 | header3
1 | 1 | B
--------+---------+----------
3 | 1 match | A
2 | A | B match
1 match | B | 1
I would go for something like this:
$ cat file
header1 header2 header3
1 1 B
3 1 A
2 A B
1 B 1
$ awk -v OFS='\t' 'NR == 2 { for (i=1; i<=NF; ++i) a[i] = $i }
NR > 2 { for(i=1;i<=NF;++i) if ($i == a[i]) $i = $i " match" }1' file
header1 header2 header3
1 1 B
3 1 match A
2 A B match
1 match B 1
On the second line, populate the array a with the contents of each field. On subsequent lines, add "match" when they match the corresponding value in the array. The 1 at the end is a common shorthand causing each line to be printed. Setting the output field separator OFS to a tab character preserves the format of the data.
Pedantically, with GNU Awk 4.1.1:
awk -f so.awk so.txt
header1 header2 header3
1 1 B
3 1* A
2 A B*
1* B 1
with so.awk:
{
if(1 == NR) {
print $0; # header row: pass through unchanged
} else if(2 == NR) {
# row 2 holds the reference values; remember them per column
for(i = 1; i <= NF; i++) {
answers[i]=$i;
}
print $0;
} else {
# data rows: append "*" to any field equal to row 2's value
for(i = 1; i <= NF; i++) {
field = $i;
if(answers[i]==$i) {
field = field "*" # a match
}
printf("%s\t",field);
# NOTE(review): this leaves a trailing tab at the end of every data line
}
printf("%s", RS);
}
}
and so.txt as a tab delimited data file:
header1 header2 header3
1 1 B
3 1 A
2 A B
1 B 1
This isn't homework, right...?

Sort values and output the indices of their sorted columns

I've got a file that looks like:
20 30 40
80 70 60
50 30 40
Each column represents a procedure. I want to know how the procedures did for each row. My ideal output would be
3 2 1
1 2 3
1 3 2
i.e. in row 1, the third column had the highest value, followed by the second, then the first smallest (this can be reversed, doesn't matter).
How would I do this?
I'd do it with some other Unix tools (read, cat, sort, cut, tr, sed, and bash of course):
while read line
do
    # One rank list per input line: put each value on its own numbered
    # line, sort by value descending, keep only the original positions,
    # and re-join them onto a single output line.
    echo "$line" | tr ' ' '\n' | cat -n | sort -r -k +2 | cut -f1 | tr '\n' ' '
    echo
done < input.txt
The output looks like this:
3 2 1
1 2 3
1 3 2
Another solution using Python:
$ python
Python 2.7.6 (default, Jan 26 2014, 17:25:18)
[GCC 4.2.1 Compatible Apple LLVM 5.0 (clang-500.2.79)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>>
>>> with open('file.txt') as f:
... lis=[x.split() for x in f]
...
>>> for each in lis:
... each = [i[0] + 1 for i in sorted(enumerate(each), key=lambda x:x[1], reverse=True)]
... print ' '.join([str(item) for item in each])
...
3 2 1
1 2 3
1 3 2
Using Gnu Awk version 4:
$ awk 'BEGIN{ PROCINFO["sorted_in"]="@val_num_desc" } # gawk: traverse arrays by value, numerically, descending
{
split($0,a," ")
# indices are visited in order of decreasing value, so printing the
# index sequence yields each column's rank
for (i in a) printf "%s%s", i,OFS
print ""
}' file
3 2 1
1 2 3
1 3 2
If you have GNU awk then you can do something like:
awk '{
# reset per-line state (gawk allows delete on whole arrays)
y = a = x = j = i = 0;
delete tmp;
delete num;
delete ind;
# key "value SUBSEP column" keeps duplicate values distinct
for(i = 1; i <= NF; i++) {
num[$i, i] = i
}
x = asorti(num) # gawk: sort the keys (as strings) into num[1..x]
# NOTE(review): asorti compares keys as STRINGS, so mixed-width numbers
# (e.g. "9" vs "10") will not rank numerically -- fine for the sample.
for(y = 1; y <= x; y++) {
split(num[y], tmp, SUBSEP)
ind[++j] = tmp[2] # recover the original column number
}
for(a = x; a >= 1; a--) { # walk backwards => descending ranks
printf "%s%s", ind[a],(a==1?"\n":" ")
}
}' file
$ cat file
20 30 40
0.923913 0.913043 0.880435 0.858696 0.826087 0.902174 0.836957 0.880435
80 70 60
50 30 40
awk '{
y = a = x = j = i = 0;
delete tmp;
delete num;
delete ind;
for(i = 1; i <= NF; i++) {
num[$i, i] = i
}
x = asorti(num)
for(y = 1; y <= x; y++) {
split(num[y], tmp, SUBSEP)
ind[++j] = tmp[2]
}
for(a = x; a >= 1; a--) {
printf "%s%s", ind[a],(a==1?"\n":" ")
}
}' file
3 2 1
1 2 6 8 3 4 7 5
1 2 3
1 3 2
Solution via perl
#!/usr/bin/perl
use strict;
use warnings;

# For each whitespace-separated row of input.txt, print the descending
# rank of every value (largest value = rank 1), preserving column order.
# (The original paste had every '@' sigil mangled to '#', which made the
# script a syntax error; sigils restored here.)
open(my $fh, '<', '/home/chidori/input.txt') or die "Can't open file: $!\n";
while (my $line = <$fh>) {
    chomp($line);
    my @unsorted_array = split(/\s/, $line);
    my $count = scalar @unsorted_array;
    my @sorted_array = sort { $a <=> $b } @unsorted_array;
    # Smallest value gets the largest rank number, largest value gets 1.
    # NOTE(review): duplicate values collapse onto one hash key -- confirm
    # the input has distinct values per row.
    my %hash = map { $_ => $count-- } @sorted_array;
    foreach my $value (@unsorted_array) {
        print "$hash{$value} ";
    }
    print "\n";
}
close $fh;

Grouping the rows of a text file based on 2 columns

I have a text file like this:
1 abc 2
1 rgt 2
1 yhj 2
3 gfk 4
5 kji 6
3 plo 4
3 vbn 4
5 olk 6
I want to group the rows on the basis of first and second column like this:
1 abc,rgt,yhj 2
3 gfk,plo,vbn 4
5 kji,olk 6
such that I can see what are the values of col2 for a particular pair of col1, col3.
How can I do this using shell script?
This should do it :
awk -F " " '{ a[$1" "$3]=a[$1" "$3]$2","; }END{ for (i in a)print i, a[i]; }' file.txt | sed 's/,$//g' | awk -F " " '{ tmp=$3;$3=$2;$2=tmp;print }' |sort
Just using awk:
#!/usr/bin/env awk -f
# Group rows by the (col1, col3) pair: collect every col2 value for a
# pair, comma-joined, and print the groups in first-seen order.
{
    key = $1 "\x1C" $3              # \x1C (file-separator char) keeps the compound key unambiguous
    if (key in mid) {
        mid[key] = mid[key] "," $2  # append another col2 value to the group
    } else {
        left[key]  = $1             # first sighting: remember all three columns
        mid[key]   = $2
        right[key] = $3
        order[++n] = key            # and the insertion order
    }
}
END {
    for (m = 1; m <= n; ++m) {
        key = order[m]
        print left[key], mid[key], right[key]
    }
}
One line:
awk '{k=$1"\x1C"$3;if(k in a2){a2[k]=a2[k]","$2}else{a1[k]=$1;a2[k]=$2;a3[k]=$3;b[++i]=k}}END{for(j=1;j<=i;++j){k=b[j];print a1[k],a2[k],a3[k]}}' file
Output:
1 abc,rgt,yhj 2
3 gfk,plo,vbn 4
5 kji,olk 6

Resources