Count and percentage - bash

Using columns 4 and 2, I create a report like the output shown below. My code works fine, but I believe it can be made shorter :).
I have a doubt about the split part:
CNTLM = split("20,30,40,60", LMT)
It works, but it would be better to use exactly the values "10,20,30,40" that appear in column 4.
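For reference, split() breaks a string into an array and returns the element count; a minimal annotated sketch of that line (with the separator given explicitly, since outside the -F"," program the default FS is not a comma):
awk 'BEGIN {
    CNTLM = split("20,30,40,60", LMT, ",")   # LMT[1]="20" ... LMT[4]="60"
    print CNTLM                              # prints 4, the number of bucket limits
}'
Here is the input file: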
4052538693,2910,04-May-2018-22,10
4052538705,2910,04-May-2018-22,10
4052538717,2910,04-May-2018-22,10
4052538729,2911,04-May-2018-22,20
4052538741,2911,04-May-2018-22,20
4052538753,2912,04-May-2018-22,20
4052538765,2912,04-May-2018-22,20
4052538777,2914,04-May-2018-22,10
4052538789,2914,04-May-2018-22,10
4052538801,2914,04-May-2018-22,30
4052539029,2914,04-May-2018-22,20
4052539041,2914,04-May-2018-22,20
4052539509,2915,04-May-2018-22,30
4052539521,2915,04-May-2018-22,30
4052539665,2915,04-May-2018-22,30
4052539677,2915,04-May-2018-22,10
4052539689,2915,04-May-2018-22,10
4052539701,2916,04-May-2018-22,40
4052539713,2916,04-May-2018-22,40
4052539725,2916,04-May-2018-22,40
4052539737,2916,04-May-2018-22,40
4052539749,2916,04-May-2018-22,40
4052539761,2917,04-May-2018-22,10
4052539773,2917,04-May-2018-22,10
Here is the code I use to get the desired output:
printf " Code 10 20 30 40 Total\n" > header
dd=`cat header | wc -L`
awk -F"," '
BEGIN {CNTLM = split ("20,30,40,60", LMT)
cmdsort = "sort -nr"
DASHES = sprintf ("%0*d", '$dd', _)
gsub (/0/, "-", DASHES)
}
{for (IX=1; IX<=CNTLM; IX++) if ($4 <= LMT[IX]) break
CNT[$2,IX]++
COLTOT[IX]++
LNC[$2]++
TOT++
}
END {
print DASHES
for (l in LNC)
{printf "%5d", l | cmdsort
for (IX=1; IX<=CNTLM; IX++) {printf "%9d", CNT[l,IX]+0 | cmdsort
}
printf " = %6d" RS, LNC[l] | cmdsort
}
close (cmdsort)
print DASHES
printf "Total"
for (IX=1; IX<=CNTLM; IX++) printf "%9d", COLTOT[IX]+0
printf " = %6d" RS, TOT
print DASHES
printf "PCT "
for (IX=1; IX<=CNTLM; IX++) printf "%9.1f", COLTOT[IX]/TOT*100
printf RS
print DASHES
}
' file
The output I get:
Code 10 20 30 40 Total
----------------------------------------------------
2917 2 0 0 0 = 2
2916 0 0 0 5 = 5
2915 2 0 3 0 = 5
2914 2 2 1 0 = 5
2912 0 2 0 0 = 2
2911 0 2 0 0 = 2
2910 3 0 0 0 = 3
----------------------------------------------------
Total 9 6 4 5 = 24
----------------------------------------------------
PCT 37.5 25.0 16.7 20.8
----------------------------------------------------
I'd appreciate it if the code can be improved.

without the header and cosmetics...
$ awk -F, '{a[$2,$4]++; k1[$2]; k2[$4]}
END{for(r in k1)
{printf "%5s", r;
for(c in k2) {k1[r]+=a[r,c]; k2[c]+=a[r,c]; printf "%10d", a[r,c]+0}
printf " =%7d\n", k1[r]};
printf "%5s", "Total";
for(c in k2) {sum+=k2[c]; printf "%10d", k2[c]}
printf " =%7d\n", sum}' file | sort -nr   # "Total" evaluates to 0 numerically, so it sorts last
2917 2 0 0 0 = 2
2916 0 0 0 5 = 5
2915 2 0 3 0 = 5
2914 2 2 1 0 = 5
2912 0 2 0 0 = 2
2911 0 2 0 0 = 2
2910 3 0 0 0 = 3
Total 9 6 4 5 = 24
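Regarding the split doubt in the question: if column 4 only ever holds the exact values 10, 20, 30 and 40, you could split on those values themselves and test with equality instead of upper limits, so the LMT entries double as the column labels. A minimal sketch of the counting part under that assumption:
awk -F, '
BEGIN { CNTLM = split("10,20,30,40", LMT) }     # the actual column-4 values, not upper limits
{
    for (IX = 1; IX <= CNTLM; IX++)
        if ($4 == LMT[IX]) break                # exact match instead of <=
    CNT[$2, IX]++                               # same bucketed counting as before
    LNC[$2]++
}
END {
    for (l in LNC) {
        printf "%5d", l
        for (IX = 1; IX <= CNTLM; IX++) printf "%9d", CNT[l, IX] + 0
        printf " = %6d\n", LNC[l]
    }
}' file | sort -nr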

Related

Add Columns Values with Shell

I have an input file that looks like the one below.
pmx . pmnosysrelspeechneighbr -m 1 -r
INFO: The ROP files contain suspected faulty counter values.
They have been discarded but can be kept with pmr/pmx option "k" (pmrk/pmxk) or highlighted with pmx option "s" (pmxs)
Date: 2017-11-04
Object Counter 14:45 15:00 15:15 15:30
UtranCell=UE1069XA0 pmNoSysRelSpeechNeighbr 0 1 0 0
UtranCell=UE1069XA1 pmNoSysRelSpeechNeighbr 0 0 0 0
UtranCell=UE1069XA2 pmNoSysRelSpeechNeighbr 0 0 0 0
UtranCell=UE1069XA3 pmNoSysRelSpeechNeighbr 0 0 2 0
UtranCell=UE1069XB0 pmNoSysRelSpeechNeighbr 0 0 0 0
UtranCell=UE1069XB1 pmNoSysRelSpeechNeighbr 0 0 0 3
UtranCell=UE1069XB2 pmNoSysRelSpeechNeighbr 0 0 0 0
UtranCell=UE1069XB3 pmNoSysRelSpeechNeighbr 0 0 0 0
UtranCell=UE1069XC0 pmNoSysRelSpeechNeighbr 0 0 0 0
UtranCell=UE1069XC1 pmNoSysRelSpeechNeighbr 0 0 0 4
UtranCell=UE1069XC2 pmNoSysRelSpeechNeighbr 0 0 0 0
UtranCell=UE1069XC3 pmNoSysRelSpeechNeighbr 0 0 1 0
UtranCell=UE1164XA0 pmNoSysRelSpeechNeighbr 0 3 0 0
UtranCell=UE1164XA1 pmNoSysRelSpeechNeighbr 0 0 0 0
UtranCell=UE1164XA2 pmNoSysRelSpeechNeighbr 1 0 0 0
Now I want the output below, which is basically the sum of each time column ($3 to $6):
Counter 14:45 15:00 15:15 15:30
pmNoSysRelSpeechNeighbr 1 4 3 7
I've been trying the command below, but it just gives the sum of a single column:
pmx . pmnosysrelspeechneighbr -m 1 -r | grep -i ^Object | awk '{sum += $4} END {print $1, sum}'
Try this; it prints the header as well as a trailer line with the sum of each column:
BEGIN {
trail="pmNoSysRelSpeechNeighbr";
}
{
if($1=="Object") print $2 OFS $3 OFS $4 OFS $5 OFS $6;
else if($1 ~ /^UtranCell/) {
w+=$3; x+=$4; y+=$5; z+=$6;
}
}
END {
print trail OFS w OFS x OFS y OFS z;
}
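If you save that as, say, sumcols.awk (a file name picked here just for illustration), it runs straight off the pipeline:
pmx . pmnosysrelspeechneighbr -m 1 -r | awk -f sumcols.awk
For the sample input above this should print:
Counter 14:45 15:00 15:15 15:30
pmNoSysRelSpeechNeighbr 1 4 3 7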
You need to sum each of the columns separately:
awk -v g=pmNoSysRelSpeechNeighbr '$0 ~ g { for(i=3;i<=6;i++) sum[i]+=$i }
END { printf "%s", g; for(i=3;i<=6;i++) printf "%s%d", OFS, sum[i]; print "" }' file
but only for lines (records) containing the group (counter) of interest ($0 ~ "pmNoSysRelSpeechNeighbr").
Note that you (almost) never need to pipe grep's output to awk, because awk already supports extended-regular-expression filtering with /regex/ { action } or var ~ /regex/ { action }. One exception would be when you need PCRE (grep -P).
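For example, the grep stage of the command in the question could be folded into awk itself; a sketch (the [Oo] class stands in for grep's -i, which awk regexes have no flag for):
awk '/^[Oo]bject/ { print $2, $3, $4, $5, $6 }' file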
As an alternative to awk for simple "command-line statistical operations" on textual files, you could also use GNU datamash.
For example, to sum columns 3 to 6 while grouping by column 2:
grep 'UtranCell' file | datamash -W -g2 sum 3-6
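For the sample data this should print one tab-separated line per counter group, here just:
pmNoSysRelSpeechNeighbr 1 4 3 7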

Find header value for first occurrence of "1" in a column

I have an example matrix:
1 3 5 8 10 12
50 1 1 1 1 1 1
100 0 0 1 1 1 1
150 0 0 1 1 1 1
200 0 0 0 1 1 1
250 0 0 0 0 1 1
300 0 0 0 0 1 1
350 0 0 0 0 0 1
For each row name (50, 100, 150, 200, etc.) I want to know the "header" value at the point where "1" first occurs. Based on the example, the answer is:
50 1
100 5
150 5
200 8
250 10
300 10
350 12
I am not sure how to combine IFs and loops to get the answer from this format. Solutions in R, Excel, bash, or awk are all welcome.
You can do this using awk as follows:
$ awk 'FNR==1{for(i=1; i<=NF; i++){a[i]=$i}; next} {for(i=2; i<=NF; i++){if($i=="1"){print $1, a[i-1]; break}}} ' file
50 1
100 5
150 5
200 8
250 10
300 10
350 12
Explanation:
For the header, i.e. FNR==1, we populate all values into the array a.
For all subsequent lines we check which field equals 1; when found, we print the col1 value, i.e. $1, and the corresponding value in the array a, then break the loop. The lookup is a[i-1] because the header row has no row-name column, so data field i lines up with header field i-1.
Awk solution:
awk 'NR==1{ for(i=1;i<=NF;i++) h[i]=$i; next }   # store the header values
{
for(i=2;i<=NF;i++) { if($i==1) { n=h[i-1]; break } }
print $1,(n)?n:"None"; n=""                      # "None" when the row has no 1
}' file
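Equivalently, the i-1 arithmetic can be avoided by shifting the header into the data-field indices while reading it; a sketch of the same idea:
awk 'FNR==1 { for (i=1; i<=NF; i++) h[i+1] = $i; next }   # header value j stored at index j+1
{ for (i=2; i<=NF; i++) if ($i == 1) { print $1, h[i]; break } }' file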

Calculating the sum of every third column from many files

I have many files with three columns, of the form:
file1 | file2
1 0 1 | 1 0 2
2 3 3 | 2 3 7
3 6 2 | 3 6 0
4 1 0 | 4 1 3
5 2 4 | 5 2 1
The first two columns are the same in each file. I want to sum the third column across all files, to get something like this:
1 0 3
2 3 10
3 6 2
4 1 3
5 2 5
For two files, awk 'FNR==NR { _a[FNR]=$3;} NR!=FNR { $3 += _a[FNR]; print; }' file*
works perfectly (I found this solution via Google). How can I extend it to many files?
All you need is:
awk '{sum[FNR]+=$3} ARGIND==(ARGC-1){print $1, $2, sum[FNR]}' file*
The above uses GNU awk for ARGIND. With other awks, just add FNR==1{ARGIND++} at the start, as sketched below.
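For awks that lack ARGIND, a sketch of that portable variant (ARGIND is then just an ordinary variable that gets bumped on each file's first line):
awk 'FNR==1 { ARGIND++ }                             # emulate gawk ARGIND
     { sum[FNR] += $3 }                              # accumulate column 3 per row position
     ARGIND == (ARGC-1) { print $1, $2, sum[FNR] }   # print while reading the last file
' file*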
Since the first two columns are the same in each file:
awk 'NR==FNR{b[FNR]=$1 FS $2;}{a[FNR]+=$3}END{for(i=1;i<=length(a);i++){print b[i] FS a[i];}}' file*
Array a holds the cumulative sum of the 3rd column across all files.
Array b holds the 1st and 2nd column values.
At the end, we print the contents of arrays b and a.
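Note that length(array) is a GNU awk extension; a sketch of the same idea that avoids it (assuming, as stated, every file has the same number of rows):
awk 'NR==FNR { b[FNR] = $1 FS $2 }                       # first file: remember columns 1-2
     { a[FNR] += $3 }                                    # every file: accumulate column 3
     END { for (i=1; i<=FNR; i++) print b[i] FS a[i] }   # FNR in END = rows of the last file
' file*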
file1
$ cat f1
1 0 1
2 3 3
3 6 2
4 1 0
5 2 4
file2
$ cat f2
1 0 2
2 3 7
3 6 0
4 1 3
5 2 1
Output
$ awk -v start=3 'NF{for(i=1; i<=NF; i++)a[FNR, i] = i>=start ? a[FNR, i]+$i : $i }END{ for(j=1; j<=FNR; j++){ s = ""; for(i=1; i<=NF; i++){ s = (s ? s OFS:"")((j,i) in a ? a[j,i] : "") } print s } }' f1 f2
1 0 3
2 3 10
3 6 2
4 1 3
5 2 5
Better Readable
The variable start decides from which column summing begins: if you set it to 2, it sums column 2, column 3, and so on, from all files. Since the files have equal numbers of fields and rows, this works well.
awk -v start=3 '
NF{
for(i=1; i<=NF; i++)
a[FNR, i] = i>=start ? a[FNR, i]+$i : $i
}
END{
for(j=1; j<=FNR; j++)
{
s = "";
for(i=1; i<=NF; i++)
{
s = (s ? s OFS:"")((j,i) in a ? a[j,i] : "")
}
print s
}
}
' f1 f2

BASH - Summarising information from several fields in unique field using Loop and If statements

I have the following tab-separated file:
A1 A1 0 0 2 1 1 1 1 1 1 1 2 1 1 1
A2 A2 0 0 2 1 1 1 1 1 1 1 1 1 1 1
A3 A3 0 0 2 2 1 1 2 2 1 1 1 1 1 1
A5 A5 0 0 2 2 1 1 1 1 1 1 1 2 1 1
The idea is to summarise the information from column 7 (inclusive) to the end of each line in a new column appended at the end.
To do so, these are the rules:
If the total number of “2”s in the row (between column 7 and the end) is 0: add “1 1” to the new last column
If the total number of “2”s in the row (between column 7 and the end) is 1: add “1 2” to the new last column
If the total number of “2”s in the row (between column 7 and the end) is 2 or more: add “2 2” to the new last column
I started to extract the columns I want to work on using the command:
awk '{for (i = 7; i <= NF; i++) printf $i " "; print ""}' myfile.ped > tmp_myfile.txt
Then I count the number of occurrences in each row using:
sed 's/[^2]//g' tmp_myfile.txt | awk '{print NR, length}' > tmp_occurences.txt
Which outputs:
1 1
2 0
3 2
4 1
Then my idea was to write a loop that goes through the lines and adds the new summary column.
I was thinking of this kind of structure, based on what I found here: http://www.thegeekstuff.com/2010/06/bash-if-statement-examples:
while read line ;
do
set $line
If ["$2"==0]
then
$3=="1 1"
elif ["$2"==1 ]
then
$3=="1 2”
elif ["$2">=2 ]
then
$3==“2 2”
else
print ["error"]
fi
done < tmp_occurences.txt
But I am stuck here. Do I have to create the new column before starting the loop? Am I going in the right direction?
Ideally, the final output (after merging the first 6 columns from the initial file and the summary column) would be:
A1 A1 0 0 2 1 1 2
A2 A2 0 0 2 1 1 1
A3 A3 0 0 2 2 2 2
A5 A5 0 0 2 2 1 2
Thank you for your help!
Using gnu-awk you can do:
awk -v OFS='\t' '{
c=0;
for (i=7; i<=NF; i++)
if ($i==2)
c++
if (c==0)
s="1 1"
else if (c==1)
s="1 2"
else
s="2 2"
NF=6
print $0, s
}' file
A1 A1 0 0 2 1 1 2
A2 A2 0 0 2 1 1 1
A3 A3 0 0 2 2 2 2
A5 A5 0 0 2 2 1 2
PS: If not using gnu-awk you can use:
awk -v OFS='\t' '{c=0; for (i=7; i<=NF; i++) {if ($i==2) c++; $i=""} if (c==0) s="1 1"; else if (c==1) s="1 2"; else s="2 2"; NF=6; print $0, s}' file
With GNU awk for the 3rd arg to match():
$ awk '{match($0,/((\S+\s+){6})(.*)/,a); c=gsub(2,2,a[3]); print a[1] (c>1?2:1), (c>0?2:1)}' file
A1 A1 0 0 2 1 1 2
A2 A2 0 0 2 1 1 1
A3 A3 0 0 2 2 2 2
A5 A5 0 0 2 2 1 2
With other awks you'd replace \S/\s with [^[:space:]]/[[:space:]] and use substr() instead of a[].
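A sketch of that portable rewrite, using RSTART/RLENGTH and substr() in place of the array argument (assuming an awk whose regexes support interval expressions like {6}):
awk '{ match($0, /^([^[:space:]]+[[:space:]]+){6}/)   # span of the first 6 fields
       head = substr($0, 1, RLENGTH)                  # prefix to keep as-is
       tail = substr($0, RLENGTH + 1)                 # fields 7 to NF
       c = gsub(/2/, "2", tail)                       # gsub return value counts the 2s
       print head (c>1 ? 2 : 1), (c>0 ? 2 : 1)
}' file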
We can keep the format by using gensub() and capturing groups: we capture the first 6 fields and replace the line with them plus the calculated values:
awk '{for (i=7; i<=NF; i++) {
if ($i==2)
twos+=1 # count number of 2's from 7th to last field
}
f7=1; f8=1 # default values of the two new fields ("1 1" when there are no 2s)
if (twos)
f8=2 # set 8th = 2 if count is > 0
if (twos>1)
f7=2 # set 7th = 2 if count is > 1
$0=gensub(/^((\S+\s*){6}).*/,"\\1 " f7 FS f8, 1) # perform the replacement
twos=0 # reset counter
}1' file
As a one-liner:
$ awk '{for (i=7; i<=NF; i++) {if ($i==2) twos+=1} f7=1; f8=1; if (twos) f8=2; if (twos>1) f7=2; $0=gensub(/^((\S+\s*){6}).*/,"\\1 " f7 FS f8,1); twos=0}1' a
A1 A1 0 0 2 1 1 2
A2 A2 0 0 2 1 1 1
A3 A3 0 0 2 2 2 2
A5 A5 0 0 2 2 1 2
$ cat > test.awk
{
for(i=1;i<=NF;i++) { # for every field
if(i<7)
printf "%s%s", $i,OFS # only output the first 6
else a[$i]++ # count the values of the remaining fields
}
print (a[2]>1?"2 2":(a[2]==1?"1 2":"1 1")) # output logic
delete a # reset a for next record
}
$ awk -f test.awk test
A1 A1 0 0 2 1 1 2
A2 A2 0 0 2 1 1 1
A3 A3 0 0 2 2 2 2
A5 A5 0 0 2 2 1 2
Borrowing some ideas from @anubhava's solution above:
$ cat > another.awk
{
for(i=7;i<=NF;i++)
a[$i]++ # count 2s
NF=6 # truncate $0
print $0 OFS (a[2]<2?"1 "(a[2]?"2":"1"):"2 2") # 0 twos -> "1 1", one -> "1 2", two or more -> "2 2"
delete a # reset a for next record
}
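Run it the same way as the earlier script; for the sample input (saved as test, as above) it should again print:
$ awk -f another.awk test
A1 A1 0 0 2 1 1 2
A2 A2 0 0 2 1 1 1
A3 A3 0 0 2 2 2 2
A5 A5 0 0 2 2 1 2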

Insert new lines with missing values in an array

I have data like below:
2016-07-25:06 5
2016-07-25:07 1
2016-07-25:08 1
2016-07-25:09 2
2016-07-25:10 1
2016-07-25:11 1
2016-07-25:13 9
2016-07-25:14 1
In the output, every hour from 00 through 23 should be displayed, like below:
2016-07-25:00 0
2016-07-25:01 0
2016-07-25:02 0
2016-07-25:03 0
2016-07-25:04 0
2016-07-25:05 0
2016-07-25:06 5
2016-07-25:07 1
2016-07-25:08 1
2016-07-25:09 2
2016-07-25:10 1
2016-07-25:11 1
2016-07-25:12 0
2016-07-25:13 9
2016-07-25:14 1
2016-07-25:15 0
2016-07-25:16 0
2016-07-25:17 0
2016-07-25:18 0
2016-07-25:19 0
2016-07-25:20 0
2016-07-25:21 0
2016-07-25:22 0
2016-07-25:23 0
Could you please let me know how I can achieve this using awk?
Thank you!
Using awk you can do this by looping from the last printed hour up to the hour on each line, emitting zero rows to fill the gaps:
awk -F '[:[:blank:]]+' '{for (;i<$2; i++) printf "%s:%02d\t0\n", $1, i; print; i++; s=$1}
END{for (;i<24; i++) printf "%s:%02d\t0\n", s, i}' file
2016-07-25:00 0
2016-07-25:01 0
2016-07-25:02 0
2016-07-25:03 0
2016-07-25:04 0
2016-07-25:05 0
2016-07-25:06 5
2016-07-25:07 1
2016-07-25:08 1
2016-07-25:09 2
2016-07-25:10 1
2016-07-25:11 1
2016-07-25:12 0
2016-07-25:13 9
2016-07-25:14 1
2016-07-25:15 0
2016-07-25:16 0
2016-07-25:17 0
2016-07-25:18 0
2016-07-25:19 0
2016-07-25:20 0
2016-07-25:21 0
2016-07-25:22 0
2016-07-25:23 0
$ cat tst.awk
BEGIN { FS="[:[:space:]]+" }
function prt() {
if ( prev != "" ) {   # use prev, not $1: when prt() fires on a date change, $1 already holds the new date
for (i=0; i<=23; i++) {
printf "%s:%02d%s%d\n", prev, i, OFS, val[prev,i]
}
delete val
}
}
$1 != prev { prt() }
{ val[$1,$2+0]=$3; prev=$1 }
END { prt() }
$ awk -f tst.awk file
2016-07-25:00 0
2016-07-25:01 0
2016-07-25:02 0
2016-07-25:03 0
2016-07-25:04 0
2016-07-25:05 0
2016-07-25:06 5
2016-07-25:07 1
2016-07-25:08 1
2016-07-25:09 2
2016-07-25:10 1
2016-07-25:11 1
2016-07-25:12 0
2016-07-25:13 9
2016-07-25:14 1
2016-07-25:15 0
2016-07-25:16 0
2016-07-25:17 0
2016-07-25:18 0
2016-07-25:19 0
2016-07-25:20 0
2016-07-25:21 0
2016-07-25:22 0
2016-07-25:23 0
This uses more tools than just awk, but it might be helpful:
#!/bin/bash
date="2016-07-25" #or a method to get the date you are interested in
#Generate all the zero lines
remaining=$(for i in 0{0..9} {10..23}; do echo "$date:$i 0"; done | grep -v "$(awk '{print $1}' datafile)")
#Add the original data and sort the lines
echo -e "$remaining\n$(cat datafile)" | sort -n
