Min-Max Normalization using AWK - bash

I don't know why I am unable to loop through all the records. Currently it only processes the last record and prints the normalized values for it.
Normalization formula:
New_Value = (value - min[i]) / (max[i] - min[i])
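For example, for the first column of the dataset below, min = 1 and max = 5, so the value 4 maps to (4 - 1) / (5 - 1) = 0.75.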
Program
{
for(i = 1; i <= NF; i++)
{
if (min[i]==""){ min[i]=$i;} #initialise min
if (max[i]==""){ max[i]=$i;} #initialise max
if ($i<min[i]) { min[i]=$i;} #new min
if ($i>max[i]) { max[i]=$i;} #new max
}
}
END {
for(j = 1; j <= NF; j++)
{
normalized_value[j] = ($j - min[j])/(max[j] - min[j]);
print $j, normalized_value[j];
}
}
Dataset
4 14 24 34
3 13 23 33
1 11 21 31
2 12 22 32
5 15 25 35
Current Output
5 1
15 1
25 1
35 1
Required Output
0.75 0.75 0.75 0.75
0.50 0.50 0.50 0.50
0.00 0.00 0.00 0.00
0.25 0.25 0.25 0.25
1.00 1.00 1.00 1.00
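The reason only the last record shows up is that, in an END block, $j still refers to the fields of the last line that was read; and since min[] and max[] are only complete after the whole file has been seen, you either have to store all the rows (as in the second answer below) or read the file twice (as in the first). A tiny demonstration of the END behaviour (not part of the original script; with gawk/mawk):
printf '4 14 24 34\n3 13 23 33\n' | awk 'END { print "in END, $1 is", $1, "and NF is", NF }'
# prints: in END, $1 is 3 and NF is 4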

I would process the file twice, once to determine the minima/maxima, once to calculate the normalized values:
awk '
NR==1 {
    for (i=1; i<=NF; i++) {
        min[i]=$i
        max[i]=$i
    }
    next
}
NR==FNR {
    for (i=1; i<=NF; i++) {
        if ($i < min[i]) {min[i]=$i}
        else if ($i > max[i]) {max[i]=$i}
    }
    next
}
{
    for (i=1; i<=NF; i++) printf "%.2f%s", ($i-min[i])/(max[i]-min[i]), FS
    print ""
}
' file file
# ^^^^ ^^^^ same file twice!
outputs
0.75 0.75 0.75 0.75
0.50 0.50 0.50 0.50
0.00 0.00 0.00 0.00
0.25 0.25 0.25 0.25
1.00 1.00 1.00 1.00
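In case the NR==FNR idiom is unfamiliar: FNR restarts at 1 for each input file while NR keeps counting across files, so NR==FNR is true only while the first copy of file is being read. A quick way to see it (a throwaway demo, not part of the solution):
awk 'NR==FNR { n++; next } FNR==1 { print "first pass counted " n " lines" } { print "second pass, line " FNR ": " $0 }' file file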

The answer above reads the same file twice; this can be avoided with the following modified script:
# initialise min, max and the value array used later
NR == 1 {
    for (i=1; i<=NF; i++) {
        value[i] = $i
        min[i] = $i
        max[i] = $i
    }
}
# store every value and track min and max for each column
NR > 1 {
    for (i=1; i<=NF; i++) {
        value[((NR-1)*NF)+i] = $i
        if ($i < min[i]) {min[i] = $i}
        else if ($i > max[i]) {max[i] = $i}
    }
}
END {
    ncols = NF
    nrows = NR
    for (i=0; i<nrows; i++) {
        for (j=1; j<ncols; j++) {
            printf "%.2f%s", (value[(i*ncols)+j]-min[j])/(max[j]-min[j]), OFS
        }
        printf "%.2f\n", (value[(i*ncols)+j]-min[j])/(max[j]-min[j])
    }
}
Save the above awk script as norm.awk. You can run this from shell (and redirect if needed) as:
awk -f norm.awk data.txt > norm_output.txt
or you can run this norm.awk script from vim itself as:
:%!awk -f norm.awk
which will replace the existing values with the min-max normalized values.

Related

How to normalize the values of specific columns of a csv with awk?

I have a csv with several variables and I would like to normalize only some specific columns using the standard deviation: the value minus the mean of the variable, divided by the standard deviation of the variable.
The file is comma separated, and the transformation needs to be done only with awk, for the variables months_loan_duration and amount.
The input would look like this but with a thousand rows:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,6,critical,radio/tv,1169.53
1 - 200 DM,48,repaid,radio/tv,5951.78
,12,critical,education,2096.23
And the output would be like this:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-1.236,critical,radio/tv,-0.745
1 - 200 DM,2.248,repaid,radio/tv,0.95
,-0.738,critical,education,-0.417
So far I have tried the following unsuccessfully:
#! /usr/bin/awk -f
BEGIN{FS=","; OFS=",";numberColumn=NF}
NR!=1
{
for(i=1;i <= numberColumn;i++)
{
total[i]+=$i;
totalSquared[i]+=$i^2;
}
for (i=1;i <= numberColumn;i++)
{
avg[i]=total[i]/(NR-1);
std[i]=sqrt((totalSquared[i]/(NR-1))-avg[i]^2);
}
for (i=1;i <= numberColumn;i++)
{
norm[i]=(($i-avg[i])/std[i])
}
}
{
print $1,$norm[2],3,4,$norm[5]
}
It will be easier to read the file twice:
awk -F, -v OFS=, '
NR==FNR { # 1st pass: accumulate values
if (FNR > 1) {
sx2 += $2 # sum of col2
sxx2 += $2 * $2 # sum of col2^2
sx5 += $5 # sum of col5
sxx5 += $5 * $5 # sum of col5^2
n++ # count of samples
}
next
}
FNR==1 { # 2nd pass, 1st line: calc means and stdevs
ave2 = sx2 / n # mean of col2
var2 = sxx2 / (n - 1) - ave2 * ave2 * n / (n - 1)
if (var2 < 0) var2 = 0 # avoid rounding error
sd2 = sqrt(var2) # stdev of col2
ave5 = sx5 / n
var5 = sxx5 / (n - 1) - ave5 * ave5 * n / (n - 1)
if (var5 < 0) var5 = 0
sd5 = sqrt(var5)
print # print the header line
}
FNR>1 {
if (sd2 > 0) $2 = ($2 - ave2) / sd2
if (sd5 > 0) $5 = ($5 - ave5) / sd5
print
}
' input_file.csv input_file.csv
Output:
checking_balance,months_loan_duration,credit_history,purpose,amount
< 0 DM,-0.704361,critical,radio/tv,-0.750328
1 - 200 DM,1.14459,repaid,radio/tv,1.13527
,-0.440225,critical,education,-0.384939
Please note the calculated values differ from your expected result (presumably because your expected values were computed from the full thousand-row file, while only the three sample rows are available here).
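For reference, the variance expressions in the "2nd pass, 1st line" block use the one-pass identity for the sample variance, so only the running sums Σx and Σx² are needed:
var = ( Σx² − n·mean² ) / (n − 1) = Σx²/(n−1) − mean²·n/(n−1)
which is exactly the sxx2 / (n - 1) - ave2 * ave2 * n / (n - 1) form in the code; the if (var2 < 0) var2 = 0 guard only protects against tiny negative results caused by floating-point rounding.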
Thousands of rows isn't all that big a file for awk: you might as well load it all in at once. Here I created a 23.6 million-row synthetic version of it (tested on both gawk and mawk).
While overall performance is similar to the other solutions, this code avoids having to list the input file twice to get the equivalent of 2-pass processing.
INPUT
rows = 23,622,127. | UTF8 chars = 799192890. | bytes = 799192890.
1 checking_balance,months_loan_duration,credit_history,purpose,amount
2 < 0 DM,889,critical,luna,758.61
3 ,150,critical,terra,1823.93
4 1 - 200 DM,883,repaid,stablecoin,2525.55
5 1 - 200 DM,65,repaid,terra,2405.67
6 < 0 DM,9,critical,luna,4059.34
7 < 0 DM,201,critical,stablecoin,5043
8 1 - 200 DM,549,repaid,terra,471.92
9 < 0 DM,853,critical,stablecoin,422.78
10 < 0 DM,659,critical,luna,684.94
CODE
# gawk profile, created Tue May 24 04:11:02 2022
'function abs(_) {
return \
+_<-_?-_:_
} BEGIN {
split(_____=(_=length(FS = RS = "^$"))+_,____,"")
}
END {
gsub("\n", ",&")
FS = "["(OFS= ",")"]"
$!_ = $!( __ = _)
__+= --NF
while ((_____+_) < (__-=_)) {
____[___=_____] += ($__)^_
____[ --___ ] += ($__)
____[___ * _] += -_^!_
____[___-=+_] += ($(__-=_+_^!_))
____[ ++___ ] += ($__)^_
}
___ = (__=-____[_+_+_])-_^!_
RS = -(abs((____[(_)]/___-(((NR=____[+_^!+_]/__)^_)*__/___)))^_^(_/-_)
___ = -(abs((____[_+_]/___-(((RT=____[_+_^!_]/__)^_)*__/___)))^_^(_/-_)
ORS = "\n"
gsub(ORS, "")
OFS = ","
print $(_^=_<_), $(__=++_), $++_, $++_, $++_
OFMT = "%."(__*__+!(__=NF-__-__))"f"
while (++_ <= __) {
print $_, (NR-$++_)/RS, $++_, $++_, (RT-$++_)/___
}
}
}'
OUTPUT
out9: 837MiB 0:00:28 [29.2MiB/s] [29.2MiB/s] [ <=> ]
in0: 762MiB 0:00:00 [2.95GiB/s] [2.95GiB/s] [======>] 100%
( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )
26.98s user 1.58s system 99% cpu 28.681 total
23622127 878032266 878032266 testfile_stdnorm_test_004.txt_out.txt
1 checking_balance,months_loan_duration,credit_history,purpose,amount
2 < 0 DM,1.2000,critical,luna,-1.2939
3 ,-1.2949,critical,terra,-0.6788
4 1 - 200 DM,1.1798,repaid,stablecoin,-0.2737
5 1 - 200 DM,-1.5818,repaid,terra,-0.3429
6 < 0 DM,-1.7709,critical,luna,0.6119
7 < 0 DM,-1.1227,critical,stablecoin,1.1798
8 1 - 200 DM,0.0522,repaid,terra,-1.4594
9 < 0 DM,1.0785,critical,stablecoin,-1.4878
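If the condensed style above is hard to follow, here is my rough reading of what it does, rewritten as a conventional sketch (my interpretation, not the author's code; it uses the sample standard deviation and prints four decimals): slurp the rows, accumulate Σx and Σx² for columns 2 and 5, then print every row with those two columns standardized.
awk -F, -v OFS=, '
NR == 1 { header = $0; next }
{
    line[NR] = $0                  # keep the row for the output pass
    s2 += $2; ss2 += $2 * $2       # running sums for column 2
    s5 += $5; ss5 += $5 * $5       # running sums for column 5
    n++
}
END {
    m2 = s2 / n; sd2 = sqrt((ss2 - n * m2 * m2) / (n - 1))
    m5 = s5 / n; sd5 = sqrt((ss5 - n * m5 * m5) / (n - 1))
    print header
    for (i = 2; i <= NR; i++) {
        $0 = line[i]
        $2 = sprintf("%.4f", ($2 - m2) / sd2)
        $5 = sprintf("%.4f", ($5 - m5) / sd5)
        print
    }
}' input_file.csv
Like the original, this holds the whole data set in memory, which is the "load it all in at once" trade-off that lets the file be read only once.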
ALTERNATE SOLUTION OPTIMIZED FOR SMALLER INPUTS (e.g. up to 10^6 (1 mn) rows)
# gawk profile, created Tue May 24 06:19:24 2022
# BEGIN rule(s)
BEGIN {
__ = (FS = RS = "^$") * (ORS = "")
}
# END rule(s)
END {
_ = $__
gsub("[\n][,]","\n_,",_)
sub("^.+amount\n","",_)+gsub("[,][0-9.+-]+[,\n]", "\3&\1", _)
_____ = "[^0-9.+-]+"
gsub("^" (_____) "|\1[^\1\3]+\3","",_)
_____ = __ = split(_,___,_____)
while (-(--__) < +__) {
___["_"] += _=___[(__)]
___["="] += _*_
___["~"] += _=___[--__]
___["^"] += _*_
___[":"]++
}
_ = (__=___[":"])-(____ ^= _<_)
++____
___["}"] = -(abs((___["^"]/_)-(((___["{"] = ___["~"] / __)^____)*__/_)))^____^(-(_^(!_)))
___[")"] = -(abs((___["="]/_)-(((___["("] = ___["_"] / __)^____)*__/_)))^____^(-(_^(!_)))
if (_ < _) {
for (_ in ___) {
print "debug", _, ___[_]
}
}
____ = split($(_ < _), ______, ORS = "\n")
_ = index(FS = "[" (OFS = ",") "]", OFS)
print ______[_ ^ (! _)]
for (__ += __ ^= _ < _; __ < ____; __++) {
print sprintf("%.*s%s,%+.*f,%s,%s,%+.*f", ! __, $! _ = ______[__], $(_ ~ _), _ + _, (___["{"] - $_) / ___["}"], $++_, $(--_ + _), _ + _, (___["("] - $NF) / ___[")"])
}
}
# Functions, listed alphabetically
function abs(_)
{
return (+_ < -_ ? -_ : _)
}
PERFORMANCE OF SOLUTION # 2 : End-to-End 2.57 secs for 2^20 rows
rows = 1048575. | UTF8 chars = 39912117. | bytes = 39912117.
( pvE 0.1 in0 < "${f}" | LC_ALL=C mawk2 ; )
2.46s user 0.13s system 100% cpu 2.573 total

Passing for loop using non-integers to awk

I am trying to write code which will achieve:
Where $7 is less than $i (0 - 1 in increments of 0.05), print the line and pass to word count. The way I tried to do this was:
for i in $(seq 0 0.05 1); do awk '{if ($7 <= $i) print $0}' file.txt | wc -l ; done
This just ends up returning the word count of the full file (~40 million lines) for each instance of $i. When, for example using $7 <= 0.00, it should be returning ~67K.
I feel like there may be a way to do this within awk, but I have not seen any suggestions which allow for non-integers.
Thanks in advance.
Pass $i to awk as a variable with -v, like so:
for i in $(seq 0 0.05 1); do awk -v i=$i '{if ($7 <= i) print $0}' file.txt | wc -l ; done
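A small aside (my addition, not part of the original answer): it is safer to quote the shell variable, and the if/print can shrink to a bare pattern, since a pattern with no action prints matching lines:
for i in $(seq 0 0.05 1); do awk -v i="$i" '$7 <= i' file.txt | wc -l; done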
Some made up data:
$ cat file.txt
1 2 3 4 5 6 7 a b c d e f
1 2 3 4 5 6 0.6 a b c
1 2 3 4 5 6 0.57 a b c d e f g h i j
1 2 3 4 5 6 1 a b c d e f g
1 2 3 4 5 6 0.21 a b
1 2 3 4 5 6 0.02 x y z
1 2 3 4 5 6 0.00 x y z l j k
One possible 100% awk solution:
awk '
BEGIN { line_count=0 }
{ printf "================= %s\n",$0
for (i=0; i<=20; i++)
{ if ($7 <= i/20)
{ printf "matching seq : %1.2f\n",i/20
line_count++
seq_count[i]++
next
}
}
}
END { printf "=================\n\n"
for (i=0; i<=20; i++)
{ if (seq_count[i] > 0)
{ printf "seq = %1.2f : %8s (count)\n",i/20,seq_count[i] }
}
printf "\nseq = all : %8s (count)\n",line_count
}
' file.txt
# the output:
================= 1 2 3 4 5 6 7 a b c d e f
================= 1 2 3 4 5 6 0.6 a b c
matching seq : 0.60
================= 1 2 3 4 5 6 0.57 a b c d e f g h i j
matching seq : 0.60
================= 1 2 3 4 5 6 1 a b c d e f g
matching seq : 1.00
================= 1 2 3 4 5 6 0.21 a b
matching seq : 0.25
================= 1 2 3 4 5 6 0.02 x y z
matching seq : 0.05
================= 1 2 3 4 5 6 0.00 x y z l j k
matching seq : 0.00
=================
seq = 0.00 : 1 (count)
seq = 0.05 : 1 (count)
seq = 0.25 : 1 (count)
seq = 0.60 : 2 (count)
seq = 1.00 : 1 (count)
seq = all : 6 (count)
BEGIN { line_count=0 } : initialize a total line counter
print statement is merely for debug purposes; will print out every line from file.txt as it's processed
for (i=0; i<=20; i++) : depending on implementation, some versions of awk may have rounding/accuracy problems with non-integer numbers in sequences (eg, increment by 0.05), so we'll use whole integers for our sequence, and divide by 20 (for this particular case) to provide us with our 0.05 increments during follow-on testing (a short demonstration of this follows the list)
$7 <= i/20 : if field #7 is less than or equal to (i/20) ...
printf "matching seq ... : print the sequence value we just matched on (i/20)
line_count++ : add '1' to our total line counter
seq_count[i]++ : add '1' to our sequence counter array
next : break out of our sequence loop (since we found our matching sequence value (i/20)) and process the next line in the file
END ... : print out our line counts
for (i=0; ...) / if / printf : loop through our array of sequences, printing the line count for each sequence (i/20)
printf "\nseq = all... : print out our total line count
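To make the rounding concern above concrete, here is a tiny check (a demo only, not part of the solution) showing that repeatedly adding 0.05 may not land exactly where you expect, while dividing an integer counter by 20 stays consistent:
awk 'BEGIN {
    for (i = 1; i <= 20; i++) x += 0.05             # repeated float addition
    printf "accumulated 20 x 0.05 -> %.17g\n", x    # may not be exactly 1
    printf "20 / 20              -> %.17g\n", 20 / 20
}'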
NOTE: Some of the awk code can be further reduced but I'll leave this as is since it's a little easier to understand if you're new to awk.
One (obvious?) benefit of a 100% awk solution is that the sequence/looping construct is internal to awk, which lets us limit ourselves to a single pass through the input file (file.txt); when the sequence/looping construct is outside of awk, we end up processing the input file once for each pass through the sequence/loop (for this exercise that would mean processing the input file 21 times!).
Using a bit of guesswork as to what you actually want to accomplish, I came up with this:
awk '{ for (i=20; 20*$7<=i && i>0; i--) bucket[i]++ }
END { for (i=1; i<=20; i++) print bucket[i] " lines where $7 <= " i/20 }'
With the mock data from mark's second answer I get this output:
2 lines where $7 <= 0.05
2 lines where $7 <= 0.1
2 lines where $7 <= 0.15
2 lines where $7 <= 0.2
3 lines where $7 <= 0.25
3 lines where $7 <= 0.3
3 lines where $7 <= 0.35
3 lines where $7 <= 0.4
3 lines where $7 <= 0.45
3 lines where $7 <= 0.5
3 lines where $7 <= 0.55
5 lines where $7 <= 0.6
5 lines where $7 <= 0.65
5 lines where $7 <= 0.7
5 lines where $7 <= 0.75
5 lines where $7 <= 0.8
5 lines where $7 <= 0.85
5 lines where $7 <= 0.9
5 lines where $7 <= 0.95
6 lines where $7 <= 1

Optimize AWK script for large dataset

For the following input data,
Chr C rsid D A1 A2 ID1_AA ID1_AB ID1_BB ID2_AA ID2_AB ID2_BB ID3_AA ID3_AB ID3_BB ID4_AA ID4_AB ID4_BB ID5_AA ID5_AB ID5_BB
10 p rsid1 q A G 0.00 0.85 0.15 0.70 0.10 0.20 0.40 0.50 0.10 0.30 0.30 0.40 0.10 0.20 0.80
10 p rsid2 q C T 0.90 0.10 0.00 0.80 0.10 0.10 0.70 0.10 0.20 0.30 0.40 0.30 0.30 0.20 0.40
10 p rsid3 q A G 0.40 0.50 0.10 0.80 0.20 0.00 0.20 0.30 0.50 0.50 0.30 0.20 0.20 0.30 0.40
I need to generate the following output data.
rsid ID1 ID2 ID3 ID4 ID5
rsid1 2.15 1.50 1.70 2.10 2.90
rsid2 1.10 1.30 1.50 2.00 1.90
rsid3 1.70 1.20 2.30 1.70 2.00
The table shows, for every ID (ID1, ID2, ID3, etc.), the sum of the 3 columns (_AA, _AB & _BB) after multiplying them by the constant factors 1, 2 and 3 respectively.
Example: for rsID1 --> ID1 -> (ID1_AA*1 + ID1_AB*2 + ID1_BB*3) = (0.00*1 + 0.85*2 + 0.15*3) = 2.15
I wrote the following AWK script to accomplish the task, and it works absolutely fine.
Please note: I'm a complete beginner in AWK.
awk '{
if(NR <= 1) { # header line
str = $3;
for(i=7; i<=NF; i+=3) {
split($i,s,"_");
str = str"\t"s[1]
}
print str
} else { # data line
k = 0;
for(i=7; i<=NF; i+=3)
arr[k++] = $i*1 + $(i+1)*2 + $(i+2)*3;
str=$3;
for(i=0; i<=(NF-6)/3; i++)
str = str"\t"arr[i];
print str
}
}' input.txt > out.txt
Later I was told the input data can be as big as 60 million rows and 300 thousand columns, which means the output will be 60M x 100K. If I'm not wrong, AWK reads one line at a time, and hence at any instant there will be 300K columns of data held in memory. Is that a problem? Given the situation, how can I improve my code?
Both approaches have pros/cons, and both can handle any number of rows/columns since they only store one row at a time in memory. Still, I'd use this approach rather than the answer posted by Akshay: since you have 300,000 columns per line, his approach would require you to test NR==1 almost 100,000 times per line, whereas the approach below performs that test once per line, so it should be noticeably more efficient:
$ cat tst.awk
BEGIN { OFS="\t" }
{
printf "%s", $3
if (NR==1) {
gsub(/_[^[:space:]]+/,"")
for (i=7; i<=NF; i+=3) {
printf "%s%s", OFS, $i
}
}
else {
for (i=7; i<=NF; i+=3) {
printf "%s%.2f", OFS, $i + $(i+1)*2 + $(i+2)*3
}
}
print ""
}
$ awk -f tst.awk file
rsid ID1 ID2 ID3 ID4 ID5
rsid1 2.15 1.50 1.70 2.10 2.90
rsid2 1.10 1.30 1.50 2.00 1.90
rsid3 1.70 1.20 2.30 1.70 2.00
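If you want to verify the efficiency difference on your own data, one simple check (the file names here are hypothetical; it assumes you save the two scripts as tst.awk and akshay.awk) is to time them both:
time awk -f tst.awk file > /dev/null
time awk -f akshay.awk file > /dev/null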
I highly recommend you read the book Effective Awk Programming, 4th Edition, by Arnold Robbins to learn what awk is and how to use it.
awk -v OFS="\t" '
{
printf("%s",$3);
for(i=7;i<=NF; i+=3)
{
if(FNR==1)
{
sub(/_.*/,"",$i)
f = $i
}else
{
f = sprintf("%5.2f",$i*1 + $(i+1)*2 + $(i+2)*3)
}
printf("%s%s",OFS,f)
}
print ""
}
' file
Output
rsid ID1 ID2 ID3 ID4 ID5
rsid1 2.15 1.50 1.70 2.10 2.90
rsid2 1.10 1.30 1.50 2.00 1.90
rsid3 1.70 1.20 2.30 1.70 2.00
Do you think it would be worth using a low-level language like C?
C or C++ is not automagically faster than awk; the code is also less readable and more fragile.
I'll show another solution using C++, for comparison:
//p.cpp
#include <stdio.h>
//to modify this value
#define COLUMNS 5
int main() {
char column3[256];
bool header=true;
while (scanf("%*s\t%*s\t%255s\t%*s\t%*s\t%*s\t", column3) == 1) {
printf("%s", column3);
if(header){
header=false;
char name[256];
for(int i=0; i<COLUMNS; ++i){
scanf("%[^_]_%*s\t%*s\t%*s\t", name);
printf("\t%s", name);
}
}else{
float nums[3];
for(int i=0; i<COLUMNS; ++i){
scanf("%f %f %f", nums, nums + 1, nums + 2);
float sum = nums[0]+nums[1]*2+nums[2]*3;
printf("\t%2.2f", sum);
}
}
printf("\n");
}
}
Compile and run it like:
g++ p.cpp -o p
cat file | ./p
Benchmark
with 1 million lines of input and 300 columns
Ed Morton solution: 2m 34s
c++: 1m 19s

bash group times and average + sum columns

I have a daily output file on a Linux system, like the one below, and was wondering: is there a way to group the data into 30-minute increments based on $1, averaging $3 and summing $4 $5 $6 $7 $8, via a shell script using awk/gawk or something similar?
04:04:13 04:10:13 2.13 36 27 18 18 0
04:09:13 04:15:13 2.37 47 38 13 34 0
04:14:13 04:20:13 2.19 57 37 23 33 1
04:19:13 04:25:13 2.43 43 35 13 30 0
04:24:13 04:30:13 2.29 48 40 19 28 1
04:29:13 04:35:13 2.33 56 42 16 40 0
04:34:13 04:40:13 2.21 62 47 30 32 0
04:39:13 04:45:13 2.25 44 41 19 25 0
04:44:13 04:50:13 2.20 65 50 32 33 0
04:49:13 04:55:13 2.47 52 38 16 36 0
04:54:13 05:00:13 2.07 72 54 40 32 0
04:59:13 05:05:13 2.35 53 41 19 34 0
so basically this hour of data would result in something like this:
04:04:13-04:29:13 2.29 287 219 102 183 2
04:34:13-04:59:13 2.25 348 271 156 192 0
This is what I have gotten so far, using awk to search between the time frames, but I think there is an easier way to get the grouping done without running awk on each 30-minute interval:
awk '$1>=from&&$1<=to' from="04:00:00" to="04:30:00" | awk '{ total += $3; count++ } END { print total/count }'|awk '{printf "%0.2f\n", $1}'
awk '$1>=from&&$1<=to' from="04:00:00" to="04:30:00" | awk '{ sum+=$4} END {print sum}'
This should do what you want:
{
split($1, times, ":");
i = (2 * times[1]);
if (times[2] >= 30) i++;
if (!start[i] || $1 < start[i]) start[i] = $1;
if (!end[i] || $1 > end[i]) end[i] = $1;
count[i]++;
for (col = 3; col <= 8; col++) {
data[i, col] += $col;
}
}
END {
for (i = 0; i < 48; i++) {
if (start[i]) {
data[i, 3] = data[i, 3] / count[i];
printf("%s-%s %.2f", start[i], end[i], data[i, 3]);
for (col = 4; col <= 8; col++) {
printf(" " data[i, col]);
}
print "";
}
}
}
As you can see, I divide the day into 48 half-hour intervals and place the data into one of these bins depending on the time in the first column. After the input has been exhausted, I print out all bins that are not empty.
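For example, with the sample data: 04:14:13 has hour 4 and minute 14, so i = 2*4 = 8; 04:34:13 has minute 34 >= 30, so i = 2*4 + 1 = 9, which is why the input above collapses into exactly the two bins shown in the desired output.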
Personally, I would do this in Python or Perl. In awk, the arrays are not ordered (well, in gawk you could use asorti to sort the array...), which makes printing ordered buckets more work.
Here is the outline:
Read input
Convert the time stamp to seconds
Add to an ordered (or sortable) associative array of the data elements in buckets of the desired time frame (or, just keep running totals).
After the data is read, process as you wish.
Here is a Python version of that:
#!/usr/bin/python
from collections import OrderedDict
import fileinput
times=[]
interval=30*60
od=OrderedDict()
for line in fileinput.input():
li=line.split()
secs=sum(x*y for x,y in zip([3600,60,1], map(int, li[0].split(":"))))
times.append([secs, [li[0], float(li[2])]+map(int, li[3:])])
current=times[0][0]
for t, li in times:
if t-current<interval:
od.setdefault(current, []).append(li)
else:
current=t
od.setdefault(current, []).append(li)
for s, LoL in od.items():
avg=sum(e[1] for e in LoL)/len(LoL)
sums=[sum(e[i] for e in LoL) for i in range(2,7)]
print "{}-{} {:.3} {}".format(LoL[0][0], LoL[-1][0], avg, ' '.join(map(str, sums)))
Running that on your example data:
$ ./ts.py ts.txt
04:04:13-04:29:13 2.29 287 219 102 183 2
04:34:13-04:59:13 2.26 348 271 156 192 0
The advantage is you can easily change the interval, and a similar technique can use timestamps that span more than a day.
If you really want awk you could do:
awk 'BEGIN{ interval=30*60 }
function fmt(){
line=sprintf("%s-%s %.2f %i %i %i %i %i", ls, $1, sums[3]/count,
sums[4], sums[5], sums[6], sums[7], sums[8])
}
{
split($1,a,":")
secs=a[1]*3600+a[2]*60+a[3]
if (NR==1) {
low=secs
ls=$1
count=0
for (i=3; i<=8; i++)
sums[i]=0
}
for (i=3; i<=8; i++){
sums[i]+=$i
}
count++
if (secs-low<interval) {
fmt()
}
else {
print line
low=secs
ls=$1
count=1
for (i=3; i<=8; i++)
sums[i]=$i
}
}
END{
fmt()
print line
}' file
04:04:13-04:29:13 2.29 287 219 102 183 2
04:34:13-04:59:13 2.26 348 271 156 192 0

Calculating sum of gradients with awk

I have a file that contains 4 columns such as:
A B C D
1 2 3 4
10 20 30 40
100 200 300 400
.
.
.
I can calculate the gradient of columns B to D versus A with commands like the following:
awk 'NR>1{print $0,($2-b)/($1-a)}{a=$1;b=$2}' file
How can I print sum of gradients as the 5th column in the file? The results should be:
A B C D sum
1 2 3 4 1+2+3+4=10
10 20 30 40 (20-2)/(10-1)+(30-3)/(10-1)+(40-4)/(10-1)=9
100 200 300 400 (200-20)/(100-10)+(300-30)/(100-10)+(400-40)/(100-10)=9
.
.
.
awk 'NR == 1 { print $0, "sum"; next } { if (NR == 2) { sum = $1 + $2 + $3 + $4 } else { t = $1 - a; sum = ($2 - b) / t + ($3 - c) / t + ($4 - d) / t } print $0, sum; a = $1; b = $2; c = $3; d = $4 }' file
Output:
A B C D sum
1 2 3 4 10
10 20 30 40 9
100 200 300 400 9
With ... | column -t:
A B C D sum
1 2 3 4 10
10 20 30 40 9
100 200 300 400 9
Update:
#!/usr/bin/awk -f
NR == 1 {
print $0, "sum"
next
}
{
sum = 0
if (NR == 2) {
for (i = 1; i <= NF; ++i)
sum += $i
} else {
t = $1 - a[1]
for (i = 2; i <= NF; ++i)
sum += ($i - a[i]) / t
}
print $0, sum
for (i = 1; i <= NF; ++i)
a[i] = $i
}
Usage:
awk -f script.awk file
If you apply the same logic to the first line of numbers as you do to the rest, taking the initial value of each column as 0, you get 9 as the result of the sum (as it was in your question originally). This approach uses a loop to accumulate the sum of the gradient from the second field up to the last one. It uses the fact that on the first time round, the uninitialised values in the array a evaluate to 0:
awk 'NR==1 { print $0, "sum"; next }
{
s = 0
for(i=2;i<=NF;++i) s += ($i-a[i])/($1-a[1]) # accumulate sum
for(i=1;i<=NF;++i) a[i] = $i # fill array to be used for next iteration
print $0, s
}' file
You can pack it all onto one line if you want, but remember to separate the statements with semicolons. It's also slightly shorter to use a single for loop with an if (saving the denominator first, so that a[1] isn't overwritten before it's used):
awk 'NR==1{print$0,"sum";next}{s=0;d=$1-a[1];for(i=1;i<=NF;++i){if(i>1)s+=($i-a[i])/d;a[i]=$i}print$0,s}' file
Output:
A B C D sum
1 2 3 4 9
10 20 30 40 9
100 200 300 400 9
