Finding Contiguous Ranges - bash

I would like to find the contiguous ranges in a set of dates (one per day).
Given the following sample:
2016-01-01
2016-01-02
2016-01-03
2016-01-04
2016-01-05
2016-01-06
2016-01-08
2016-01-09
2016-01-10
2016-01-11
2016-01-12
2016-01-15
2016-01-16
2016-01-17
2016-01-20
2016-01-21
2016-01-30
2016-01-31
2016-02-01
I expect the following result:
2016-01-01-2016-01-06
2016-01-08-2016-01-12
2016-01-15-2016-01-17
2016-01-20-2016-01-21
2016-01-30-2016-01-31
2016-02-01-2016-02-01
I have already come across this question, which is almost the opposite of what I want, but with integers.
I have formulated the following, which works with integers:
awk 'NR==1 {l=$1; n=$1} {if ($1==n){n=$1+1} else{print l"-"n-1; l=$1 ;n=$1+1} } END {print l"-"$1}' file.txt
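For reference, here is that integer version in action (nums.txt is a hypothetical sample file):
$ printf '1\n2\n3\n7\n8\n' > nums.txt
$ awk 'NR==1 {l=$1; n=$1} {if ($1==n){n=$1+1} else{print l"-"n-1; l=$1 ;n=$1+1} } END {print l"-"$1}' nums.txt
1-3
7-8
The same arithmetic does not carry over to dates, since 2016-01-31 + 1 is not 2016-02-01.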

With GNU awk for mktime():
$ cat tst.awk
BEGIN { FS=OFS="-" }
{ currSecs = mktime( $1" "$2" "$3" 0 0 0" ) }
(currSecs - prevSecs) > (24*60*60) {
    if (NR>1) {
        print startDate, prevDate
    }
    startDate = $0
}
{ prevSecs = currSecs; prevDate = $0 }
END { print startDate, prevDate }
$ awk -f tst.awk file
2016-01-01-2016-01-06
2016-01-08-2016-01-12
2016-01-15-2016-01-17
2016-01-20-2016-01-21
2016-01-30-2016-02-01
With any awk, if you are happy for ranges to restart when the month changes (as is apparent in your expected output and the comment under your question):
$ cat tst.awk
BEGIN { FS=OFS="-" }
{ currYrMth = $1 FS $2; currDay = $3 }
(currYrMth != prevYrMth) || ((currDay - prevDay) > 1) {
    if (NR>1) {
        print startDate, prevDate
    }
    startDate = $0
}
{ prevYrMth = currYrMth; prevDay = currDay; prevDate = $0 }
END { print startDate, prevDate }
$ awk -f tst.awk file
2016-01-01-2016-01-06
2016-01-08-2016-01-12
2016-01-15-2016-01-17
2016-01-20-2016-01-21
2016-01-30-2016-01-31
2016-02-01-2016-02-01

If you have GNU Awk, you can use its time functions.
gawk -F - 'NR==1 || $1 "-" $2 "-" $3 != following {
    if (following != "") print start "-" latest;
    start = $1 "-" $2 "-" $3
    this = mktime($1 " " $2 " " $3 " 0 0 0")
}
{
    this += 24*60*60
    following = strftime("%F", this)
    latest = $1 "-" $2 "-" $3
}
END { if (start != latest) print start "-" latest }' filename
Unit ranges will print like "2016-04-15-2016-04-15", which is a bit of a wart, but easy to fix if you need to. Also, the END block has a bug in this case (it silently drops a final single-date range), but again, this should at least get you started.
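A possible fix for both issues (an untested sketch along the same lines): print a bare date for single-day ranges, and flush the final group unconditionally in END.
gawk -F - 'NR==1 || $1 "-" $2 "-" $3 != following {
    if (following != "") print (start == latest ? start : start "-" latest)
    start = $1 "-" $2 "-" $3
    this = mktime($1 " " $2 " " $3 " 0 0 0")
}
{
    this += 24*60*60
    following = strftime("%F", this)
    latest = $1 "-" $2 "-" $3
}
END { if (start != "") print (start == latest ? start : start "-" latest) }' filename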

gawk:
#!/bin/awk -f
BEGIN{
    FS="-"
}
{
    a[NR]=mktime($1" "$2" "$3" 0 0 0")
    b[NR]=$2;
    if ( (a[NR-1]+86400) != a[NR] || b[NR-1]!=b[NR] ) {
        if(NR!=1){
            print s" - "strftime("%Y-%m-%d",a[NR-1])
        };
        s=$0
    }
}
END{
    print s" - "$0
}
Create array a with index NR and, as the value, the epoch time derived from the line using the awk time function mktime.
Create array b with index NR and, as the value, the month from $2.
If the epoch time of the previous line + 86400 (+1 day) is not equal to the epoch time of the current line, or the month differs between the previous and current line, then (except on the first line) print s" - "strftime("%Y-%m-%d",a[NR-1]) and reassign s, the start date, to $0.
In the END block, print the last start date s and the last line.
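Assuming the script is saved as tst.awk, a sample run against the question's file (note this variant separates the dates with " - "):
$ gawk -f tst.awk file
2016-01-01 - 2016-01-06
2016-01-08 - 2016-01-12
2016-01-15 - 2016-01-17
2016-01-20 - 2016-01-21
2016-01-30 - 2016-01-31
2016-02-01 - 2016-02-01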

Related

UNIX group by two values

I have a file with the following lines (values are separated by ";"):
dev_name;dev_type;soft
name1;ASR1;11.1
name2;ASR1;12.2
name3;ASR1;11.1
name4;ASR3;15.1
I know how to group them by one value, such as the count of all ASRx, but how can I group by two values, for example:
ASR1
*11.1 - 2
*12.2 - 1
ASR3
*15.1 - 1
Another awk:
$ awk -F';' 'NR>1 {a[$2]; b[$3]; c[$2,$3]++}
END {for(k in a) {print k;
for(p in b)
if(c[k,p]) print "\t*"p,"-",c[k,p]}}' file
ASR1
*11.1 - 2
*12.2 - 1
ASR3
*15.1 - 1
$ cat tst.awk
BEGIN { FS=";"; OFS=" - " }
NR==1 { next }
$2 != prev { prt(); prev=$2 }
{ cnt[$3]++ }
END { prt() }
function prt( soft) {
    if ( prev != "" ) {
        print prev
        for (soft in cnt) {
            print " *" soft, cnt[soft]
        }
        delete cnt
    }
}
$ awk -f tst.awk file
ASR1
*11.1 - 2
*12.2 - 1
ASR3
*15.1 - 1
Or if you like pipes....
$ tail +2 file | cut -d';' -f2- | sort | uniq -c |
awk -F'[ ;]+' '{print ($3!=prev ? $3 ORS : "") " *" $4 " - " $2; prev=$3}'
ASR1
*11.1 - 2
*12.2 - 1
ASR3
*15.1 - 1
Try something like:
awk -F ';' '
NR==1{next}
{aRaw[$2"-"$3]++}
END {
    asorti( aRaw, aVal)
    for( Val in aVal) {
        split( aVal [Val], aTmp, /-/ )
        if ( aTmp[1] != Last ) { Last = aTmp[1]; print Last }
        print " " aTmp[2] " " aRaw[ aVal[ Val] ]
    }
}
' YourFile
The key here is to use the two fields together as one array index. The END part, which presents the values, is more work than collecting them.
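A sample run against the same file (asorti requires gawk; this also assumes aVal is traversed in index order, which gawk does not strictly guarantee for for-in). Note the output format is plainer than the requested one:
ASR1
 11.1 2
 12.2 1
ASR3
 15.1 1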
Using Perl
$ cat bykub.txt
dev_name;dev_type;soft
name1;ASR1;11.1
name2;ASR1;12.2
name3;ASR1;11.1
name4;ASR3;15.1
$ perl -F";" -lane ' $kv{$F[1]}{$F[2]}++ if $.>1;END { while(($x,$y) = each(%kv)) { print $x;while(($p,$q) = each(%$y)){ print "\t\*$p - $q" }}}' bykub.txt
ASR1
*11.1 - 2
*12.2 - 1
ASR3
*15.1 - 1
$
Yet Another Solution, this one using the always useful GNU datamash to count the groups:
$ datamash -t ';' --header-in -sg 2,3 count 3 < input.txt |
awk -F';' '$1 != curr { curr = $1; print $1 } { print "\t*" $2 " - " $3 }'
ASR1
*11.1 - 2
*12.2 - 1
ASR3
*15.1 - 1
I don't want to encourage lazy questions, but I wrote a solution, and I'm sure someone can point out improvements. I love posting answers on this site because I learn so much. :)
One binary subcall to sort, otherwise all built-in processing. That means using read, which is slow. If your file is large, I'd recommend rewriting the loop in awk or perl, but this will get the job done.
sed 1d groups |                        # strip the header
sort -t';' -k2,3 > group.srt           # pre-sort to collect groupings
declare -i ctr=0                       # initialize integer record counter
IFS=';' read x lastA lastB < group.srt # priming read for comparators
printf "$lastA\n\t*$lastB - "          # priming print (assumes at least one record)
while IFS=';' read x a b               # loop through the file
do  if [[ "$lastA" < "$a" ]]           # on every MAJOR change
    then printf "$ctr\n$a\n\t*$b - "   # print total, new MAJOR header and MINOR header
         lastA="$a"                    # update the MAJOR comparator
         lastB="$b"                    # update the MINOR comparator
         ctr=1                         # reset the counter
    elif [[ "$lastB" < "$b" ]]         # on every MINOR change
    then printf "$ctr\n\t*$b - "       # print total and MINOR header
         lastB="$b"                    # update the MINOR comparator
         ctr=1                         # reset the counter
    else (( ctr++ ))                   # otherwise increment
    fi
done < group.srt                       # feed read from sorted file
printf "$ctr\n"                        # print final group total at EOF

AWK: increment a field based on values from previous line

Given the following input for AWK:
10;20;20
8;41;41
15;52;52
How could I increase/decrease the values so that:
$1 = remains unchanged
$2 = $2 of previous line + $1 of previous line + 1
$3 = $3 of previous line + $1 of previous line + 1
So the desired output would be:
10;20;20
8;31;31
15;40;40
I need to auto-increment and loop over the lines,
using associative arrays, but it's confusing for me.
Unsurprisingly, this doesn't work as desired:
#!/bin/awk -f
BEGIN { FS = ";" }
{
    print ln, st, of
    ln=$1
    st=$2 + ln + 1
    of=$3 + ln + 1
}
With awk:
awk -F";" -v OFS=";" 'NR!=1{ $2=a[2]+a[1]+1; $3=a[3]+a[1]+1 } { split($0,a,FS) } 1' file
Split the line into an array; when processing the next line, we can then use the stored values.
Test:
10;20;20
8;31;31
15;40;40
The following awk may help:
awk -F";" '
FNR==1{
    val=$1;
    val1=$2;
    val2=$3;
    print;
    next
}
{
    $2=val+val1+1;
    $3=val+val2+1;
    print;
    val=$1;
    val1=$2;
    val2=$3;
}' OFS=";" Input_file
For your given Input_file, the output will be as follows:
10;20;20
8;31;31
15;40;40
awk 'BEGIN{
    FS = OFS = ";"
}
FNR>1{
    $2 = p2 + p1 + 1
    $3 = p3 + p1 + 1
}
{
    p1=$1; p2=$2; p3=$3
}1
' infile
Input:
$ cat infile
10;20;20
8;41;41
15;52;52
Output:
$ awk 'BEGIN{FS=OFS=";"}FNR>1{$2=p2+p1+1; $3=p3+p1+1 }{p1=$1; p2=$2; p3=$3}1' infile
10;20;20
8;31;31
15;40;40
Or store only the fields of interest:
awk -v myfields="2,3" '
BEGIN{
    FS=OFS=";";
    split(myfields,t,/,/)
}
{
    for(i in t)
    {
        if(FNR>1)
        {
            $(t[i]) = a[t[i]] + a[1] + 1
        }
        a[t[i]] = $(t[i])
    }
    a[1] = $1
}1' infile
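For the sample infile this parameterized variant prints the same output:
10;20;20
8;31;31
15;40;40
Passing, say, -v myfields="3" (a hypothetical variation) would update only the third field.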

How to group a sequence of items into square brackets

How can I group a sequence of items into bracketed ranges? For example, given the following list of items:
cat item.txt
sn01
sn02
sn03
sn05
sn07
sn08
Desired output
sn[01-03,05,07-08]
If your data is the same as the shown Input_file sample, then the following may help:
awk 'FNR==1{line=$0} {sub(/[a-z]+/,"")} $0-val>1 && val1!=val{out=out?out "," val1"-"val:line"[" val1"-"val;val1=$0} $0-val>1 && val1==val{out=out?out "," val1:out "," val1;val1=$0} {if(FNR==1){sub(/[0-9]+/,"",line);val1=$0};val=$0}END{if(val1!=val){print out "," val1"-"val"]"} else {print out "," val"]"}}' Input_file
Here is the non-one-liner form of the solution too:
awk '
FNR==1{
    line=$0
}
{
    sub(/[a-z]+/,"")
}
$0-val>1 && val1!=val{
    out=out?out "," val1"-"val:line"[" val1"-"val;
    val1=$0
}
$0-val>1 && val1==val{
    out=out?out "," val1:out "," val1;
    val1=$0
}
{
    if(FNR==1){
        sub(/[0-9]+/,"",line);
        val1=$0
    };
    val=$0
}
END{
    if(val1!=val){
        print out "," val1"-"val"]"
    }
    else{
        print out "," val"]"
    }
}
' Input_file
The output will be as follows.
sn[01-03,05,07-08]
"sn" being static here. it should pick it from the input file. when I
given list of items start with "cn". still it picks "sn"
Using awk:
$ cat infile
sn01
sn02
sn03
sn05
sn07
sn08
cn08
cn09
cn10
cn11
cn15
when search='sn'
$ awk -v search='sn' 'function pr(){if(f && l)printf("%s%s",n?",":search"[",f==l?f:f"-"l)}$0!~"^"search{next}{t=$1;sub(/[^0-9]+/,"",t)}f==""{f=l=t;next}t==l+1{l=t;next}{pr();f=l=t;n++}END{pr(); print n?"]":"Nothing matched for keyword :"search}' infile
sn[01-03,05,07-08]
when search='cn'
$ awk -v search='cn' 'function pr(){if(f && l)printf("%s%s",n?",":search"[",f==l?f:f"-"l)}$0!~"^"search{next}{t=$1;sub(/[^0-9]+/,"",t)}f==""{f=l=t;next}t==l+1{l=t;next}{pr();f=l=t;n++}END{pr(); print n?"]":"Nothing matched for keyword :"search}' infile
cn[08-11,15]
Better readable:
awk -v search='sn' '
function pr()
{
    if(f && l)
        printf("%s%s",n?",":search"[",f==l?f:f"-"l)
}
$0!~"^"search{
    next
}
{
    t=$1;
    sub(/[^0-9]+/,"",t)
}
f==""{
    f=l=t;
    next
}
t==l+1{
    l=t;
    next
}
{
    pr();
    f=l=t;
    n++
}
END{
    pr();
    print n?"]":"Nothing matched for keyword :"search
}' infile
A simple awk solution.
We aim to set LB and UB for each possible range.
Starting from LB, the last number in the sequence up to which the common difference is 1 gives us the UB.
If the difference is more than 1, print the completed range and set LB again.
$ awk 'FNR==1{ $1=$1; prefix=substr($0,1,2);} {gsub(/[^0-9]/,"",$1); a[++i]=$1;} END{ printf prefix"["; LB=UB=prev=a[1]; for(i=1; i<=NR; i++){ if(int(a[i+1])==int(prev+1)) { UB=a[i+1]; prev=UB; } else { if(LB==UB) { printf LB"," } else {delim=(i==NR)? "]" :","; printf LB "-" UB delim; } prev=LB=UB=a[i+1]; }} }' file
sn[01-03,05,07-08]
gsub(/[^0-9]/,"",$1) sets all non-digit chars to null, so $1 ends up with just the number.
To understand it better, here is the expanded form:
$ awk 'FNR==1{ $1=$1; prefix=substr($0,1,2); } {gsub(/[^0-9]/,"",$1); a[++i]=$1;}
END {
    printf prefix"["; LB=UB=prev=a[1];
    for(i=1; i<=NR; i++)
    {
        if(int(a[i+1])==int(prev+1))
        {
            UB=a[i+1];
            prev=UB;
        }
        else
        {
            if(LB==UB)
            {
                printf LB","
            }
            else
            {
                delim=(i==NR)? "]" :",";
                printf LB "-" UB delim;
            }
            prev=LB=UB=a[i+1];
        }
    }
}' file
Awk solution:
awk '{ v=substr($0,3) }NR==1{ pfx=substr($0,1,2); r=a=v; next }
{ diff=v-a; if(diff>1) { r=r ((a==last)? ",":"-"a",")v; last=v } a=v }
END{ if(diff==1) r=r"-"v; print pfx"["r"]" }' file
The output:
sn[01-03,05,07-08]

Got stuck with multiple-value validation against a particular column in awk?

I have a text file and I'm trying to validate a particular column (5). If that column contains only values like ACT, LFP, TST, and EPO, the file goes on for further processing; otherwise the script should exit. That is, if every value in column 5 is one of ACT, LFP, TST, or EPO, the file proceeds; if the column contains anything apart from those four values, the script will terminate.
Code
cat test.txt \
| awk -F '~' -v ERR="/a/x/ERROR" -v NAME="/a/x/z/" -v WRKD="/a/x/b/" -v DATE="23_09_16" -v PD="234" -v FILE_NAME="FILENAME" \
'{ if ($5 != "ACT" || $5 != "LFP" || $5 != "EPO" || $5 != "TST")
system("mv "NAME" "ERR);
system("rm -f"" "WRKD);
print DATE" " PD " " "[" FILE_NAME "]" " ERROR: Panel status contains invalid value due to this file move to error folder";
print DATE" " PD " " "[" FILE_NAME "]" " INFO: Script is exited";
system("exit");
}' >>log.txt
Text file test.txt (note: it should be processed successfully):
161518~CHEM~ACT~IRPMR~ACT~UD
010282~CHEM~ACT~IRPMR~ACT~UD
162794~CHEM~ACT~IRPMR~LFP~UD
030767~CHEM~ACT~IRPMR~LFP~UD
Text file test1.txt (note: it should not be processed successfully; it contains one invalid value):
161518~CHEM~ACT~IRPMR~**ACT1**~UD
010282~CHEM~ACT~IRPMR~ACT~UD
162794~CHEM~ACT~IRPMR~TST~UD
030767~CHEM~ACT~IRPMR~LFP~UD
awk to the rescue!
Let's assume the following input file:
010282~CHEM~ACT~IRPMR~ACT~UD
121212~CHEM~ACT~IRPMR~ZZZ~UD
162794~CHEM~ACT~IRPMR~TST~UD
020202~CHEM~ACT~IRPMR~YYY~UD
030767~CHEM~ACT~IRPMR~LFP~UD
987654~CHEM~ACT~IRPMR~EPO~UD
010101~CHEM~ACT~IRPMR~XXX~UD
123456~CHEM~ACT~IRPMR~TST~UD
1) This example illustrates how to check for invalid lines/records in the input file:
#!/bin/awk
BEGIN {
    FS = "~"
    s = "ACT,LFP,TST,EPO"
    n = split( s, a, "," )
}
{
    for( i = 1; i <= n; i++ )
        if( a[i] == $5 )
            next
    print "Unexpected value # line " NR " [" $5 "]"
}
# eof #
Testing:
$ awk -f script.awk -- input.txt
Unexpected value # line 2 [ZZZ]
Unexpected value # line 4 [YYY]
Unexpected value # line 7 [XXX]
2) This example illustrates how to filter out (remove) invalid lines/records from the input file:
#!/bin/awk
BEGIN {
    FS = "~"
    s = "ACT,LFP,TST,EPO"
    n = split( s, a, "," )
}
{
    for( i = 1; i <= n; i++ )
    {
        if( a[i] == $5 )
        {
            print $0
            next
        }
    }
}
# eof #
Testing:
$ awk -f script.awk -- input.txt
010282~CHEM~ACT~IRPMR~ACT~UD
162794~CHEM~ACT~IRPMR~TST~UD
030767~CHEM~ACT~IRPMR~LFP~UD
987654~CHEM~ACT~IRPMR~EPO~UD
123456~CHEM~ACT~IRPMR~TST~UD
3) This example illustrates how to display the invalid lines/records from the input file:
#!/bin/awk
BEGIN {
    FS = "~"
    s = "ACT,LFP,TST,EPO"
    n = split( s, a, "," )
}
{
    for( i = 1; i <= n; i++ )
        if( a[i] == $5 )
            next
    print $0
}
# eof #
Testing:
$ awk -f script.awk -- input.txt
121212~CHEM~ACT~IRPMR~ZZZ~UD
020202~CHEM~ACT~IRPMR~YYY~UD
010101~CHEM~ACT~IRPMR~XXX~UD
Hope it Helps!
Without getting into the calls to system, this will show you an answer.
awk -F"~" '{ if (! ($5 == "ACT" || $5 == "LFP" || $5 == "EPO" || $5 == "TST")) print $0}' data.txt
output
161518~CHEM~ACT~IRPMR~**ACT1**~UD
This version tests whether $5 matches at least one item in the list. If it doesn't (which is what the ! in front of the || chain tests), then it prints the record as an error.
Of course, $5 will match only one from that list at a time, but that is all you need.
By contrast, when you say
if ($5 != "ACT" || $5 != "LFP" ...)
you're creating a logic test that can never be false. Whatever $5 is, it must differ from at least one of the listed values, so the first != comparison that holds makes the whole || chain true, and the remaining comparisons are never even checked.
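A quick demonstration, using a valid record from test.txt, of why the || chain always fires:
$ echo '161518~CHEM~ACT~IRPMR~ACT~UD' | awk -F'~' '{ print (($5 != "ACT" || $5 != "LFP") ? "condition true" : "condition false") }'
condition true
Here $5 is "ACT", yet the condition is still true because $5 != "LFP" holds.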
IHTH

Use gawk to convert from human readable time in a file to unix time?

I am new to gawk. Based on this thread, I have already made a gawk function to convert a datetime such as "07,JUN,2015,06,PM" to Unix time (milliseconds):
$ cat tst.awk
function cvttime(t, a) {
    split(t,a,/[,: ]+/)
    # fa0,07,DEC,2014,10,AM,862.209018
    # =>
    # a[2] = "07"   date
    # a[3] = "DEC"  month
    # a[4] = "2014" year
    # a[5] = "10"   hour
    # a[6] = "AM"   AM/PM
    if ( (a[6] == "PM") && (a[5] < 12) ) {
        a[5] += 12
    }
    match("JANFEBMARAPRMAYJUNJULAUGSEPOCTNOVDEC",a[3])
    a[3] = (RSTART+2)/3
    return( a[1]","mktime(a[4]" "a[3]" "a[2]" "a[5]" 00 0")"000,"a[7])
}
BEGIN {
    mdt = "fa0,07,DEC,2014,10,AM,862.209018"
    ms = cvttime(mdt)
    print ms
}
In the terminal, the following command gives me the correct Unix time:
$ TZ=UTC gawk -f tst.awk
returns:
fa0,1417946400000,862.209018
Now, I have a file "input.csv" containing:
aa1,07,DEC,2014,06,AM,282.485988
ac3,07,DEC,2014,07,AM,97.6757181
ef3,07,DEC,2014,08,AM,112.816554
ag3,07,DEC,2014,09,AM,101.479961
fa0,07,DEC,2014,10,AM,862.209018
How should I modify the gawk function and the shell command to pass in "input.csv" and generate "output.csv" containing:
aa1,1417932000000,282.485988
ac3,1417935600000,97.6757181
ef3,1417939200000,112.816554
ag3,1417942800000,101.479961
fa0,1417946400000,862.209018
Thanks in advance!
You could use something like this:
awk -F, '{"date -d "$3"\" \""$2"\" \""$5"\" \""$6"\" \""$4" '+%s'" | getline d; print $1","d"000,"$NF""}' input.csv > output.csv
or
awk -F, '{"date -u -d "$3"\" \""$2"\" \""$5"\" \""$6"\" \""$4" '+%s'" | getline d; print $1","d"000,"$NF""}' input.csv > output.csv
The second command sets date's -u flag, if you want the conversion done in UTC:
-u, --utc, --universal
print or set Coordinated Universal Time
Well this is confusing because your input times do not match your output times,
but I think this does what you want:
BEGIN {
    FS = OFS = ","
}
{
    # fix year
    $4 += 2000
    # fix month
    match("JANFEBMARAPRMAYJUNJULAUGSEPOCTNOVDEC", $3)
    $3 = (RSTART + 2) / 3
    # fix hour
    if ($6 == "PM" && $5 < 12)
        $5 += 12
    print $1, mktime($4 " " $3 " " $2 " " $5 " 0 0") * 1000, $NF
}
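Assuming the script is saved as tst.awk and, as in the question, run with TZ=UTC so mktime interprets the timestamps as UTC:
$ TZ=UTC gawk -f tst.awk input.csv > output.csv
$ cat output.csv
aa1,1417932000000,282.485988
ac3,1417935600000,97.6757181
ef3,1417939200000,112.816554
ag3,1417942800000,101.479961
fa0,1417946400000,862.209018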
