How to count bracketed asset IDs per ten-minute slice of a log (awk / bash / sed / tr)?

A server logs a comma-separated list of asset IDs in square brackets after the date and a colon:
20160420084726:-
20160420085418:[111783178, 111557953, 111646835, 111413356, 111412662, 105618372, 111413557]
20160420085418:[111413432, 111633904, 111783198, 111792767, 111557948, 111413225, 111413281]
20160420085418:[111413432, 111633904, 111783198, 111792767, 111557948, 111413225, 111413281]
20160420085522:[111344871, 111394583, 111295547, 111379566, 111352520]
20160420090022:[111344871, 111394583, 111295547, 111379566, 111352520]
The format of the input log is:
timestamp:ads
Where:
timestamp is in the format YYYYMMDDhhmmss, and ads is a comma-separated list of ad asset IDs surrounded by square brackets, or - if no ads were returned.
The first part of the task is to write a script that outputs, for each ten-minute slice of the day:
Count of IDs that were returned
Count of unique IDs that were returned
The script should support a command-line parameter to select whether unique or total IDs should be reported.
Example output using the above log excerpt (in total mode):
20160420084:0
20160420085:26
20160420090:5
And in unique count mode it would give:
20160420084:0
20160420085:19
20160420090:5
I have tried this:
awk -F '[,:]' '
{
    key = substr($1,1,11)"0"
    count[key] += ($2 == "-" ? 0 : NF-1)
}
END {
    PROCINFO["sorted_in"] = "@ind_num_asc"
    for (key in count) print key, count[key]
}
' $LOGFILENAME | grep $DATE;
The scripts given so far fail on other scenarios, for example this one:
log file:
https://drive.google.com/file/d/1sXFvLyCH8gZrXiqf095MubyP7-sLVUXt/view?usp=sharing
The first few lines of the results should be:
nonunique:
20160420000:1
20160420001:11
20160420002:13
20160420003:16
20160420004:3
20160420005:3
20160420010:6
unique:
20160420000:1
20160420001:5
20160420002:5
20160420003:5
20160420004:3
20160420005:3
20160420010:4

$ cat tst.awk
BEGIN { FS="[]:[]+"; OFS=":" }
{
    tot = unq = 0
    time = substr($1,1,11)
    if ( /,/ ) {
        tot = split($2,tmp,/, ?/)
        for ( i in tmp ) {
            if ( !seen[time,tmp[i]]++ ) {
                unq++
            }
        }
    }
    tots[time] += tot
    unqs[time] += unq
}
END {
    for (time in tots) {
        print time, tots[time], unqs[time]
    }
}
$ awk -f tst.awk file
20160420084:0:0
20160420085:26:19
20160420090:5:5
Massage to suit...
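For example, one possible massage (a sketch; the mode variable and the file name tst_mode.awk are placeholders, not part of the original answer) adds the requested command-line switch and also counts bracketed lists that hold a single ID, which the /,/ test above skips:
$ cat tst_mode.awk
BEGIN { FS="[]:[]+"; OFS=":" }
!NF { next }                              # skip blank lines
{
    tot = unq = 0
    time = substr($1,1,11)
    if ( $2 != "-" ) {                    # a bracketed list, possibly with just one ID
        tot = split($2,tmp,/, ?/)
        for ( i in tmp )
            if ( !seen[time,tmp[i]]++ )
                unq++
    }
    tots[time] += tot
    unqs[time] += unq
}
END {
    for (time in tots)
        print time, (mode == "unique" ? unqs[time] : tots[time])
}
$ awk -v mode=unique -f tst_mode.awk file | sort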

#!/bin/bash
while read; do
    dts=$( echo "$REPLY" | cut -d: -f1 )
    ids=$( echo "$REPLY" | grep -o '\[.*\]' )
    if [ $? -eq 0 ]; then
        ids=$( echo "$ids" | tr -d '[] ' | tr ',' '\n' | sort $1 )
        count=$( echo "$ids" | wc -l )
    else
        count=0
    fi
    echo $dts: $count
done
Run like this:
./script.sh [-u] <input.txt

Related

How to quickly sum values in a directed graph in shell?

I have a directed graph with around 2000 nodes stored in a file. Each line represents an edge from the node in the first column to the node in the second column, which makes the data easy to visualize, for example with dot(1). Columns are separated by tabs, rows by newlines, and nodes are named with any of the characters a-zA-Z0-9_. The graph can have multiple roots and may have cycles; I don't care about cycles, they are redundant, but they can happen in the input and should be ignored. Below is an example of the graph, using tr to turn the spaces into tabs and a here-document so the input file is easy to reproduce:
tr ' ' '\t' <<EOF >connections.txt
str1 str2
str2 str3
str3 str4
str100 str2
str100 str101
str101 str102
EOF
I also have a list of some nodes in the graph, called heads. These will be the starting nodes, i.e. heads:
tr ' ' '\t' <<EOF >heads.txt
str1
str100
EOF
And I also have a list of the "cost" associated with each node. Example with some random data:
tr ' ' '\t' <<EOF >cost.txt
str1 1
str2 5
str3 10
str4 548
str100 57
str101 39
str102 23
EOF
I want to sum the "cost" of each node while traversing the graph from the nodes stored in heads.txt and print the accumulated cost, with some traversal information, for each leaf.
I want to:
for each node in heads.txt
sum the cost of the node from cost.txt into some variable
find that node in connections.txt
find what this node connects to
and repeat the algorithm for each of the nodes it connects to
when the node connects to nothing, print the sum of costs
Ideally the script would look like:
$ script.sh heads.txt connections.txt cost.txt
str1->str2->str3->str4 1+5+10+548 564
str100->str2->str3->str4 57+5+10+548 620
str100->str101->str102 57+39+23 119
I have even written this, and it works:
#!/bin/bash
set -euo pipefail

headsf=$1
connectionsf=$2
costf=$3

get_cost() {
    grep "^$1"$'\t' "$costf" | cut -f2 || echo 0
}
get_conn() {
    grep "^$1"$'\t' "$connectionsf" | cut -f2
}
check_conns() {
    grep -q "^$1"$'\t' "$connectionsf"
}
f_output() {
    printf "%s\t%s\n" "$1" "$2"
}
f() {
    local func cost
    func="$1"
    cost=$(get_cost "$func")
    if ! check_conns "$func"; then
        f_output "${2:+$2->}$func" "${3:+$3+}$cost"
        return
    fi
    get_conn "$func" |
    while IFS=$'\t' read -r calls; do
        if [ "$func" = "$calls" ]; then
            echo "$func is recursive" >&2
            continue
        fi
        if <<<"$2" grep -q -w "$calls"; then
            printf "$2 calls recursive $calls\n" >&2
            continue
        fi
        f "$calls" "${2:+$2->}$func" "${3:+$3+}$cost"
    done
}

while IFS= read -r head; do
    f "$head" "" ""
done < "$headsf" |
while IFS=$'\t' read -r func calc; do
    tmp=$(<<<$calc bc)
    printf "%s\t%s\t%s\n" "$func" "$calc" "$tmp"
done |
column -t -s $'\t'
However it is impossibly slow on bigger inputs. Even with sample files here (only 6 lines) the script takes 200ms on my machine. How can I speed it up? Can the inputs be sorted, joined somehow to speed it up (grep doesn't care if the input is sorted)? Can this be done faster in awk or other unix tools?
I would like to limit myself to the bash shell and standard *unix tools: coreutils, moreutils, datamash and such. I tried doing it in awk, but failed; I have no idea how to search the input recursively in awk. This feels to me like something that should be "doable" really fast in a shell script.
Since no one has posted an answer yet, here is an awk solution as a starting point:
#!/usr/bin/awk -f
BEGIN {
    FS=OFS="\t"
}
FILENAME=="connections.txt" {
    edges[$1,++count[$1]]=$2
    next
}
FILENAME=="cost.txt" {
    costs[$1]=$2
    next
}
FILENAME=="heads.txt" {
    f($1)
}
function f(node,
           path,cost,sum,prev,sep1,sep2,i) {
    if(node in prev)
        # cycle detected
        return
    path=path sep1 node
    cost=cost sep2 costs[node]
    sum+=costs[node]
    if(!count[node]) {
        print path,cost,sum
    }
    else {
        prev[node]
        for(i=1;i<=count[node];++i)
            f(edges[node,i],path,cost,sum,prev,"->","+")
        delete prev[node]
    }
}
Make it read connections.txt and cost.txt before heads.txt.
Its output (padded):
$ awk -f tst.awk connections.txt cost.txt heads.txt
str1->str2->str3->str4 1+5+10+548 564
str100->str2->str3->str4 57+5+10+548 620
str100->str101->str102 57+39+23 119
You say you want only standard tools, but you also mention using dot on your data, so I'm assuming you have the other graphviz utilities available... in particular, gvpr, which is like awk for graphs:
#!/usr/bin/env bash
graph=$(mktemp)

join -t$'\t' -j1 -o 0,1.2,2.2 -a2 \
    <(sort -k1,1 connections.txt) \
    <(sort -k1,1 cost.txt) |
awk -F$'\t' 'BEGIN { print "digraph g {" }
    { printf "%s [cost = %d ]\n", $1, $3
      if ($2 != "") printf "%s -> %s\n", $1, $2 }
    END { print "}" }' > "$graph"

while read root; do
    gvpr -a "$root" '
    BEGIN {
        int depth;
        int seen[string];
        string path[int];
        int costs[int];
    }
    BEG_G {
        $tvtype = TV_prepostfwd;
        $tvroot = node($, ARGV[0]);
    }
    N {
        if ($.name in seen) {
            depth--;
        } else {
            seen[$.name] = 1;
            path[depth] = $.name;
            costs[depth] = $.cost;
            depth++;
            if (!fstout($) && path[0] == ARGV[0]) {
                int i, c = 0;
                for (i = 0; i < depth - 1; i++) {
                    printf("%s->", path[i]);
                }
                printf("%s\t", $.name);
                for (i = 0; i < depth - 1; i++) {
                    c += costs[i];
                    printf("%d+", costs[i]);
                }
                c += $.cost;
                printf("%d\t%d\n", $.cost, c);
            }
        }
    }' "$graph"
done < heads.txt

rm -f "$graph"
Running this after creating your data files:
$ ./paths.sh
str1->str2->str3->str4 1+5+10+548 564
str100->str2->str3->str4 57+5+10+548 620
str100->str101->str102 57+39+23 119
Or, since it's so ubiquitous it might as well be standard, a sqlite-based solution. This one doesn't even require bash/zsh/ksh93, unlike the above.
$ sqlite3 -batch -noheader -list <<EOF
.separator "\t"
CREATE TABLE heads(node TEXT);
.import heads.txt heads
CREATE TABLE costs(node TEXT PRIMARY KEY, cost INTEGER) WITHOUT ROWID;
.import cost.txt costs
CREATE TABLE connections(from_node TEXT, to_node TEXT
, PRIMARY KEY(from_node, to_node)) WITHOUT ROWID;
.import connections.txt connections
WITH RECURSIVE paths(tail, path, costs, cost) AS
(SELECT h.node, h.node, c.cost, c.cost
FROM heads AS h
JOIN costs AS c ON h.node = c.node
UNION ALL
SELECT conn.to_node, p.path || '->' || conn.to_node
, p.costs || '+' || c.cost, p.cost + c.cost
FROM paths AS p
JOIN connections AS conn ON conn.from_node = p.tail
JOIN costs AS c ON c.node = conn.to_node
)
SELECT path, costs, cost FROM paths AS p
WHERE tail NOT IN (SELECT from_node FROM connections)
ORDER BY path;
EOF
str1->str2->str3->str4 1+5+10+548 564
str100->str101->str102 57+39+23 119
str100->str2->str3->str4 57+5+10+548 620

How to split file by percentage of no. of lines?

Let's say I want to split my file into 3 portions (60%/20%/20%). I could do this manually, -_- :
$ wc -l brown.txt
57339 brown.txt
$ bc <<< "57339 / 10 * 6"
34398
$ bc <<< "57339 / 10 * 2"
11466
$ bc <<< "34398 + 11466"
45864
bc <<< "34398 + 11466 + 11475"
57339
$ head -n 34398 brown.txt > part1.txt
$ sed -n 34399,45864p brown.txt > part2.txt
$ sed -n 45865,57339p brown.txt > part3.txt
$ wc -l part*.txt
34398 part1.txt
11466 part2.txt
11475 part3.txt
57339 total
But I'm sure there's a better way!
There is a utility that takes as arguments the line numbers that should become the first of each respective new file: csplit. This is a wrapper around its POSIX version:
#!/bin/bash

usage () {
    printf '%s\n' "${0##*/} [-ks] [-f prefix] [-n number] file arg1..." >&2
}

# Collect csplit options
while getopts "ksf:n:" opt; do
    case "$opt" in
        k|s) args+=(-"$opt") ;;           # k: no remove on error, s: silent
        f|n) args+=(-"$opt" "$OPTARG") ;; # f: filename prefix, n: digits in number
        *) usage; exit 1 ;;
    esac
done
shift $(( OPTIND - 1 ))

fname=$1
shift
ratios=("$@")
len=$(wc -l < "$fname")

# Sum of ratios and array of cumulative ratios
for ratio in "${ratios[@]}"; do
    (( total += ratio ))
    cumsums+=("$total")
done

# Don't need the last element
unset cumsums[-1]

# Array of numbers of first line in each split file
for sum in "${cumsums[@]}"; do
    linenums+=( $(( sum * len / total + 1 )) )
done

csplit "${args[@]}" "$fname" "${linenums[@]}"
After the name of the file to split up, it takes the ratios for the sizes of the split files relative to their sum, i.e.,
percsplit brown.txt 60 20 20
percsplit brown.txt 6 2 2
percsplit brown.txt 3 1 1
are all equivalent.
Usage similar to the case in the question is as follows:
$ percsplit -s -f part -n 1 brown.txt 60 20 20
$ wc -l part*
34403 part0
11468 part1
11468 part2
57339 total
Numbering starts with zero, though, and there is no txt extension. The GNU version supports a --suffix-format option that would allow for .txt extension and which could be added to the accepted arguments, but that would require something more elaborate than getopts to parse them.
This solution plays nice with very short files (split file of two lines into two) and the heavy lifting is done by csplit itself.
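If GNU csplit is available (an assumption; the POSIX version lacks this), the --suffix-format option mentioned above also has the short form -b, so the .txt extension can be produced in a one-off call using the split points from the question:
# GNU csplit only: -b sets the suffix format; output is part0.txt, part1.txt, part2.txt
csplit -s -f part -b '%d.txt' brown.txt 34399 45865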
$ cat file
a
b
c
d
e
$ cat tst.awk
BEGIN {
    split(pcts,p)
    nrs[1]
    for (i=1; i in p; i++) {
        pct += p[i]
        nrs[int(size * pct / 100) + 1]
    }
}
NR in nrs { close(out); out = "part" ++fileNr ".txt" }
{ print $0 " > " out }
$ awk -v size=$(wc -l < file) -v pcts="60 20 20" -f tst.awk file
a > part1.txt
b > part1.txt
c > part1.txt
d > part2.txt
e > part3.txt
Change the " > " to just > to actually write to the output files.
Usage
The following bash script allows you to specify the percentages like
./split.sh brown.txt 60 20 20
You can also use the placeholder ., which fills the remaining percentage up to 100%.
./split.sh brown.txt 60 20 .
The split files are written to
part1-brown.txt
part2-brown.txt
part3-brown.txt
The script always generates as many part files as numbers specified.
If the percentages sum up to 100, cat part* will always generate the original file (no duplicated or missing lines).
Bash Script: split.sh
#! /bin/bash
file="$1"
fileLength=$(wc -l < "$file")
shift

part=1
percentSum=0
currentLine=1
for percent in "$@"; do
    [ "$percent" == "." ] && ((percent = 100 - percentSum))
    ((percentSum += percent))
    if ((percent < 0 || percentSum > 100)); then
        echo "invalid percentage" 1>&2
        exit 1
    fi
    ((nextLine = fileLength * percentSum / 100))
    if ((nextLine < currentLine)); then
        printf "" # create empty file
    else
        sed -n "$currentLine,$nextLine"p "$file"
    fi > "part$part-$file"
    ((currentLine = nextLine + 1))
    ((part++))
done
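To sanity-check the claim above that cat part* reproduces the original when the percentages sum to 100, a quick check could look like this (just a sketch; it assumes the 60/20/20 run shown earlier):
./split.sh brown.txt 60 20 20
cat part1-brown.txt part2-brown.txt part3-brown.txt | cmp - brown.txt && echo "parts add up to the original"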
BEGIN {
    split(w, weight)
    total = 0
    for (i in weight) {
        weight[i] += total
        total = weight[i]
    }
}
FNR == 1 {
    if (NR!=1) {
        write_partitioned_files(weight,a)
        split("",a,":") # empty a portably
    }
    name=FILENAME
}
{ a[FNR]=$0 }
END {
    write_partitioned_files(weight,a)
}
function write_partitioned_files(weight, a) {
    split("",threshold,":")
    size = length(a)
    for (i in weight) {
        threshold[length(threshold)] = int((size * weight[i] / total)+0.5)+1
    }
    l=1
    part=0
    for (i in threshold) {
        close(out)
        out = name ".part" ++part
        for (;l<threshold[i];l++) {
            print a[l] " > " out
        }
    }
}
Invoke as:
awk -v w="60 20 20" -f above_script.awk file_to_split1 file_to_split2 ...
Replace " > " with > in script to actually write partitioned files.
The variable w expects space separated numbers. Files are partitioned in that proportion. For example "2 1 1 3" will partition files into four with number of lines in proportion of 2:1:1:3. Any sequence of numbers adding up to 100 can be used as percentages.
For large files the array a may consume too much memory. If that is an issue, here is an alternative awk script:
BEGIN {
    split(w, weight)
    for (i in weight) {
        total += weight[i]; weight[i] = total # cumulative sum
    }
}
FNR == 1 {
    # get number of lines. take care of single quotes in filename.
    name = gensub("'", "'\"'\"'", "g", FILENAME)
    "wc -l '" name "'" | getline size
    split("", threshold, ":")
    for (i in weight) {
        threshold[length(threshold)+1] = int((size * weight[i] / total)+0.5)+1
    }
    part=1; close(out); out = FILENAME ".part" part
}
{
    if (FNR>=threshold[part]) {
        close(out); out = FILENAME ".part" ++part
    }
    print $0 " > " out
}
This passes through each file twice. Once for counting lines (via wc -l) and the other time while writing partitioned files. Invocation and effect is similar to the first method.
I like Benjamin W.'s csplit solution, but it's so long...
#!/bin/bash
# usage ./splitpercs.sh file 60 20 20
n=`wc -l <"$1"` || exit 1
echo $* | tr ' ' '\n' | tail -n+2 | head -n`expr $# - 1` |
awk -v n=$n 'BEGIN{r=1} {r+=n*$0/100; if(r > 1 && r < n){printf "%d\n",r}}' |
uniq | xargs csplit -sfpart "$1"
(the if(r > 1 && r < n) and uniq bits are to prevent creating empty files or strange behavior for small percentages, files with small numbers of lines, or percentages that add to over 100.)
I just followed your lead and made what you do manually into a script. It may not be the fastest or "best", but if you understand what you are doing now and can just "scriptify" it, you may be better off should you need to maintain it.
#!/bin/bash
# thisScript.sh yourfile.txt 20 50 10 20
YOURFILE=$1
shift

# changed to cat | wc so I don't have to remove the filename which comes from
# wc -l
LINES=$(cat $YOURFILE | wc -l )

startpct=0;
PART=1;
for pct in "$@"
do
    # I am assuming that each parameter is on top of the last
    # so 10 30 10 would become 10, 10+30 = 40, 10+30+10 = 50, ...
    endpct=$( echo "$startpct + $pct" | bc)
    # your math but changed parts of 100 instead of parts of 10.
    # change bc <<< to echo "..." | bc
    # so that one can capture the output into a bash variable.
    FIRSTLINE=$( echo "$LINES * $startpct / 100 + 1" | bc )
    LASTLINE=$( echo "$LINES * $endpct / 100" | bc )
    # use sed every time because the special case for head
    # doesn't really help performance.
    sed -n $FIRSTLINE,${LASTLINE}p $YOURFILE > part${PART}.txt
    ((PART++))
    startpct=$endpct
done

# get the rest if the % don't add to 100%
if [[ $( echo "$endpct < 100" | bc ) -gt 0 ]] ; then
    # remaining lines go into one more part file
    FIRSTLINE=$( echo "$LASTLINE + 1" | bc )
    sed -n $FIRSTLINE,\$p $YOURFILE > part${PART}.txt
fi

wc -l part*.txt

I have a protein sequence file; I want to count trimers in it using sed or grep

I have a protein sequence file in the following format
uniprotID\space\sequence
sequence is a string of any length but with only 20 allowed letters i.e.
ARNDCQEGHILKMFPSTWYV
Example of 1 record
Q5768D AKCCACAKCCAC
I want to create a csv file in the following format
Q5768D
12
ACA 1
AKC 2
CAC 2
CAK 1
CCA 2
KCC 2
This is what I'm currently trying:
#!/bin/sh
while read ID SEQ        # uniprot along with sequences
do
    echo $SEQ | tr -d '[[:space:]]' | sed 's/./& /g' > TEST_FILE
    declare -a SSA=(`cat TEST_FILE`)
    SQL=$(echo ${#SSA[@]})
    for (( X=0; X <= "$SQL"; X++ ))
    do
        Y=$(expr $X + 1)
        Z=$(expr $X + 2)
        echo ${SSA[X]} ${SSA[Y]} ${SSA[Z]}
    done | awk '{if (NF == 3) print}' | tr -d ' ' > TEMPTRIMER
    rm TEST_FILE         # removing temporary sequence file
    sort TEMPTRIMER | uniq -c > $ID.$SQL
done < $1
In this code I am storing each individual record in a separate file, which is not good. The program is also very slow: in 12 hours only 12,000 records out of 0.5 million have been processed.
If this is what you want:
$ cat file
Q5768D AKCCACAKCCAC
OTHER FOOBARFOOBAR
$
$ awk -f tst.awk file
Q5768D OTHER
12 12
AKC 2 FOO 2
KCC 2 OOB 2
CCA 2 OBA 2
CAC 2 BAR 2
ACA 1 ARF 1
CAK 1 RFO 1
This will do it:
$ cat tst.awk
BEGIN { OFS="\t" }
{
    colNr = NR
    rowNr = 0
    name[colNr] = $1
    lgth[colNr] = length($2)
    delete name2nr
    for (i=1;i<=(length($2)-2);i++) {
        trimer = substr($2,i,3)
        if ( !(trimer in name2nr) ) {
            name2nr[trimer] = ++rowNr
            nr2name[colNr,rowNr] = trimer
        }
        cnt[colNr,name2nr[trimer]]++
    }
    numCols = colNr
    numRows = (rowNr > numRows ? rowNr : numRows)
}
END {
    for (colNr=1;colNr<=numCols;colNr++) {
        printf "%s%s", name[colNr], (colNr<numCols?OFS:ORS)
    }
    for (colNr=1;colNr<=numCols;colNr++) {
        printf "%s%s", lgth[colNr], (colNr<numCols?OFS:ORS)
    }
    for (rowNr=1;rowNr<=numRows;rowNr++) {
        for (colNr=1;colNr<=numCols;colNr++) {
            printf "%s %s%s", nr2name[colNr,rowNr], cnt[colNr,rowNr], (colNr<numCols?OFS:ORS)
        }
    }
}
If instead you want output like in @rogerovo's perl answer, that'd be much simpler than the above, more efficient, and use far less memory:
$ cat tst2.awk
{
    delete cnt
    for (i=1;i<=(length($2)-2);i++) {
        cnt[substr($2,i,3)]++
    }
    printf "%s;%s", $1, length($2)
    for (trimer in cnt) {
        printf ";%s=%s", trimer, cnt[trimer]
    }
    print ""
}
$ awk -f tst2.awk file
Q5768D;12;ACA=1;KCC=2;CAK=1;CAC=2;CCA=2;AKC=2
OTHER;12;RFO=1;FOO=2;OBA=2;OOB=2;ARF=1;BAR=2
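Note that for (trimer in cnt) visits the trimers in no particular order; if the sorted order shown in the question matters, one option (assuming gawk is available) is to fix the traversal order in a BEGIN block:
BEGIN { PROCINFO["sorted_in"] = "@ind_str_asc" }   # gawk only: for-in then visits keys in sorted order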
This perl script processes circa 550,000 trimers/sec (random valid test sequences 0-8000 chars long; 100k records (~400MB) produce a 2GB output csv).
output:
Q1024A;421;AAF=1;AAK=1;AFC=1;AFE=2;AGP=1;AHC=1;AHE=1;AIV=1;AKN=1;AMC=1;AQD=1;AQY=1;...
Q1074F;6753;AAA=1;AAD=1;AAE=1;AAF=2;AAN=2;AAP=2;AAT=1;ACA=1;ACC=1;ACD=1;ACE=3;ACF=2;...
code:
#!/usr/bin/perl
use strict;
$|=1;

my $c;
# process each line on input
while (readline STDIN) {
    $c++; chomp;
    # is it a valid line? has the format and a sequence to process
    if (m~^(\w+)\s+([ARNDCQEGHILKMFPSTWYV]+)\r?$~ and $2) {
        print join ";",($1,length($2));
        my %trimdb;
        my $seq=$2;
        # split the sequence into chars
        my @a=split //,$seq;
        my @trimmer;
        # while there are unprocessed chars in the sequence...
        while (scalar @a) {
            # fill up the buffer with a char from the top of the sequence
            push @trimmer, shift @a;
            # if the buffer is full (has 3 chars), increase the trimer frequency
            if (scalar @trimmer == 3 ) {
                $trimdb{(join "",@trimmer)}++;
                # drop the first letter from buffer, for next loop
                shift @trimmer;
            }
        }
        # we're done with the sequence - print the sorted list of trimers
        foreach (sort keys %trimdb) {
            # print in a csv (;) line
            print ";$_=$trimdb{$_}";
        }
        print "\n";
    }
    else {
        # the input line was not valid.
        print STDERR "input error: $_\n";
    }
    # just a progress counter
    printf STDERR "%8i\r",$c if not $c%100;
}
print STDERR "\n";
If you have perl installed (most Linux systems do; check the path /usr/bin/perl or replace it with yours), just run: ./count_trimers.pl < your_input_file.txt > output.csv

How to append name and values of each set of records into multiple records

I am using the code below, but it takes too long to run.
while read TAG
do
    TAGNAME=$(echo $TAG | cut -d '>' -f1)
    TAGVALUE=$(echo $TAG | cut -d '>' -f2)
    if [ "$TAGNAME" = "START_OF_REC" ]
    then
        CNT_VAR=`expr $CNT_VAR + 1`
        DERIVED_ID=${DATE_VAR}${CNT_VAR}
        CUST_ID_VAR="NULL_CUST_ID"
        OPPOR_ID_VAR="NULL_OPPOR_ID"
    elif [ "$TAGNAME" = "bd-cust-id" ]
    then
        CUST_ID_VAR=$TAGVALUE
        sed -i 's/NULL_CUST_ID/'$CUST_ID_VAR'/g' $FLAT_FILE
        echo ${CUST_ID_VAR}${PIPE}${OPPOR_ID_VAR}${PIPE}${DERIVED_ID}${PIPE}${TAGNAME}${PIPE}${TAGVALUE} >> $FLAT_FILE
    elif [ "$TAGNAME" = "mars-opportunity-id" ]
    then
        OPPOR_ID_VAR=$TAGVALUE
        if [ "$OPPOR_ID_VAR" = "EMPTY_VAL" ]
        then
            sed -i 's/NULL_OPPOR_ID//g' $FLAT_FILE
        else
            sed -i 's/NULL_OPPOR_ID/'$OPPOR_ID_VAR'/g' $FLAT_FILE
            echo ${CUST_ID_VAR}${PIPE}${OPPOR_ID_VAR}${PIPE}${DERIVED_ID}${PIPE}${TAGNAME}${PIPE}${TAGVALUE} >> $FLAT_FILE
        fi
    else
        if [ "$OPPOR_ID_VAR" = "EMPTY_VAL" ]
        then
            echo ${CUST_ID_VAR}${PIPE}${PIPE}${DERIVED_ID}${PIPE}${TAGNAME}${PIPE}${TAGVALUE} >> $FLAT_FILE
        else
            echo ${CUST_ID_VAR}${PIPE}${OPPOR_ID_VAR}${PIPE}${DERIVED_ID}${PIPE}${TAGNAME}${PIPE}${TAGVALUE} >> $FLAT_FILE
        fi
    fi
done < INPUT_FILE
I'm using the above code to read 50K records; two example records are shown below, each starting with START_OF_REC.
I wrote a script, but it's taking forever to complete the 50K records.
I'm looking for a bash script that runs faster.
INPUT_FILE
START_OF_REC>START
trigger>SalesLeadCreated
message-sent-at-ts>2015-01-27T00:00.08
bd-cust-id>01234
mars-opportunity-id>2-BFGCMQ5
mars-activity-id>2-BFGCMPZ
lead-type>AccountOpen
media-ad-code>WWW
lead-action-code>completed
START_OF_REC>START
trigger>SalesLeadCreated
message-sent-at-ts>2015-01-27T00:00.10
bd-cust-id>054671
mars-opportunity-id>2-BFGC39C
mars-activity-id>2-BFGC396
lead-type>AccountOpen
media-ad-code>WWW
lead-action-code>saved
Expected OUTPUT
bd-cust-id|mars-opportunity-id|SQL_ID|Tag_name|Tag_Value
01234|2-BFGCMQ5|1|trigger|SalesLeadCreated
01234|2-BFGCMQ5|1|message-sent-at-ts|2015-01-27T00:00.08
01234|2-BFGCMQ5|1|bd-cust-id|01234
01234|2-BFGCMQ5|1|mars-opportunity-id|2-BFGCMQ5
01234|2-BFGCMQ5|1|mars-activity-id|2-BFGCMPZ
01234|2-BFGCMQ5|1|lead-type|AccountOpen
01234|2-BFGCMQ5|1|media-ad-code|WWW
01234|2-BFGCMQ5|1|lead-action-code|completed
054671|2-BFGC39C|2|trigger|SalesLeadCreated
054671|2-BFGC39C|2|message-sent-at-ts|2015-01-27T00:00.10
054671|2-BFGC39C|2|bd-cust-id|054671
054671|2-BFGC39C|2|mars-opportunity-id|2-BFGC39C
054671|2-BFGC39C|2|mars-activity-id|2-BFGC396
054671|2-BFGC39C|2|lead-type|AccountOpen
054671|2-BFGC39C|2|media-ad-code|WWW
054671|2-BFGC39C|2|lead-action-code|saved
awk -F ">" -v OFS="|" '
BEGIN { print "bd-cust-id|mars-opportunity-id|SQL_ID|Tag_name|Tag_Value" }
function output() {
    sqlid++
    custid = data["bd-cust-id"]
    oppid = data["mars-opportunity-id"]
    for (key in data)
        print custid, oppid, sqlid, key, data[key]
    delete data
}
$1 == "START_OF_REC" { if (NR > 1) output(); next }
{ data[$1] = $2 }
END { output() }
' INPUT_FILE
bd-cust-id|mars-opportunity-id|SQL_ID|Tag_name|Tag_Value
01234 |2-BFGCMQ5|1|bd-cust-id|01234
01234 |2-BFGCMQ5|1|trigger|SalesLeadCreated
01234 |2-BFGCMQ5|1|mars-activity-id|2-BFGCMPZ
01234 |2-BFGCMQ5|1|lead-action-code|completed
01234 |2-BFGCMQ5|1|lead-type|AccountOpen
01234 |2-BFGCMQ5|1|media-ad-code|WWW
01234 |2-BFGCMQ5|1|message-sent-at-ts|2015-01-27T00:00.08
01234 |2-BFGCMQ5|1|mars-opportunity-id|2-BFGCMQ5
054671 |2-BFGC39C|2|bd-cust-id|054671
054671 |2-BFGC39C|2|trigger|SalesLeadCreated
054671 |2-BFGC39C|2|mars-activity-id|2-BFGC396
054671 |2-BFGC39C|2|lead-action-code|saved
054671 |2-BFGC39C|2|lead-type|AccountOpen
054671 |2-BFGC39C|2|media-ad-code|WWW
054671 |2-BFGC39C|2|message-sent-at-ts|2015-01-27T00:00.10
054671 |2-BFGC39C|2|mars-opportunity-id|2-BFGC39C
The spaces are due to trailing whitespace in your input file.
I'm assuming that the SQL_ID is just a running count of the records.
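One more note: for (key in data) prints the tags in awk's internal array order rather than in input order. If the ordering from the expected output matters, a variant (just a sketch, not part of the original answer) can record the order in which tags appear:
awk -F ">" -v OFS="|" '
BEGIN { print "bd-cust-id|mars-opportunity-id|SQL_ID|Tag_name|Tag_Value" }
function output(   i) {
    sqlid++
    custid = data["bd-cust-id"]
    oppid = data["mars-opportunity-id"]
    for (i = 1; i <= n; i++)                  # print tags in the order they appeared
        print custid, oppid, sqlid, keys[i], data[keys[i]]
    delete data
    n = 0
}
$1 == "START_OF_REC" { if (NR > 1) output(); next }
{ data[$1] = $2; keys[++n] = $1 }
END { output() }
' INPUT_FILE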

request shell script to locate longest repeated segment of each line of input

I have a large file where each line consists of 24 small integers separated by whitespace. I would like to find, for each line, the longest segment that repeats, allowing the line to wrap around. For example, given the line
0 10 4 2 7 9 11 8 6 5 0 10 4 2 7 11 9 3 8 3 1 1 6 5
the sequence 6 5 0 10 4 2 7 is longest; it has length 7 and the two occurrences are separated by 10 positions (or 14).
Could someone show me how to cobble a script together to return, for each line, the length of the longest sequence and the interval between its two beginnings?
The way the file is constructed it will be impossible for any segment to be repeated more than once (i.e. more than two appearances), because each number from 0 to 11 is constrained to appear exactly twice.
Much appreciated. --Lloyd
Here is a rather obfuscated solution that works on a single line of input. Wrap the whole thing in a loop that reads the line from your input rather than setting it explicitly, and you should have a viable (albeit terribly slow and ugly) solution.
#!/bin/sh
input='0 10 4 2 7 9 11 8 6 5 0 10 4 2 7 11 9 3 8 3 1 1 6 5'

trap 'rm -f $TMP1 $TMP2' 0
TMP1=$(mktemp $(basename $0.XXXX))
TMP2=$(mktemp $(basename $0.XXXX))

input="$input $input" # handle wrap-around

seq 0 11 | while read start_value; do
    echo $input | tr \ \\n | grep -w -n $start_value | sed 's/:.*//' | {
        read i
        read j
        delta=$( expr $j - $i )
        echo $input | tr \ \\n | sed -n "$i,${j}p" > $TMP1
        echo $input | tr \ \\n | sed -n "$j,\$p" > $TMP2
        diff $TMP1 $TMP2 | { IFS=a read length junk
            echo $length $delta $start_value
        }
    }
done | sort -rn | sed 1q | { read length delta start;
    printf "%s " "The sequence"
    echo $input | tr \ \\n | awk '$0==k{t=1}t' k=$start | sed "${length}q"
    echo ' is the longest sequence.'
    /bin/echo -n The difference between starting positions is $delta '(or '
    expr 24 - $delta
    echo ')'
} | tr \\n ' '
echo
There are a lot of languages that would make this easier than awk (including gawk), but here's an all-awk answer.
Try putting this into an executable awk file:
#!/usr/bin/awk -f
BEGIN { DELIM=":" }

function reorder(start) {
    head = ""
    tail = ""
    for( i=1;i<=NF;i++ ) {
        if( i<start ) tail = sprintf( "%s%s%s", tail, $i, FS )
        else head = sprintf( "%s%s%s", head, $i, FS )
    }
    # last field is the starting index
    return( head tail start )
}

function longest(pair) {
    split( pair, a, DELIM )
    split( a[1], one, FS )
    split( a[2], two, FS )
    long = ""
    for( i=1;i<=NF;i++ ) {
        if( one[i] != two[i] ) break
        long = sprintf( "%s%s%s", long, one[i], FS )
    }
    return( i-1 DELIM two[NF+1]-one[NF+1] DELIM long )
}

{
    for( k=1;k<=NF;k++ ) {
        pairs[$k] = (pairs[$k]==""?"":pairs[$k]DELIM) reorder( k )
    }
    for( p in pairs ) {
        tmp = longest( pairs[p] )
        out = tmp>out ? tmp : out
    }
    print out
}
If I call this awko then running awko data yields data in the form:
# of matched fields:index separation:longest match
which for the input data is:
7:14:6 5 0 10 4 2 7
Notice that I haven't bothered to clean up the extra space at the end of the data that matches. With more input data, I'd have a better idea if this has bugs or not.
I wanted to see how fast I could do this:
#!/usr/bin/awk -f
BEGIN { OFS=":" }

function longest(first, second) {
    long = ""
    s = second
    flds = 0
    for( f=first;f<=NF;f++ ) {
        if( $f != $s ) break
        long = sprintf( "%s%s%s", long, $f, " " )
        if( s==NF ) s=0
        s++
        flds++
    }
    return( flds OFS second-first OFS long )
}

{
    for(k=1;k<=NF;k++) {
        val = pos[$k]
        if( val!="" ) {
            tmp = longest( val, k )
            delete pos[$k]    #### need an awk/gawk that can remove elems or "delete pos" l8r
        }
        else pos[$k] = k
        out = tmp>out ? tmp : out
    }
    print out
}
It's roughly 200 times faster than the first go-round. It only uses a single outer field loop and processes each matching number as soon as its second occurrence is found, using the originally parsed fields. Running the same data over and over (2400 lines' worth) gave me a system time of 0.33s instead of the 71.10s I got from the first script on the same data.
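For reference, saving this as awko2 (a placeholder name) and running it on the sample line from the question should give the same result as the first version:
$ ./awko2 data
7:14:6 5 0 10 4 2 7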
