How to add duplicates lines to a file using Unix - shell
I want to add duplicate lines. I have tried, but I am not able to get the desired output.
I have used sed but end up with all lines duplicated (below code)
sed 'p' Data.txt > Output.txt
I have tried with awk but end up with all lines duplicated (below code)
# Duplicate only the rows that carry two timestamps: 8 comma-separated
# fields AND 3 '#'-separated segments (i.e. exactly two '#' characters).
# All other rows are passed through once. Reads stdin, writes stdout.
#
# Fixes vs. the original attempt:
#  - [ "$a == '8' && $b == '3'" ] tested one non-empty string (always true);
#    now two real numeric comparisons.
#  - Both branches reran awk over ALL of Data.txt per input line, which is
#    what duplicated every line; now only the current line is emitted.
#  - read -r and quoted expansions preserve the data byte-for-byte.
dup_lines() {
  local line commacount atcount
  while IFS= read -r line; do
    # tr+wc counts segments: a row with N separators yields N+1 lines.
    commacount=$(printf '%s\n' "$line" | tr ',' '\n' | wc -l)
    atcount=$(printf '%s\n' "$line" | tr '#' '\n' | wc -l)
    if [ "$commacount" -eq 8 ] && [ "$atcount" -eq 3 ]; then
      # Row has two timestamps: emit it twice.
      printf '%s\n%s\n' "$line" "$line"
    else
      printf '%s\n' "$line"
    fi
  done
}

# Process the data file when present (guard keeps the snippet sourceable).
[ -f Data.txt ] && dup_lines < Data.txt > tmp
Data.txt
2009-09-12T05:18:#00#+10:00,2303,Dump,CAM,1,1,JUNM
2009-09-12T05:24:00+10:00,2009-09-12T05:24:#00#+10:00,2303,Dump,RIV,1,1,JUNM
2009-09-12T05:25:00+10:00,2009-09-12T05:25:#00#+10:00,2303,Dump,WSN,1,1,JUNM
2009-09-12T05:27:00+10:00,2009-09-12T05:27:#00#+10:00,2303,Dump,HWL,1,1,JUNM
2009-09-12T05:29:00+10:00,2009-09-12T05:29:#00#+10:00,2303,Dump,BWD,1,1,JUNM
2009-09-12T05:31:00+10:00,2009-09-12T05:31:#00#+10:00,2303,Dump,ASH,1,1,JUNM
2009-09-12T05:33:00+10:00,,2303,Dump,ALM,1,1,JUNM
2009-09-12T05:00:#00#+10:00,2300,Up,ALM,1,1,JUNM
2009-09-12T05:01:00+10:00,2009-09-12T05:01:#00#+10:00,2300,Up,ASH,1,1,JUNM
2009-09-12T05:04:00+10:00,2009-09-12T05:04:#00#+10:00,2300,Up,BWD,1,1,JUNM
2009-09-12T05:06:00+10:00,2009-09-12T05:06:#00#+10:00,2300,Up,HWL,1,1,JUNM
2009-09-12T05:08:00+10:00,2009-09-12T05:08:#00#+10:00,2300,Up,WSN,1,1,JUNM
2009-09-12T05:10:00+10:00,2009-09-12T05:10:#00#+10:00,2300,Up,RIV,1,1,JUNM
2009-09-12T05:17:00+10:00,,2300,Up,CAM,1,1,JUNM
2009-09-12T09:25:#00#+10:00,2305,Dump,CAM,1,1,JUNM
2009-09-12T09:28:00+10:00,2009-09-12T09:28:#00#+10:00,2305,Dump,RIV,1,1,JUNM
2009-09-12T09:29:00+10:00,2009-09-12T09:29:#00#+10:00,2305,Dump,WSN,1,1,JUNM
2009-09-12T09:31:00+10:00,2009-09-12T09:31:#00#+10:00,2305,Dump,HWL,1,1,JUNM
2009-09-12T09:32:00+10:00,2009-09-12T09:32:#00#+10:00,2305,Dump,BWD,1,1,JUNM
2009-09-12T09:34:00+10:00,2009-09-12T09:34:#00#+10:00,2305,Dump,ASH,1,1,JUNM
2009-09-12T09:41:00+10:00,,2305,Dump,ALM,1,1,JUNM
,2306,Up,ALM,1,1,JUNM
,2306,Up,ASH,1,1,JUNM
,2306,Up,BWD,1,1,JUNM
,2306,Up,HWL,1,1,JUNM
,2306,Up,WSN,1,1,JUNM
,2306,Up,RIV,1,1,JUNM
,2306,Up,CAM,1,1,JUNM
2009-09-12T06:18:#00#+10:00,4505,Dump,CAR,1,1,JUNM
2009-09-12T06:21:00+10:00,2009-09-12T06:21:#00#+10:00,4505,Dump,SEA,1,1,JUNM
2009-09-12T06:24:00+10:00,2009-09-12T06:24:#00#+10:00,4505,Dump,KAN,1,1,JUNM
Output should be
2009-09-12T05:18:#00#+10:00,2303,Dump,CAM,1,1,JUNM
2009-09-12T05:24:00+10:00,2009-09-12T05:24:#00#+10:00,2303,Dump,RIV,1,1,JUNM
2009-09-12T05:24:00+10:00,2009-09-12T05:24:#00#+10:00,2303,Dump,RIV,1,1,JUNM
2009-09-12T05:25:00+10:00,2009-09-12T05:25:#00#+10:00,2303,Dump,WSN,1,1,JUNM
2009-09-12T05:25:00+10:00,2009-09-12T05:25:#00#+10:00,2303,Dump,WSN,1,1,JUNM
2009-09-12T05:27:00+10:00,2009-09-12T05:27:#00#+10:00,2303,Dump,HWL,1,1,JUNM
2009-09-12T05:27:00+10:00,2009-09-12T05:27:#00#+10:00,2303,Dump,HWL,1,1,JUNM
2009-09-12T05:29:00+10:00,2009-09-12T05:29:#00#+10:00,2303,Dump,BWD,1,1,JUNM
2009-09-12T05:29:00+10:00,2009-09-12T05:29:#00#+10:00,2303,Dump,BWD,1,1,JUNM
2009-09-12T05:31:00+10:00,2009-09-12T05:31:#00#+10:00,2303,Dump,ASH,1,1,JUNM
2009-09-12T05:31:00+10:00,2009-09-12T05:31:#00#+10:00,2303,Dump,ASH,1,1,JUNM
2009-09-12T05:33:00+10:00,,2303,Dump,ALM,1,1,JUNM
2009-09-12T05:00:#00#+10:00,2300,Up,ALM,1,1,JUNM
2009-09-12T05:01:00+10:00,2009-09-12T05:01:#00#+10:00,2300,Up,ASH,1,1,JUNM
2009-09-12T05:01:00+10:00,2009-09-12T05:01:#00#+10:00,2300,Up,ASH,1,1,JUNM
2009-09-12T05:04:00+10:00,2009-09-12T05:04:#00#+10:00,2300,Up,BWD,1,1,JUNM
2009-09-12T05:04:00+10:00,2009-09-12T05:04:#00#+10:00,2300,Up,BWD,1,1,JUNM
2009-09-12T05:06:00+10:00,2009-09-12T05:06:#00#+10:00,2300,Up,HWL,1,1,JUNM
2009-09-12T05:06:00+10:00,2009-09-12T05:06:#00#+10:00,2300,Up,HWL,1,1,JUNM
2009-09-12T05:08:00+10:00,2009-09-12T05:08:#00#+10:00,2300,Up,WSN,1,1,JUNM
2009-09-12T05:08:00+10:00,2009-09-12T05:08:#00#+10:00,2300,Up,WSN,1,1,JUNM
2009-09-12T05:10:00+10:00,2009-09-12T05:10:#00#+10:00,2300,Up,RIV,1,1,JUNM
2009-09-12T05:10:00+10:00,2009-09-12T05:10:#00#+10:00,2300,Up,RIV,1,1,JUNM
2009-09-12T05:17:00+10:00,,2300,Up,CAM,1,1,JUNM
2009-09-12T09:25:#00#+10:00,2305,Dump,CAM,1,1,JUNM
2009-09-12T09:28:00+10:00,2009-09-12T09:28:#00#+10:00,2305,Dump,RIV,1,1,JUNM
2009-09-12T09:28:00+10:00,2009-09-12T09:28:#00#+10:00,2305,Dump,RIV,1,1,JUNM
2009-09-12T09:29:00+10:00,2009-09-12T09:29:#00#+10:00,2305,Dump,WSN,1,1,JUNM
2009-09-12T09:29:00+10:00,2009-09-12T09:29:#00#+10:00,2305,Dump,WSN,1,1,JUNM
2009-09-12T09:31:00+10:00,2009-09-12T09:31:#00#+10:00,2305,Dump,HWL,1,1,JUNM
2009-09-12T09:31:00+10:00,2009-09-12T09:31:#00#+10:00,2305,Dump,HWL,1,1,JUNM
2009-09-12T09:32:00+10:00,2009-09-12T09:32:#00#+10:00,2305,Dump,BWD,1,1,JUNM
2009-09-12T09:32:00+10:00,2009-09-12T09:32:#00#+10:00,2305,Dump,BWD,1,1,JUNM
2009-09-12T09:34:00+10:00,2009-09-12T09:34:#00#+10:00,2305,Dump,ASH,1,1,JUNM
2009-09-12T09:34:00+10:00,2009-09-12T09:34:#00#+10:00,2305,Dump,ASH,1,1,JUNM
2009-09-12T09:41:00+10:00,,2305,Dump,ALM,1,1,JUNM
,2306,Up,ALM,1,1,JUNM
,2306,Up,ASH,1,1,JUNM
,2306,Up,BWD,1,1,JUNM
,2306,Up,HWL,1,1,JUNM
,2306,Up,WSN,1,1,JUNM
,2306,Up,RIV,1,1,JUNM
,2306,Up,CAM,1,1,JUNM
2009-09-12T06:18:#00#+10:00,4505,Dump,CAR,1,1,JUNM
2009-09-12T06:21:00+10:00,2009-09-12T06:21:#00#+10:00,4505,Dump,SEA,1,1,JUNM
2009-09-12T06:21:00+10:00,2009-09-12T06:21:#00#+10:00,4505,Dump,SEA,1,1,JUNM
2009-09-12T06:24:00+10:00,2009-09-12T06:24:#00#+10:00,4505,Dump,KAN,1,1,JUNM
2009-09-12T06:24:00+10:00,2009-09-12T06:24:#00#+10:00,4505,Dump,KAN,1,1,JUNM
Is there any way that I can get the above output?
I appreciate any help/suggestion.
Thanks
Sri
Do you want duplicate lines which have 8 non-empty columns?
Try this:
awk -F',+' 'NF==8;1' file.txt
Here's the sed version. Note that matching on the first timestamp alone is not enough: rows such as 2009-09-12T05:33:00+10:00,,2303,... have a valid first timestamp but an empty second field and must NOT be duplicated. So duplicate a line only when its second comma-separated field is itself a timestamp (starts with a 4-digit year and a dash):
sed '/^[^,]\{1,\},[0-9]\{4\}-/p' file.txt
Related
How to extract phone number and Pin from each text line
Sample Text from the log file 2021/08/29 10:25:37 20210202GL1 Message Params [userid:user1] [timestamp:20210829] [from:TEST] [to:0214736848] [text:You requested for Pin reset. Your Customer ID: 0214736848 and PIN: 4581] 2021/08/27 00:03:18 20210202GL2 Message Params [userid:user1] [timestamp:20210827] [from:TEST] [to:0214736457] [text:You requested for Pin reset. Your Customer ID: 0214736457 and PIN: 6193] 2021/08/27 10:25:16 Thank you for joining our service; Your ID is 0214736849 and PIN is 5949 Other wording and formatting can change but ID and PIN don't change Expected out put for each line 0214736848#4581 0214736457#6193 0214736849#5949 Below is what I have tried out using bash though am currently able to extract only the numeric values while read p; do NUM='' counter=1; text=$(echo "$p" | grep -o -E '[0-9]+') for line in $text do if [ "$counter" -eq 1 ] #if is equal to 1 then NUM+="$line" #concatenate string else NUM+="#$line" #concatenate string fi let counter++ #Increment counter done printf "$NUM\n" done < logfile.log Current output though not the expected. 2021#08#29#00#03#18#20210202#2#1#20210826#0214736457#0214736457#6193 2021#08#27#10#25#37#20210202#1#1#20210825#0214736848#0214736848#4581 2021#08#27#10#25#16#0214736849#5949
Another variation using gawk and 2 capture groups, matching 1 or more digits per group: awk ' match($0, /ID: ([0-9]+) and PIN: ([0-9]+)/, m) { print m[1]"#"m[2] } ' file Output 0214736848#4581 0214736457#6193 For the updated question, you could either match : or is if you want a more precise match, and the capture group values will be 2 and 4. awk ' match($0, /ID(:| is) ([0-9]+) and PIN(:| is) ([0-9]+)/, m) { print m[2]"#"m[4] } ' file Output 0214736848#4581 0214736457#6193 0214736849#5949
Using sed capture groups you can do: sed 's/.* Your Customer ID: \([0-9]*\) and PIN: \([0-9]*\).*/\1#\2/g' file.txt
With your shown samples please try following awk code, you could simple do it with using different field separators. Simple explanation would be, making Customer ID: OR and PIN: OR ]$ as field separators and then keeping them in mind printing only 2nd and 3rd fields along with # as per required output by OP. awk -v FS='Customer ID: | and PIN: |]$' '{print $2"#"$3}' Input_file
With bash and a regex (note the && — without it, echo would run even for non-matching lines and re-print the stale BASH_REMATCH values left over from the previous match): while IFS='] ' read -r line; do [[ "$line" =~ ID:\ ([^\ ]+).*PIN:\ ([^\ ]+)] ]] && echo "${BASH_REMATCH[1]}#${BASH_REMATCH[2]}" done <file Output: 0214736848#4581 0214736457#6193
Given the updated input in your question then using any sed in any shell on every Unix box: $ sed 's/.* ID[: ][^0-9]*\([0-9]*\).* PIN[: ][^0-9]*\([0-9]*\).*/\1#\2/' file 0214736848#4581 0214736457#6193 0214736849#5949 Original answer: Using any awk in any shell on every Unix box: $ awk -v OFS='#' '{print $18, $21+0}' file 0214736848#4581 0214736457#6193
Alternating output in bash for loop from two grep
I'm trying to search through files and extract two pieces of relevant information every time they appear in the file. The code I currently have: #!/bin/bash echo "Utilized reads from ustacks output" > reads.txt str1="utilized reads:" str2="Parsing" for file in /home/desaixmg/novogene/stacks/sample01/conda_ustacks.o*; do reads=$(grep $str1 $file | cut -d ':' -f 3 samples=$(grep $str2 $file | cut -d '/' -f 8 echo $samples $reads >> reads.txt done It is doing each line for the file (the files have varying numbers of instances of these phrases) and gives me the output per row for each file: PopA_15.fq 1081264 PopA_16.fq PopA_17.fq 1008416 554791 PopA_18.fq PopA_20.fq PopA_21.fq 604610 531227 595129 ... I want it to match each instance (i.e. 1st instance of both greps next two each other): PopA_15.fq 1081264 PopA_16.fq 1008416 PopA_17.fq 554791 PopA_18.fq 604610 PopA_20.fq 531227 PopA_21.fq 595129 ... How do I do this? Thank you
Considering that your Input_file is same as sample shown and number of columns are even on each line with 1 PopA value and other will be with digit values. Following awk may help you in same. awk '{for(i=1;i<=(NF/2);i++){print $i,$((NF/2)+i)}}' Input_file Output will be as follows. PopA_15.fq 1081264 PopA_16.fq 1008416 PopA_17.fq 554791 PopA_18.fq 604610 PopA_20.fq 531227 PopA_21.fq 595129 In case you want to pass output of a command to awk command then you could do like your command | awk command... no need to add Input_file to above awk command.
This is what ended up working for me...any tips for more efficient code are definitely welcome #!/bin/bash echo "Utilized reads from ustacks output" > reads.txt str1="utilized reads:" str2="Parsing" for file in /home/desaixmg/novogene/stacks/sample01/conda_ustacks.o*; do reads=$(grep $str1 $file | cut -d ':' -f 3) samples=$(grep $str2 $file | cut -d '/' -f 8) paste <(echo "$samples" | column -t) <(echo "$reads" | column -t) >> reads.txt done This provides the desired output described above.
UNIX : Deleting all the lines containing string & number
I have a TXT file with about 1 million lines. #Test.txt zs272 zs273 zs277 zs278 zs282 zs285 zs288 zs289 zs298 zs300 zs7 zsa zsag zsani179yukkie zsani182zaide zsaqgiw zsb86581 zsbguepqtkcn zscazn zscfhlsv zscgxadrwijl zsclions111yuen zscwqtk zscz zsder zsdfdgdgg I want to delete the lines which contain numbers, keeping only the all-letter strings. I tried, grep -v '^[1-9]' Test.txt > 1_Test.txt Couldn't get the desired result. Expected output: #1_Test.txt zsa zsag zsbguepqtkcn zscazn zscfhlsv zscgxadrwijl zscwqtk zscz zsder zsdfdgdgg
sed '/[0-9]/d' file If you want to edit your file "in place" use sed's option -i. awk '!/[0-9]/' file With bash: while read -r line; do [[ ! $line =~ [0-9] ]] && printf "%s\n" "$line"; done < file
Just remove the start of the line anchor ^. ^[1-9] regex only matches the numbers 1-9 which exists at the start. grep -v '[1-9]' Test.txt > 1_Test.txt to work for all digits including 0. grep -v '[0-9]' Test.txt > 1_Test.txt
A solution in AWK! awk '!/[0-9]+/{print}' file
extract string from another using awk
I have this variable which contain a list of string separted by space val=00:21:5D:16:F3 00:21:5D:16:F4 00:21:5D:16:F5 I want to extract each string separated bu space " " and then assign it to val I use this shell code but it doesn't work while [ "$((i++))" != "10" ]; do val$i=`echo $val | awk '{print $i}'` echo "val$i=$val$i" done the desired result is : val1="00:21:5D:16:F3" val2="00:21:5D:16:F4" val3="00:21:5D:16:F5" val4="" val5="" val6="" val7="" val8="" val9="" val10="" any help is appreciated even if the treatment is done with another linux utility like cut , sed , grep.
this awk script should be what are you looking for awk -F[' '=] 'BEGIN{t=1} { for (i=2;i<=11;i++) {print "val" t "=\"" $i "\""; t+=1}}' test there is output system1:/depot/scripts/sh # awk -F[' '=] 'BEGIN{t=1} { for (i=2;i<=11;i++) {print "val" t "=\"" $i "\""; t+=1}}' test val1="00:21:5D:16:F3" val2="00:21:5D:16:F4" val3="00:21:5D:16:F5" val4="" val5="" val6="" val7="" val8="" val9="" val10="" system:/depot/scripts/sh # test file contains: system:/depot/scripts/sh # cat test val=00:21:5D:16:F3 00:21:5D:16:F4 00:21:5D:16:F5 system:/depot/scripts/sh #
thank you for your help; I want to share the solution I ended up with. Note that val$i=... is not a valid shell assignment (bash would try to run it as a command), so use declare to build the variable name dynamically, and read it back with indirect expansion ${!var}: while [ "$((i++))" != "10" ]; do declare "val$i=$(echo $val | awk -v n="$i" '{print $n}')"; var="val$i"; echo "$var=${!var}"; done
I Know is not what you really asked, but what about using array to solve this? like: val=(00:21:5D:16:F3 00:21:5D:16:F4 00:21:5D:16:F5) $ echo ${val[0]} 00:21:5D:16:F3 $ echo ${val[1]} 00:21:5D:16:F4 $ echo ${val[2]} 00:21:5D:16:F5 $ echo ${val[3]}
Convert data from a simple JSON format to a DSV format
I have a file in Unix, with data sample like the following: {"ID":"123", "Region":"Asia", "Location":"India"} {"ID":"234", "Region":"APAC", "Location":"Australia"} {"ID":"345", "Region":"Americas", "Location":"Mexio"} {"ID":"456", "Region":"Americas", "Location":"Canada"} {"ID":"567", "Region":"APAC", "Location":"Japan"} The desired output is ID|Region|Location 123|Asia|India 234|APAC|Australia 345|Americas|Mexico 456|Americas|Canada 567|APAC|Japan I tried with a few sed commands. I could remove the following: '{', '}', ' " ', ':' There are 2 issues with the output file All rows from input appear in single line in the output. Adding the pipe ('|') as delimiter. Any pointers are highly appreciated.
I recommend the tool jq (http://stedolan.github.io/jq/); jq is a lightweight and flexible command-line JSON processor. jq -r '"\(.ID)|\(.Region)|\(.Location)"' < infile 123|Asia|India 234|APAC|Australia 345|Americas|Mexio 456|Americas|Canada 567|APAC|Japan Explanation -r is --raw-output
Through awk, awk -F'"' -v OFS="|" 'BEGIN{print "ID|Region|Location"}{print $4,$8,$12}' file Example: $ cat file {"ID":"123", "Region":"Asia", "Location":"India"} {"ID":"234", "Region":"APAC", "Location":"Australia"} {"ID":"345", "Region":"Americas", "Location":"Mexio"} {"ID":"456", "Region":"Americas", "Location":"Canada"} {"ID":"567", "Region":"APAC", "Location":"Japan"} $ awk -F'"' -v OFS="|" 'BEGIN{print "ID|Region|Location"}{print $4,$8,$12}' file ID|Region|Location 123|Asia|India 234|APAC|Australia 345|Americas|Mexio 456|Americas|Canada 567|APAC|Japan EXplanation: -F'"' Sets " as Field Separator value. OFS="|" Sets | as Output Field Separator value. Atfirst, awk would execute the function inside the BEGIN block. It helps to print the header section.
This sed one-liner does what you want. It's capturing the field values using parenthesized expressions, and then putting them into the output using \1, \2, and \3. s/^{"ID":"\([^"]*\)", "Region":"\([^"]*\)", "Location":"\([^"]*\)"}$/\1|\2|\3/ Invoke it like: $ sed -f one-liner.sed input.txt Or you can invoke it within a Bash script, producing the header: echo 'ID|Region|Location' sed -e 's/^{"ID":"\([^"]*\)", "Region":"\([^"]*\)", "Location":"\([^"]*\)"}$/\1|\2|\3/' $input
It is a JSON file so it is best to use a JSON parser. Here is a perl implementation of it. #!/usr/bin/perl use strict; use warnings; use JSON; open my $fh, '<', 'path/to/your/file'; #keys of your structure my #key = qw(ID Region Location); print join ("|", #key), "\n"; #iterate over your file, decode it and print in order of your key structure while (my $json = <$fh>) { my $text = decode_json($json); print join ("|", map { $$text{$_} } #key ),"\n"; } Output: ID|Region|Location 123|Asia|India 234|APAC|Australia 345|Americas|Mexio 456|Americas|Canada 567|APAC|Japan
Using sed as follows Command line echo "my_string" | sed -e 's#[,:"{}]##g' -e 's#ID##g' -e "s#Region##g" -e 's#Location##g' \ -e '1 s#^.*$#ID Region Location\n&#' -e 's# #|#g' or sed -e 's#[,:"{}]##g' -e 's#ID##g' -e "s#Region##g" -e 's#Location##g' \ -e '1 s#^.*$#ID Region Location\n&#' -e 's# #|#g' my_file I tried this in a terminal as follows: echo '{"ID":"123", "Region":"Asia", "Location":"India"} {"ID":"234", "Region":"APAC", "Location":"Australia"} {"ID":"345", "Region":"Americas", "Location":"Mexio"} {"ID":"456", "Region":"Americas", "Location":"Canada"} {"ID":"567", "Region":"APAC", "Location":"Japan"}' | sed -e 's#[,:"{}]##g' -e 's#ID##g' -e "s#Region##g" -e 's#Location##g' \ -e '1 s#^.*$#ID Region Location\n&#' -e 's# #|#g' Output ID|Region|Location 123|Asia|India 234|APAC|Australia 345|Americas|Mexio 456|Americas|Canada 567|APAC|Japan
Many thanks for your response and the pointers/ solutions did help a lot. For some mysterious reasons, I couldn't get any sed commands work. So, I devised my own solution. Although it's not elegant, it's still worked. Here is the script I prepared which resolved the issue. #!/bin/bash # ource file path. infile=/home/exfile.txt # remove if these temp file exist already. rm ./efile.txt ./xfile.txt ./yfile.txt ./zfile.txt # removing the curly braces from input file. cat exfile.txt | cut -d "{" -f2 | cut -d "}" -f1 >> ./efile.txt # setting input file name to different value. infile=./efile.txt # remove double quotes from the file. while IFS= read -r line do echo $line | sed 's/\"//g' >> ./xfile.txt done < "$infile" # creating another temp file. infile2=./xfile.txt # remove colon from file. while IFS= read -r line do echo $line | sed 's/\:/,/g' >> ./yfile.txt done < "$infile2" # set input file path to new temp file. infile3=yfile.txt # initialize variables to hold header column values. t1=0 t3=0 t5=0 # read each of the line to extract header row. Exit loop after reading 1st row. once=1 while IFS=',' read -r f1 f2 f3 f4 f5 f6 do "$f1 $f2 $f3 $f4 $f5 $f6" t1=$f1 t3=$f3 t5=$f5 if [ "$once" -eq 1 ]; then break fi done < "$infile3" # Read each of the line from input file. Write only the value to another output file. while IFS=',' read -r f1 f2 f3 f4 f5 f6 do echo "$f2|$f4|$f6" >> ./zfile.txt done < "$infile3" # insert the header column row into the file generated in the step above. frstline="$t1|$t3|$t5" sed -i '1i ID|Region|Location' ./zfile.txt