file1 contains multiple alphabetic sequences:
AETYUIOOILAKSJ
EAYEURIOPOSIDK
RYXURIAJSKDMAO
URITORIEJAHSJD
YWQIAKSJDHFKCM
HAJSUDIDSJSIAJ
AJDHDPFDIXSIBJ
JAQIAUXCNCVUFO
while file2 contains indexes of the sequences which I want to pull out and transfer to another file. For example, 3T means I want the sequence with a T at position 3 from within file1.
In reality both files are very large with thousands of indexes and sequences.
file2:
3T
10K
14D
1J
Desired output:
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
Ideally the output should match the order of indexes in file2. In other words the first index "3T" matches sequence "AETYUIOOILAKSJ" and thus this is the first sequence in the new file.
Things I have tried:
grep -f file2 file1
grep -fov file2 file1 # possibly to filter for those non-matching entries
I have also used the command line tool sift but am still having difficulty.
Thanks
$ cat tst.awk
# tst.awk - print every sequence that satisfies at least one index.
# Usage:  awk -f tst.awk file2 file1
#   file2: one index per line - a 1-based position followed by one character (e.g. 3T)
#   file1: one sequence per line
# Output: each matching sequence once, in file1 order.
NR==FNR {
    lgth = length($0)
    if ( lgth < 2 ) {
        next    # ignore blank or malformed index lines instead of matching everything
    }
    pos = substr($0,1,lgth-1)
    # Key on position AND character so that indexes sharing a position
    # (e.g. 3T and 3A) are all kept instead of overwriting each other.
    wanted[pos,substr($0,lgth,1)]
    positions[pos]
    next
}
{
    # Only the distinct positions are visited; the character found there is
    # looked up directly rather than compared against every index.
    for (pos in positions) {
        if ( (pos,substr($0,pos,1)) in wanted ) {
            print
            next
        }
    }
}
$ awk -f tst.awk file2 file1
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
With awk + grep pipeline:
# Turn each index into an anchored regex (N-1 dots, then the letter) and let grep do the matching.
awk '{
  dots = sprintf("%*s", int($0) - 1, "")
  gsub(/ /, ".", dots)
  print "^" dots substr($0, length)
}' file2 | grep -f - file1
The output:
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
Here you go:
# file1 is read first and kept as the keys of b; each file2 index is then split
# once (outside the loop, since it does not depend on i) and compared with every sequence.
awk 'NR==FNR {b[$0]++;next} {a=match($0,"[A-Z]");n=substr($0,1,(a-1));s=substr($0,a);for (i in b) {t=substr(i,n,1);if (t==s) print i}}' file1 file2
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
A more readable version:
awk '
# Pass 1 (file1): remember every sequence as a key of b.
NR==FNR {
    b[$0]++;
    next
}
# Pass 2 (file2): split the index once per line, then test it against every sequence.
{
    a=match($0,"[A-Z]");
    n=substr($0,1,(a-1));
    s=substr($0,a);
    for (i in b) {
        t=substr(i,n,1);
        if (t==s)
            print i
    }
}
' file1 file2
With comments:
awk '
NR==FNR {                     # For the first file (file1)
    b[$0]++;                  # Store each file1 line as a key of array b
    next
}
{
    # The index does not depend on i, so split it once per file2 line
    # instead of once per stored sequence.
    a=match($0,"[A-Z]");      # For file2 find where the letters start
    n=substr($0,1,(a-1));     # Store the number part of file2 in n
    s=substr($0,a);           # Store the letters part of file2 in s
    for (i in b) {            # Loop through the elements in array b
        t=substr(i,n,1);      # From the file1 line take the character at position n
        if (t==s)             # Test if that character equals the letter to find, s
            print i           # If yes, print the line
    }
}
' file1 file2
# file2 is read first (NR==FNR) to build the lookup; file1 lines are then tested against it.
awk '(NR==FNR){a[$0]=substr($0,length);next}
# key+0 yields the leading number of the index; stop at the first index that matches.
{ for(key in a) if (a[key] == substr($0,key+0,1)) { print; break }
}' file2 file1
Here, the array a[key] is an associative array with the following key-value pairs:
key: value
3T T
10K K
... ...
When processing file2 with the line: (NR==FNR){a[$0]=substr($0,length);next}: we extract the value beforehand so we don't have to do it later on. The index is easily extracted with a math operation. Eg. "10K"+0=10 in Awk.
Processing file1 is done with the next line. Here we just check if the character matches for any of the entries in the associative array.
With GNU awk and grep:
# The indexes live in file2, so awk must read file2 and grep must search file1.
# Each index N<letter> becomes the ERE ^.{N-1}<letter>.
awk -v FPAT='[0-9]+|[A-Z]+' '{ print "^.{" ($1-1) "}" $2 }' file2 | grep -Ef - file1
Output:
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
file1 contains multiple alphabetic sequences:
AETYUIOOILAKSJ
EAYEURIOPOSIDK
RYXURIAJSKDMAO
URITORIEJAHSJD
YWQIAKSJDHFKCM
HAJSUDIDSJSIAJ
AJDHDPFDIXSIBJ
JAQIAUXCNCVUFO
while file2 contains indexes of the sequences which I want to pull out and transfer to another file. For example, 3T means I want the sequence with a T at position 3 from within file1.
In reality both files are very large with thousands of indexes and sequences.
file2:
3T
10K
14D
1J
Desired output:
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
Ideally the output should match the order of indexes in file2. In other words the first index "3T" matches sequence "AETYUIOOILAKSJ" and thus this is the first sequence in the new file.
Things I have tried:
grep -f file2 file1
grep -fov file2 file1 # possibly to filter for those non-matching entries
I have also used the command line tool sift but am still having difficulty.
Thanks
$ cat tst.awk
# tst.awk - print every sequence that satisfies at least one index.
# Usage:  awk -f tst.awk file2 file1
#   file2: one index per line - a 1-based position followed by one character (e.g. 3T)
#   file1: one sequence per line
# Output: each matching sequence once, in file1 order.
NR==FNR {
    lgth = length($0)
    if ( lgth < 2 ) {
        next    # ignore blank or malformed index lines instead of matching everything
    }
    pos = substr($0,1,lgth-1)
    # Key on position AND character so that indexes sharing a position
    # (e.g. 3T and 3A) are all kept instead of overwriting each other.
    wanted[pos,substr($0,lgth,1)]
    positions[pos]
    next
}
{
    # Only the distinct positions are visited; the character found there is
    # looked up directly rather than compared against every index.
    for (pos in positions) {
        if ( (pos,substr($0,pos,1)) in wanted ) {
            print
            next
        }
    }
}
$ awk -f tst.awk file2 file1
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
With awk + grep pipeline:
# Turn each index into an anchored regex (N-1 dots, then the letter) and let grep do the matching.
awk '{
  dots = sprintf("%*s", int($0) - 1, "")
  gsub(/ /, ".", dots)
  print "^" dots substr($0, length)
}' file2 | grep -f - file1
The output:
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
Here you go:
# file1 is read first and kept as the keys of b; each file2 index is then split
# once (outside the loop, since it does not depend on i) and compared with every sequence.
awk 'NR==FNR {b[$0]++;next} {a=match($0,"[A-Z]");n=substr($0,1,(a-1));s=substr($0,a);for (i in b) {t=substr(i,n,1);if (t==s) print i}}' file1 file2
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
A more readable version:
awk '
# Pass 1 (file1): remember every sequence as a key of b.
NR==FNR {
    b[$0]++;
    next
}
# Pass 2 (file2): split the index once per line, then test it against every sequence.
{
    a=match($0,"[A-Z]");
    n=substr($0,1,(a-1));
    s=substr($0,a);
    for (i in b) {
        t=substr(i,n,1);
        if (t==s)
            print i
    }
}
' file1 file2
With comments:
awk '
NR==FNR {                     # For the first file (file1)
    b[$0]++;                  # Store each file1 line as a key of array b
    next
}
{
    # The index does not depend on i, so split it once per file2 line
    # instead of once per stored sequence.
    a=match($0,"[A-Z]");      # For file2 find where the letters start
    n=substr($0,1,(a-1));     # Store the number part of file2 in n
    s=substr($0,a);           # Store the letters part of file2 in s
    for (i in b) {            # Loop through the elements in array b
        t=substr(i,n,1);      # From the file1 line take the character at position n
        if (t==s)             # Test if that character equals the letter to find, s
            print i           # If yes, print the line
    }
}
' file1 file2
# file2 is read first (NR==FNR) to build the lookup; file1 lines are then tested against it.
awk '(NR==FNR){a[$0]=substr($0,length);next}
# key+0 yields the leading number of the index; stop at the first index that matches.
{ for(key in a) if (a[key] == substr($0,key+0,1)) { print; break }
}' file2 file1
Here, the array a[key] is an associative array with the following key-value pairs:
key: value
3T T
10K K
... ...
When processing file2 with the line: (NR==FNR){a[$0]=substr($0,length);next}: we extract the value beforehand so we don't have to do it later on. The index is easily extracted with a math operation. Eg. "10K"+0=10 in Awk.
Processing file1 is done with the next line. Here we just check if the character matches for any of the entries in the associative array.
With GNU awk and grep:
# The indexes live in file2, so awk must read file2 and grep must search file1.
# Each index N<letter> becomes the ERE ^.{N-1}<letter>.
awk -v FPAT='[0-9]+|[A-Z]+' '{ print "^.{" ($1-1) "}" $2 }' file2 | grep -Ef - file1
Output:
AETYUIOOILAKSJ
RYXURIAJSKDMAO
URITORIEJAHSJD
JAQIAUXCNCVUFO
I can't find a solution.
So here is the problem.
Result should be 100 rows (File1) with contents from File2 repeating 25 times.
What I want is to join the contents even though the numbers of rows are not equal: keep cycling through the lines of File2 until every row of File1 has been paired.
File1:
test1#domain.com
test2#domain2.com
test3#domain3.com
test4#domain4.com
File2:
A1,B11
A2,B22
A3,B33
A4,B44
What I want is to combine the files to get the following expected result:
File3:
test1#domain.com,A1,B11
test2#domain2.com,A2,B22
test3#domain3.com,A3,B33
test4#domain4.com,A4,B44
Note here: After it finishes with the 4 rows from File2, start again from first line, then repeat.
test5#domain5.com,A1,B11
test6#domain6.com,A2,B22
test7#domain7.com,A3,B33
test8#domain8.com,A4,B44
The example in your question isn't clear but I THINK this is what you're trying to do:
$ awk -v OFS=',' 'NR==FNR{a[n++]=$0;next} {print $0, a[(FNR-1)%n]}' file2 file1
test1#domain.com,A1,B11
test2#domain2.com,A2,B22
test3#domain3.com,A3,B33
test4#domain4.com,A4,B44
test5#domain5.com,A1,B11
test6#domain6.com,A2,B22
The above was run against this input:
$ cat file1
test1#domain.com
test2#domain2.com
test3#domain3.com
test4#domain4.com
test5#domain5.com
test6#domain6.com
$
$ cat file2
A1,B11
A2,B22
A3,B33
A4,B44
Could you please try the following.
awk '
BEGIN{
    OFS=","
}
# First file (Input_file2): remember its lines in order.
FNR==NR{
    a[++count]=$0
    next
}
# Second file (Input_file1): pair each line with the next stored line,
# wrapping back to the first stored line after the last one.
{
    count_curr++
    count_curr=count_curr>count?1:count_curr
    # The Input_file1 line goes first so the result reads "address,A1,B11"
    # as in the expected output.
    print $0,a[count_curr]
}
' Input_file2 Input_file1
I have multiple files with different numbers of columns. I need to merge the first and second files with a left outer join in awk (keeping every row of the first file) and print all columns of both files, matched on the first column of each file.
I have tried the code below, which gets close to the desired output, but I can't print the empty "," placeholders where no matching number is found in the second file. join needs sorted input and takes more time than awk. My files are big, around 30 million records.
awk -F ',' '{
if (NR==FNR){ r[$1]=$0}          # first file: keep the whole line, keyed by column 1
else{ if($1 in r)                # second file: only keys that also occur in the first file
r[$1]=r[$1]gensub($1,"",1)}      # append the line with the first match of the key removed
# keys never seen in the second file keep their original line, so no empty fields are appended
}END{for(i in r){print r[i]}}' file1 file2
file1
number,column1,column2,..columnN
File2
number,column1,column2,..columnN
Output
number,file1.column1,file1.column2,..file1.columnN,file2.column1,file2.column2,..file2.columnN
file1
1,a,b,c
2,a,b,c
3,a,b,c
5,a,b,c
file2
1,x,y
2,x,y
5,x,y
6,x,y
7,x,y
desired output
1,a,b,c,x,y
2,a,b,c,x,y
3,a,b,c,,
5,a,b,c,x,y
$ cat tst.awk
BEGIN { FS=OFS="," }
# First file on the command line (file2): index everything after the key by the key.
NR==FNR {
    rest = gensub(/[^,]*,/,"",1)
    if ( FNR == 1 ) {
        # Same separators as a real tail, with every value blanked out.
        blank = gensub(/[^,]/,"","g",rest)
    }
    lookup[$1] = rest
    next
}
# Second file (file1): always print, padding with empty fields when the key is unknown.
{
    if ( $1 in lookup ) {
        print $0, lookup[$1]
    }
    else {
        print $0, blank
    }
}
$ awk -f tst.awk file2 file1
1,a,b,c,x,y
2,a,b,c,x,y
3,a,b,c,,
5,a,b,c,x,y
The above uses GNU awk for gensub(), with other awks it's just one more step to do [g]sub() on the appropriate variable after initially assigning it.
An interesting (to me at least!) alternative you might want to test for a performance difference is:
$ cat tst.awk
BEGIN { FS=OFS="," }
# First file on the command line: store each tail by record number and map key -> record number.
NR==FNR {
tail = gensub(/[^,]*,/,"",1)    # the record without its first field and separator
idx[$1] = NR
file2[NR] = tail
if ( FNR == 1 ) {
# file2[""] holds only the separators of the first tail; it is what a key
# missing from idx resolves to, because idx[$1] is then the empty string.
file2[""] = gensub(/[^,]/,"","g",tail)
}
next
}
# Remaining input: append the stored tail, or the separators-only entry when the key is unknown.
{ print $0, file2[idx[$1]] }
$ awk -f tst.awk file2 file1
1,a,b,c,x,y
2,a,b,c,x,y
3,a,b,c,,
5,a,b,c,x,y
but I don't really expect it to be any faster and it MAY even be slower.
you can try,
# Left outer join of file1 against file2 on the first column.
# Rows of file1 without a partner are padded with as many empty fields as
# file2 has value columns (taken from the first line of file2).
awk 'BEGIN{FS=OFS=","}
FNR==NR{
  d[$1]=substr($0,index($0,",")+1)
  if (FNR==1) { pad=d[$1]; gsub(/[^,]/,"",pad) }   # keep only the separators of the first tail
  next
}
{print $0, ($1 in d?d[$1]:pad)}' file2 file1
you get,
1,a,b,c,x,y
2,a,b,c,x,y
3,a,b,c,,
5,a,b,c,x,y
join to the rescue:
$ join -t $',' -a 1 -e '' -o 0,1.2,1.3,1.4,2.2,2.3 file1.txt file2.txt  # NOTE: join expects both inputs sorted on the join field (column 1)
Explanation:
-t $',': Field separator token.
-a 1: Do not discard records from file 1 if not present in file 2.
-e '': Missing records will be treated as an empty field.
-o: Output format.
file1.txt
1,a,b,c
2,a,b,c
3,a,b,c
5,a,b,c
file2.txt
1,x,y
2,x,y
5,x,y
6,x,y
7,x,y
Output
1,a,b,c,x,y
2,a,b,c,x,y
3,a,b,c,,
5,a,b,c,x,y