I want to filter out sequences that contain 8 or more identical consecutive nucleotides, like "GGGGGGGG", "CCCCCCCC", etc., from my FASTQ files.
How should I do that?
The quick and incorrect way, which might be close enough: grep -E -B1 -A2 'A{8}|C{8}|G{8}|T{8}' yourfile.fastq.
This will miss blocks where the 8-mer is split across two lines (e.g. the first line ends with AAAA and the second starts with AAAA). It also assumes the output has blocks of 4 lines each.
The proper way: write a little program (in Python, or a language of your choice) which buffers one FASTQ block (e.g. 4 lines) and checks that the concatenation of the previous (buffered) block's sequence and the current block's sequence do not have an 8-mer as above. If that's the case, then output the buffered block.
I ended up using the R code below, which solved my problem.
library(ShortRead)

# Drop every read that contains a run of 8 or more identical nucleotides
# (e.g. "GGGGGGGG") and write the remaining reads to a new FASTQ file.
fq <- FastqFile("/Users/path/to/file")
reads_fq <- readFastq(fq)

# Use the same run length for all four bases. The previous alternation
# required 8 G's but 9 T's, A's or C's, so 8-mers of T, A and C were kept.
homopolymer_pattern <- "A{8}|C{8}|G{8}|T{8}"
has_homopolymer <- grepl(homopolymer_pattern, as.character(sread(reads_fq)))

# Logical subsetting keeps only the reads without a homopolymer run.
trimmed_fq <- reads_fq[!has_homopolymer]
writeFastq(trimmed_fq, "new_name_for_fq.fastq", compress = FALSE)
You can use the Python package biotite for it (https://www.biotite-python.org).
Let's say you have the following FASTQ file:
@Read:01
CCCAAGGGCCCCCCCCCACTGCGATCACCTGGTTGCTGCCGGGAAAGGAGACCCAGGAGGTGAAACGGACTGGTGAATTG
CGGGGGTAGATATGGCGGGTGACACAAAAACATATAATCGGGCC
+
.+.+:'-FEAC-4'4CA-3-5#/4+?*G#?,<)<E&5(*82C9FH4G315F*DF8-4%F"9?H5535F7%?7#+6!FDC&
+4=4+,#2A)8!1B#,HA18)1*D1A-.HGAED%?-G10'6>:2
@Read:02
AACACTACTTCGCTGTCGCCAAAGGTTGGTGTAGGTCGGACTTCGAATTATCGATACTAGTTAGTAGTACGTCGCGTGGC
GTCAGCTCGTATGCTCTCAGAACAGGGAGAACTAGCACCGTAAGTAACCTAGCTCCCAAC
+
6%9,#'4A0&%.19,1E)E?!9/$.#?(!H2?+E"")?6:=F&FE91-*&',,;;$&?#2A"F.$1)%'"CB?5$<.F/$
7055E>#+/650B6H<8+A%$!A=0>?'#",8:#5%18&+3>'8:28+:5F0);E9<=,+
This is a script, that should do the work:
import biotite.sequence.io.fastq as fastq
import biotite.sequence as seq

# Query patterns: one run of eight identical bases per nucleotide
# ('AAAAAAAA', 'CCCCCCCC', 'GGGGGGGG', 'TTTTTTTT')
homopolymers = [seq.NucleotideSequence(base * 8) for base in "ACGT"]

reads = fastq.FastqFile("Sanger")
reads.read("example.fastq")

# Visit every entry of the file by its header
for header in reads:
    read_seq = reads.get_sequence(header)
    # Test the read against each of the query patterns in turn
    for consecutive_nuc in homopolymers:
        # Indices of all places where the pattern was found in the read
        matches = seq.find_subsequence(read_seq, consecutive_nuc)
        if len(matches) == 0:
            continue
        # Report the first match only
        print(
            f"Found '{consecutive_nuc}' "
            f"in sequence '{header}' at position {matches[0]}"
        )
This is the output:
Found 'CCCCCCCC' in sequence 'Read:01' at position 8
I'm trying to write a CMake script that does the following:
Using the execute_process instruction, it reads the output of a command and stores it in a variable named 'STRING_VARIABLE'. The command returns a string that contains both letters and digits, something like this: RESULT-v1.2.8-...
I have read this value properly and displayed it on the terminal to confirm this.
Now what I want to do is store the first three digits of this output and store them in 3 other variables: 'FIRST_DIGIT', 'SECOND_DIGIT' and 'THIRD_DIGIT'.
My logic was this:
Using a counter count each time a digit is encountered in a variable name. Each time a digit is encountered store the digit in one of the three variables then increment the counter. The counter counts therefore from 0 to 2 and for each of these 3 values does a store.
Here is the script I wrote:
# Attempt to copy the first three digits found in STRING_VARIABLE into
# FIRST_DIGIT, SECOND_DIGIT and THIRD_DIGIT, using COUNTER to track how many
# digits have been stored so far.
SET(COUNTER 0)
# NOTE(review): foreach() walks the items of a list, so LETTER is presumably
# the whole string here rather than a single character -- confirm.
foreach(LETTER ${STRING_VARIABLE})
# NOTE(review): EQUAL is a numeric comparison and the right-hand side is one
# quoted piece of text, so this test looks like it can never be true -- confirm.
if(LETTER EQUAL '0,1,2,3,4,5,6,7,8,9')
if( COUNTER EQUAL 0 ) # if first digit is encountered
# NOTE(review): list(GET) takes an index as its second argument; LETTER holds
# an element value, not an index -- confirm.
list(GET STRING_VARIABLE LETTER FIRST_DIGIT) # store it in FIRST_DIGIT
SET(COUNTER 1)
elseif( COUNTER EQUAL 1 ) # if second digit is encountered
list(GET STRING_VARIABLE LETTER SECOND_DIGIT) # store it in SECOND_DIGIT
SET(COUNTER 2)
else( COUNTER EQUAL 2 ) # if third digit is encountered
list(GET STRING_VARIABLE LETTER THIRD_DIGIT) # store it in THIRD_DIGIT
endif()
endif()
endforeach()
# To check the variables
#message("*****${STRING_VARIABLE}") # OK!
message("*****${FIRST_DIGIT}") # NOT OK :(
As I'm a total beginner in CMake I suppose my problem is at either of the two(or both):
- When looping through the 'STRING_VARIABLE' I used foreach(LETTER) and since my string also contains digits the program may not see them. If that is the mistake with what else should I replace LETTER in order to get each character of the string?
- In the first if where I check if the 'LETTER' is a digit. I think that is the correct syntax altough I'm not sure. Basically what I'm doing there is trying to check if the letter at each index is a digit.
The 'STRING_VARIABLE' prints ok as I said.
However when I try printing the 'FIRST_DIGIT' or any other of the 3(second and third) I get an empty string as a result.
Please help me understand what is wrong in my logic and what I'm doing wrong.
Please help me understand what I'm doing wrong. Thank you.
In case the format is known, you can use string(REGEX REPLACE ...).
Function:
# get_versions(<versionstring> <libname> <major> <minor> <patch>)
#
# Splits a version string of the form "<name>-v<major>.<minor>.<patch>[-suffix]"
# (the "v" may be upper- or lower-case) into its parts.
#
#   versionstring               - the text to parse, e.g. "MyLib-V11.222.034-remark"
#   libname, major, minor, patch - NAMES of the variables that receive the
#                                  results in the caller's scope
#
# The number fields are returned as text, so leading zeros are kept ("034").
# If a pattern does not match, string(REGEX REPLACE) leaves its input
# unchanged, so the corresponding result is the whole input string.
function(get_versions versionstring libname major minor patch)
  # The input is quoted so that an empty value or one containing ';' is still
  # passed to string() as a single argument.

  # Everything in front of "-v"/"-V" is the library name.
  string(REGEX REPLACE "([A-Za-z0-9_]*)-[vV].*" "\\1" name_part "${versionstring}")
  # Write to the variable the caller named (${libname}), not to a variable
  # that is literally called "libname"; the same applies to the fields below.
  set(${libname} "${name_part}" PARENT_SCOPE)

  # First number after the "v".
  string(REGEX REPLACE "^([A-Za-z0-9_]*-[vV])([0-9]*)([.][0-9]*[.][0-9]*-?.*)$" "\\2" number_part "${versionstring}")
  set(${major} "${number_part}" PARENT_SCOPE)

  # Number between the first and the second dot.
  string(REGEX REPLACE "^([A-Za-z0-9_]*-[vV][0-9]*[.])([0-9]*)([.][0-9]*-?.*)$" "\\2" number_part "${versionstring}")
  set(${minor} "${number_part}" PARENT_SCOPE)

  # Number after the second dot, up to an optional "-suffix".
  string(REGEX REPLACE "^([A-Za-z0-9_]*-[vV][0-9]*[.][0-9]*[.])([0-9]*)(-?.*)$" "\\2" number_part "${versionstring}")
  set(${patch} "${number_part}" PARENT_SCOPE)
endfunction()
Usage:
get_versions("MyLib-V11.222.034-remark" libname major minor patch)
status_ref(libname)
status_ref(major)
status_ref(minor)
status_ref(patch)
Result:
STATUS: libname = "MyLib"
STATUS: major = "11"
STATUS: minor = "222"
STATUS: patch = "034"
I have MATLAB set to record three webcams at the same time. I want to capture and save each feed to a file and automatically increment the file name, so that experiment_0001.avi is followed by experiment_0002.avi, etc.
My code looks like this at the moment
set(vid1,'LoggingMode','disk');
set(vid2,'LoggingMode','disk');
avi1 = VideoWriter('X:\ABC\Data Collection\Presentations\Correct\ExperimentA_002.AVI');
avi2 = VideoWriter('X:\ABC\Data Collection\Presentations\Correct\ExperimentB_002.AVI');
set(vid1,'DiskLogger',avi1);
set(vid2,'DiskLogger',avi2);
and I am incrementing the 002 each time.
Any thoughts on how to implement this efficiently?
Thanks.
dont forget matlab has some roots to C programming language. That means things like sprintf will work
So, since you are printing an integer zero-padded to 3 digits, you would need something like sprintf('%03d',n). The % means there is a value to print that isn't literal text, 0 means zero-pad on the left, 3 means pad to 3 digits, and d means the value is an integer.
just use sprintf in place of a string. the s means String print formatted. so it will output a string. here is an idea of what you might do
% Log two video inputs to disk, giving every capture a zero-padded,
% automatically incremented file name (ExperimentA_001.AVI,
% ExperimentA_002.AVI, ... and likewise for ExperimentB).
set(vid1,'LoggingMode','disk');
set(vid2,'LoggingMode','disk');

% Keep the folder out of the sprintf format string: sprintf treats a
% backslash as the start of an escape sequence, which would mangle the path.
outDir = 'X:\ABC\Data Collection\Presentations\Correct';

% Step by 1 so that no file number is skipped.
for n = 1:max_num_captures
    % %03d: integer, zero-padded on the left to 3 digits.
    % Both names carry the format spec (ExperimentB used a fixed "002").
    avi1 = VideoWriter(fullfile(outDir, sprintf('ExperimentA_%03d.AVI', n)));
    avi2 = VideoWriter(fullfile(outDir, sprintf('ExperimentB_%03d.AVI', n)));
    set(vid1,'DiskLogger',avi1);
    set(vid2,'DiskLogger',avi2);
    % ... run capture number n here before moving on to the next file ...
end
Locked. This question and its answers are locked because the question is off-topic but has historical significance. It is not currently accepting new answers or interactions.
Introduction
A valid Sudoku grid is filled with numbers 1 to 9, with no number occurring more than once in each sub-block of 9, row or column. Read this article for further details if you're unfamiliar with this popular puzzle.
Challenge
The challenge is to write the shortest program that validates a Sudoku grid that might not be full.
Input will be a string of 9 lines of 9 characters each, representing the grid. An empty cell will be represented by a period (.). Your output should be Valid if the grid is valid, otherwise output Invalid.
Example
Input
123...789
...456...
456...123
789...456
...123...
564...897
...231...
897...564
...564...
Output
Valid
Input
123456789
987654321
123456789
123456789
987654321
123456789
123456789
987654321
123456789
Output
Invalid
Code Golf Rules
Please post your shortest code in any language that solves this problem. Input and output may be handled via stdin and stdout or by other files of your choice.
Winner will be the shortest solution (by byte count) in a language with an implementation existing prior to the posting of this question. So while you are free to use a language you've just made up in order to submit a 0-byte solution, it won't count, and you'll probably get downvotes.
Golfscript: 56
n%{zip''+9/.{'.'-..&=}%$0=\}:|2*{3/}%|;**"InvV"3/="alid"
C: 165 162 161 160 159
int v[1566],x,y=9,c,b;main(){while(y--)for(x=9;x--+1;)if((c
=getchar()*27)>1242)b|=v[x+c]++|v[y+9+c]++|v[x-x%3+y/3+18+c]
++;puts(b?"Invalid":"Valid");return 0;}
The two newlines are not needed. One char saved by josefx :-) ...
Haskell: 207 230 218 195 172
import List
t=take 3
h=[t,t.drop 3,drop 6]
v[]="V"
v _="Inv"
f s=v[1|v<-[s,transpose s,[g=<<f s|f<-h,g<-h]],g<-map(filter(/='.'))v,g/=nub g]++"alid\n"
main=interact$f.lines
Perl: 168 128
# Reads the grid from stdin and prints Valid or Invalid.
# @a holds the grid as 3-character groups (three per input line).
$_=join'',<>;@a=/.../g;print+(/(\d)([^\n]{0,8}|(.{10})*.{9})\1/s
+map"@a[$_,$_+3,$_+6]"=~/(\d).*\1/,0..2,9..11,18..20)?Inv:V,alid
The first regex checks for duplicates that are in the same row and column; the second regex handles duplicates in the "same box".
Further improvement is possible by replacing the \n in the first regex with a literal newline (1 char), or with >= Perl 5.12, replacing [^\n] with \N (3 char)
Earlier, 168 char solution:
Input is from stdin, output is to stderr because it makes things so easy. Linebreaks are optional and not counted.
# Reads the grid from stdin; reports Valid or Invalid via die (stderr).
# @a holds the grid as 3-character groups, three per line, so the boxes
# start at groups 0-2, 9-11 and 18-20.
$_=join'',<>;$m=alid.$/;$n=Inv.$m;/(\d)(\N{0,8}|(.{10})*.{9})\1/s&&
die$n;@a=/.../g;for$i(0,9,18){for$j($i..$i+2){
$_=$a[$j].$a[$j+3].$a[$j+6];/(\d).*\1/&&die$n}}die"V$m"
Python: 230 221 200 185
First the readable version at len=199:
import sys
r=range(9)
g=[raw_input()for _ in r]
s=[[]for _ in r*3]
for i in r:
for j in r:
n=g[i][j]
for x in i,9+j,18+i/3*3+j/3:
<T>if n in s[x]:sys.exit('Invalid')
<T>if n>'.':s[x]+=n
print'Valid'
Since SO doesn't display tab characters, I've used <T> to represent a single tab character.
PS. the same approach minEvilized down to 185 chars:
r=range(9)
g=[raw_input()for _ in r]
s=['']*27
for i in r:
for j in r:
for x in i,9+j,18+i/3*3+j/3:n=g[i][j];s[x]+=n[:n>'.']
print['V','Inv'][any(len(e)>len(set(e))for e in s)]+'alid'
Perl, 153 char
@B contains the 81 elements of the board.
&E tests whether a subset of @B contains any duplicate digits
main loop validates each column, "block", and row of the puzzle
# Reads the board from stdin and prints Valid or Invalid.
sub E{$V+="@B[@_]"=~/(\d).*\1/}
@B=map/\S/g,<>;
for$d(@b=0..80){
E grep$d==$_%9,@b;
E grep$d==int(($_%9)/3)+3*int$_/27,@b;
E$d*9..$d*9+8}
print$V?Inv:V,alid,$/
Python: 159 158
v=[0]*244
for y in range(9):
for x,c in enumerate(raw_input()):
if c>".":
<T>for k in x,y+9,x-x%3+y//3+18:v[k*9+int(c)]+=1
print["Inv","V"][max(v)<2]+"alid"
<T> is a single tab character
Common Lisp: 266 252
(princ(let((v(make-hash-table))(r "Valid"))(dotimes(y 9)(dotimes(x
10)(let((c(read-char)))(when(>(char-code c)46)(dolist(k(list x(+ 9
y)(+ 18(floor(/ y 3))(- x(mod x 3)))))(when(>(incf(gethash(+(* k
9)(char-code c)-49)v 0))1)(setf r "Invalid")))))))r))
Perl: 186
Input is from stdin, output to stdout, linebreaks in input optional.
# Reads the grid from stdin and prints Valid or Invalid to stdout.
@y=map/\S/g,<>;
sub c{(join'',map$y[$_],@$h)=~/(\d).*\1/|c(@_)if$h=pop}
print(('V','Inv')[c map{$x=$_;[$_*9..$_*9+8],[grep$_%9==$x,0..80],[map$_+3*$b[$x],@b=grep$_%9<3,0..20]}0..8],'alid')
(Linebreaks added for "clarity".)
c() is a function that checks the input in @y against a list of lists of position numbers passed as an argument. It returns 0 if all position lists are valid (contain no number more than once) and 1 otherwise, using recursion to check each list. The bottom line builds this list of lists, passes it to c() and uses the result to select the right prefix to output.
One thing that I quite like is that this solution takes advantage of "self-similarity" in the "block" position list in @b (which is redundantly rebuilt many times to avoid having @b=... in a separate statement): the top-left position of the ith block within the entire puzzle can be found by multiplying the ith element in @b by 3.
More spread out:
# Grab input into an array of individual characters, discarding whitespace
@y = map /\S/g, <>;

# Takes a list of position lists.
# Returns 0 if all position lists are valid, 1 otherwise.
sub c {
    # Pop the last list into $h, extract the characters at these positions with
    # map, and check the result for multiple occurrences of
    # any digit using a regex. Note | behaves like || here but is shorter ;)
    # If the match fails, try again with the remaining list of position lists.
    # Because Perl returns the last expression evaluated, if we are at the
    # end of the list, the pop will return undef, and this will be passed back
    # which is what we want as it evaluates to false.
    (join '', map $y[$_], @$h) =~ /(\d).*\1/ | c(@_) if $h = pop
}

# Make a list of position lists with map and pass it to c().
# @y is filled line by line, so 9 consecutive positions form a row and
# positions with the same remainder modulo 9 form a column.
print(('V','Inv')[c map {
    $x=$_;                                # Save the outer "loop" variable
    [$_*9..$_*9+8],                       # Rows
    [grep$_%9==$x,0..80],                 # Columns
    [map$_+3*$b[$x],@b=grep$_%9<3,0..20]  # Blocks
} 0..8],                                  # Generates 1 row, column and block each time
'alid')
Perl: 202
I'm reading Modern Perl and felt like coding something... (quite a cool book by the way:)
while(<>){$i++;$j=0;for$s(split//){$j++;$l{$i}{$s}++;$c{$j}{$s}++;
$q{(int(($i+2)/3)-1)*3+int(($j+2)/3)}{$s}++}}
$e=V;for$i(1..9){for(1..9){$e=Inv if$l{$i}{$_}>1or$c{$i}{$_}>1or$q{$i}{$_}>1}}
print $e.alid
Count is excluding unnecessary newlines.
This may require Perl 5.12.2.
A bit more readable:
#use feature qw(say);
#use JSON;
#$json = JSON->new->allow_nonref;
while(<>)
{
$i++;
$j=0;
for $s (split //)
{
$j++;
$l{$i}{$s}++;
$c{$j}{$s}++;
$q{(int(($i+2)/3)-1)*3+int(($j+2)/3)}{$s}++;
}
}
#say "lines: ", $json->pretty->encode( \%l );
#say "columns: ", $json->pretty->encode( \%c );
#say "squares: ", $json->pretty->encode( \%q );
$e = V;
for $i (1..9)
{
for (1..9)
{
#say "checking {$i}{$_}: " . $l{$i}{$_} . " / " . $c{$i}{$_} . " / " . $q{$i}{$_};
$e = Inv if $l{$i}{$_} > 1 or $c{$i}{$_} > 1 or $q{$i}{$_} > 1;
}
}
print $e.alid;
Ruby — 176
f=->x{x.any?{|i|(i-[?.]).uniq!}}
a=[*$<].map{|i|i.scan /./}
puts f[a]||f[a.transpose]||f[a.each_slice(3).flat_map{|b|b.transpose.each_slice(3).map &:flatten}]?'Invalid':'Valid'
Lua, 341 bytes
I know that Lua isn't the best golfing language; however, considering its size, I think it's worth posting ;).
Non-golfed, commented and error-printing version, for extra fun :)
i=io.read("*a"):gsub("\n","") -- Get input, and strip newlines
a={{},{},{}} -- checking array, 1=row, 2=columns, 3=squares
for k=1,3 do for l=1,9 do a[k][l]={0,0,0,0,0,0,0,0,0}end end -- fillup array with 0's (just to have non-nils)
for k=1,81 do -- loop over all numbers
n=tonumber(i:sub(k,k):match'%d') -- get current character, check if it's a digit, and convert to a number
if n then
r={math.floor((k-1)/9)+1,(k-1)%9+1} -- Get row and column number
r[3]=math.floor((r[1]-1)/3)+3*math.floor((r[2]-1)/3)+1 -- Get square number
for l=1,3 do v=a[l][r[l]] -- 1 = row, 2 = column, 3 = square
if v[n] then -- not yet eliminated in this row/column/square
v[n]=nil
else
print("Double "..n.." in "..({"row","column","square"}) [l].." "..r[l]) --error reporting, just for the extra credit :)
q=1 -- Flag indicating invalidity
end
end
end
end
io.write(q and"In"or"","Valid\n")
Golfed version, 341 bytes
f=math.floor p=io.write i=io.read("*a"):gsub("\n","")a={{},{},{}}for k=1,3 do for l=1,9 do a[k][l]={0,0,0,0,0,0,0,0,0}end end for k=1,81 do n=tonumber(i:sub(k,k):match'%d')if n then r={f((k-1)/9)+1,(k-1)%9+1}r[3]=f((r[1]-1)/3)+1+3*f((r[2]-1)/3)for l=1,3 do v=a[l][r[l]]if v[n]then v[n]=nil else q=1 end end end end p(q and"In"or"","Valid\n")
Python: 140
v=[(k,c) for y in range(9) for x,c in enumerate(raw_input()) for k in x,y+9,(x/3,y/3) if c>'.']
print["V","Inv"][len(v)>len(set(v))]+"alid"
ASL: 108
args1["\n"x2I3*x;{;{:=T(T'{:i~{^0}?})}}
{;{;{{,0:e}:;{0:^},u eq}}/`/=}:-C
dc C#;{:|}C&{"Valid"}{"Invalid"}?P
ASL is a Golfscript inspired scripting language I made.