NCBIWWW.qblast parsing XML files - bioinformatics

I am using Biopython to (attempt to) write a script that will take my downloaded Sanger sequencing results from Genewiz (multiple sequences downloaded into a single FASTA file), create a new file with each sequence trimmed to my desired length, run the trimmed sequences through BLAST, and list the species of the top hit. As I am pretty new to bioinformatics and programming, I am working through each of these parts step by step, using the Biopython cookbook as a framework. I have managed to get my trimmed sequences into a new file and BLAST to run (is it always that slow?), but I am now getting stuck on parsing. Any help would be appreciated! I will edit/post more questions as I work through this program, but one step at a time.
Code so far:
import os
os.chdir('C:\\Users\\asmit\\Desktop\\Sequences\\Cytb')
print("Current folder: " + os.getcwd())

import glob
from Bio import SeqIO

for filename in glob.iglob('*download.fasta'):
    name = str(filename)
    # note: str.strip() removes a *set of characters*, not a suffix,
    # so build the new name with replace() instead
    newname = name.replace("_download.fasta", "_trim.fasta")
    print("File processing: " + name)
    with open(newname, "w") as f:
        for seq_record in SeqIO.parse(name, "fasta"):
            f.write(">" + seq_record.id + "\n")
            x = 31
            while x < 411:
                f.write(str(seq_record.seq[x:x+50]) + "\n")
                x = x + 50
print("All files trimmed.")
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

for filename in glob.iglob('*trim.fasta'):
    name = str(filename)
    newname = name.replace(".fasta", ".xml")  # again, replace() rather than strip()
    print("Running BLAST on " + name)
    record = open(name).read()
    result_handle = NCBIWWW.qblast("blastn", "nt", record, hitlist_size=1)
    print("BLAST on " + name + " is complete.")
    with open(newname, "w") as out_handle:
        out_handle.write(result_handle.read())  # the with block closes out_handle itself
    result_handle.close()
    print(newname + " is ready for parsing.")
print("All files BLASTed")

Related

The wav file won't play after using pyinstaller --onefile. I just hear the Windows 'beep'

This program displays a home circuit breaker panel. The user can view what is on each breaker on the panel (data taken from an imported dictionary of entered breaker panel info), or the user can check which breakers control any listed zone (kitchen, basement, etc.). The Breakerville program closes when the user decides and is supposed to play a wav file at the close. After the program is made into an exe with pyinstaller, the file doesn't play; there is just the Windows 'beep'.
I suspect that I may need to edit the spec file to get the wav file to work after compiling. Is this correct, and if so, how should the spec file be modified?
from playsound import playsound  # CURRENTLY USING
from chart import chart
from BreakerZones import BreakerZones
import time
import sys
import colorama
import yaml  # to print the nested_lookup results (n) on separate lines
from nested_lookup import nested_lookup, get_all_keys  # importing 2 items from nested_lookup
from colorama import Fore, Back, Style

colorama.init(autoreset=True)  # If you don't want to print Style.RESET_ALL all the time,
                               # reset automatically after each print statement with True
print(colorama.ansi.clear_screen())
print('\n' * 4)  # prints a newline 4 times
print(Fore.MAGENTA + ' Arriving-' + Fore.GREEN + ' *** BREAKERVILLE USA ***')

def main():
    print('\n' * 2)
    print(Fore.BLUE + ' Breaker Numbers and Zones')
    k = get_all_keys(BreakerZones)
    # raw amount of keys, even repeats; has quotes
    new_l = []  # eliminate extra repeating nested keys
    for e in k:  # has quotes
        if e not in new_l and sorted(e) not in new_l:
            new_l.append(e)
    print()
    new_l.sort()  # make alphabetical
    newer_l = ('%s' % ', '.join(map(str, new_l)).strip("' ,"))  # remove ['%s'] brackets so they don't show up when run
    print(' ', yaml.dump(newer_l, default_flow_style=False))  # strip("' ,") or will see leading "' ," in output
    print(Fore.BLUE + ' ENTER A BREAKER # OR ZONE', Fore.GREEN + ': ', end='')
    i = input().strip().lower()  # these lines are a workaround for the colorama
    print()                      # user input() issue of 'code' appearing in screen output
    if i in k:
        n = nested_lookup(i, BreakerZones, wild=False, with_keys=False)  # wild=True means key not case sensitive,
        print(yaml.dump(n, default_flow_style=False))                    # 'with_keys' returns values + keys also
        # for key, value in n.items():   eliminated by using yaml
        #     print(key, '--', value)    eliminated by using yaml
    else:
        print(Fore.YELLOW + ' Typo,' + Fore.GREEN + ' try again')
        main()
    print()
    print(Fore.GREEN + ' Continue? Y or N: C for breaker chart : ', end='')  # see comments ENTER A BREAKER
    ans = input().strip().lower()  # strip() removes any spaces before or after user input
    if ans == 'c':
        chart()
        print()
        print(Fore.GREEN + ' Continue? Y or N : ', end='')
        ans = input().strip().lower()  # strip() removes any spaces before or after user input
        if ans == 'y':  # shorter version 'continue Y or N' after printing breaker chart
            main()
        else:
            print()
            print(Fore.MAGENTA + ' Departing -' + Fore.GREEN + ' *** BREAKERVILLE ***')
            playsound('train whistle.wav')
            time.sleep(2)  # delay to exit program
            sys.exit()
    elif ans != 'y':
        print()
        print(Fore.MAGENTA + ' Good Day -' + Fore.GREEN + ' *** BREAKERVILLE ***')
        playsound('train whistle.wav')  # CURRENTLY USING
        time.sleep(2)  # delay to exit program
        sys.exit()
    else:
        main()

main()
For the record: the issue was fixed by providing the full path to the sound file.
This is probably linked to the implementation of playsound and how it determines the current working directory. Please refer to https://pyinstaller.readthedocs.io/en/stable/runtime-information.html#run-time-information for a better understanding of how that works with pyinstaller.
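As a sketch of one common way to build that full path in a --onefile build (assuming the wav file is bundled as a data file): PyInstaller unpacks bundled files into a temporary folder exposed as sys._MEIPASS, so a small helper can resolve the path both in development and in the frozen exe:

import os
import sys
from playsound import playsound

def resource_path(relative_path):
    # In a PyInstaller bundle, sys._MEIPASS is the temporary folder where
    # bundled data files are unpacked; otherwise fall back to the folder
    # containing this script.
    base = getattr(sys, '_MEIPASS', os.path.dirname(os.path.abspath(__file__)))
    return os.path.join(base, relative_path)

playsound(resource_path('train whistle.wav'))

The wav file still has to be included in the build, e.g. pyinstaller --onefile --add-data "train whistle.wav;." breakerville.py (Windows uses ; as the --add-data separator, other platforms use :); the script name here is only a placeholder.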

When exporting XLIFF from Xcode, how to exclude dummy strings?

I'm using Xcode's Editor > Export For Localization... to export an XLIFF file for translation,
but the export for Main.storyboard includes a lot of unnecessary strings, mostly placeholders/dummies that are only useful at design time.
How do I exclude such strings from the XLIFF file?
I've written a script that excludes certain translations.
How does it work?
Command line: python strip_loc.py input.xliff output.xliff exclude_list.txt [-v]
Example usage: python strip_loc.py en.xliff en-stripped.xliff exclude_words.txt -v
The exclude_list.txt is a file with one string per line. The script parses this list and creates a dictionary of banned words. If a translation whose source contains one of these strings is encountered, the whole translation unit is removed from the output xml/xliff.
Here is the solution, which works with the latest Python version:
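For illustration, an exclude_list.txt could look like this (hypothetical contents; one banned string per line, matched case-insensitively against each <source>):

Label
Lorem ipsum
Placeholder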
import argparse
import xml.etree.ElementTree as ET

def log(string_to_log):
    if args.verbose:
        print(string_to_log)

parser = argparse.ArgumentParser(
    description="Process xliff file against banned words and output new xliff with stripped translation.",
    epilog="Example usage: strip_loc.py en.xliff en-stripped.xliff exclude_words.txt -v")
parser.add_argument('source', help="Input .xliff file containing all the strings")
parser.add_argument('output', help="Output .xliff file which will contain the stripped strings according to the exclude_list")
parser.add_argument('exclude_list', help="Multi-line text file where every line is a banned string")
parser.add_argument('-v', '--verbose', action="store_true", help="print script steps while working")
args = parser.parse_args()

banned_words = [line.strip().lower() for line in open(args.exclude_list, 'r')]
log("original file: " + args.source)
log("output file: " + args.output)
log("banned words: " + ", ".join(banned_words))
log("")

ET.register_namespace('', "urn:oasis:names:tc:xliff:document:1.2")
ns = {"n": "urn:oasis:names:tc:xliff:document:1.2"}
with open(args.source, 'r') as xml_file:
    tree = ET.parse(xml_file)
    root = tree.getroot()

counter = 1
for file_body in root.findall("./*/n:body", ns):
    for trans_unit in file_body.findall("n:trans-unit", ns):
        source = trans_unit.find("n:source", ns)
        if source.text is not None:
            source = source.text.encode("utf-8").lower()
            source = source.decode("utf-8")
            source = source.strip()
            for banned_word in banned_words:
                if source.find(banned_word) != -1:
                    log(str(counter) + ": removing <trans-unit id=\"" + trans_unit.attrib['id'] + "\">, banned: \"" + banned_word + "\"")
                    file_body.remove(trans_unit)
                    break
            counter += 1

tree.write(args.output, "utf-8", True)
log("")
print("DONE")
And the usage is the same:
python strip_loc.py en.xliff en-stripped.xliff exclude_words.txt -v
I use this XLIFF Online Editor to edit the xliff files. It makes it easy to ignore the dummy text or anything else you need to skip.

How to automatically save scraped links to a file?

from bs4 import BeautifulSoup
import urllib2
import re

b = urllib2.urlopen("http://www.apache.org")
soup = BeautifulSoup(b)
for link in soup.findAll('a'):
    print " %s link.get" % ('href')
f = open("/home/apache/test/test.txt", "w")
f.write()
f.close()
How do I save the links to the file automatically?
It's a placeholder for formatting. It represents a string.
" %s link.get" % ('href')
is equivalent to
" " + 'href' + " link.get"
The placeholders can make things more readable, without cluttering the text with quotes and +. Though in this case, there is no variable, so it is simply
" href link.get"
However, .format() is preferred to % formatting nowadays, like
" {} link.get".format('href')

Insert random video in place of actual filename

I am looking for a way to modify the following script so that it plays a random video from the folder /mnt/usb when the script is run. Videos in the folder will change daily but will all be .mov files. At the end of the video, it returns to our slideshow.
import xbmc
import time
import os

def PlayAndWait(mediafile):
    xbmc.executebuiltin("PlayMedia(%s)" % mediafile, True)
    while xbmc.Player().isPlaying():
        time.sleep(1.0)

if os.path.isfile("/mnt/usb/videoenter.mov"):
    PlayAndWait("/mnt/usb/videoenter.mov")
xbmc.executebuiltin("SlideShow(/mnt/usb/slideshow)")
Use glob.glob to get the list of files matching '/mnt/usb/*.mov' then use random.choice to pick one out:
import xbmc
import time
import os
import glob
import random

def PlayAndWait(mediafile):
    # escape characters and quote if needed, as xbmc requires
    if '"' in mediafile:
        mediafile = mediafile.replace('"', '\\"')
    if any(x in mediafile for x in ",() "):
        mediafile = '"' + mediafile + '"'
    xbmc.executebuiltin("PlayMedia(%s)" % mediafile, True)
    while xbmc.Player().isPlaying():
        time.sleep(1.0)

files = glob.glob('/mnt/usb/*.mov')
if files:  # only pick a video if the folder actually contains any
    filename = random.choice(files)
    if os.path.isfile(filename):
        PlayAndWait(filename)
xbmc.executebuiltin("SlideShow(/mnt/usb/slideshow)")

How to automatically turn BibTex citation into something parseable by Zotero?

I have a citation system which publishes users' notes to a wiki (Researchr). Programmatically, I have access to the full BibTeX record of each entry, and I also display this on the individual pages (for example - click on BibTeX). This is in the interest of making it easy for users of other citation managers to automatically import the citation of a paper that interests them. I would also like other citation managers, especially Zotero, to be able to automatically detect and import a citation.
Zotero lists a number of ways of exposing metadata that it will understand, including meta tags with RDF, COinS, Dublin Core and unAPI. Is there a Ruby library for converting BibTeX to any of these standards automatically - or a JavaScript library? I could probably create something, but if something existed, it would be far more robust (BibTeX has so many publication types and fields, etc.).
There's a BibTeX2RDF converter available here, which might be what you're after.
unAPI is not a data standard - it's a way to serve data (to Zotero and other programs). Zotero imports BibTeX, so serving BibTeX via unAPI works just fine. Inspire is an example of a site that does that:
http://inspirehep.net/
Nowadays you can simply import BibTeX (.bib) files directly into Zotero. However, I noticed my BibTeX files were often less complete than Zotero's own records (in particular they often missed a DOI), and I did not find an "auto-complete" function (based on the data in the BibTeX entries) in Zotero.
So I import the .bib file with Zotero to ensure they are all in there. Then I run a Python script that finds all the missing DOIs it can for the entries in that .bib file and exports them to a space-separated .txt file:
# pip install habanero
from habanero import Crossref
import re

def titletodoi(keyword):
    cr = Crossref()
    result = cr.works(query=keyword)
    items = result["message"]["items"]
    item_title = items[0]["title"]
    tmp = ""
    for it in item_title:
        tmp += it
    title = keyword.replace(" ", "").lower()
    title = re.sub(r"\W", "", title)
    # print('title: ' + title)
    tmp = tmp.replace(" ", "").lower()
    tmp = re.sub(r"\W", "", tmp)
    # print('tmp: ' + tmp)
    if title == tmp:
        doi = items[0]["DOI"]
        return doi
    else:
        return None

def get_dois(titles):
    dois = []
    for title in titles:
        try:
            doi = titletodoi(title)
            print(f"doi={doi}, title={title}")
            if doi is not None:
                dois.append(doi)
        except Exception:
            pass  # print("An exception occurred")
    print(f"dois={dois}")
    return dois

def read_titles_from_file(filepath):
    with open(filepath) as f:
        lines = f.read().splitlines()
        split_lines = splits_lines(lines)
        return split_lines

def splits_lines(lines):
    split_lines = []
    for line in lines:
        new_lines = line.split(";")
        for new_line in new_lines:
            split_lines.append(new_line)
    return split_lines

def write_dois_to_file(dois, filename, separation_char):
    textfile = open(filename, "w")
    for doi in dois:
        textfile.write(doi + separation_char)
    textfile.close()

filepath = "list_of_titles.txt"
titles = read_titles_from_file(filepath)
dois = get_dois(titles)
write_dois_to_file(dois, "dois_space.txt", " ")
write_dois_to_file(dois, "dois_per_line.txt", "\n")
The DOIs from the .txt file are fed into Zotero's "magic wand" (Add Item by Identifier) feature. Next, I (manually) remove the duplicates by keeping the latest added entry (because that one comes from the magic wand with the most data).
After that, I run another script to update all the reference IDs in my .tex and .bib files to those generated by Zotero:
# Importing libraries
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import *
import os, fnmatch
import Levenshtein as lev

# Let's define a function to customize our entries.
# It takes a record and returns this record.
def customizations(record):
    """Use some functions delivered by the library

    :param record: a record
    :returns: -- customized record
    """
    # here type is bibtexparser.customization.type from the star import
    # (it normalizes the entry type), not the builtin type()
    record = type(record)
    record = author(record)
    record = editor(record)
    record = journal(record)
    record = keyword(record)
    record = link(record)
    record = page_double_hyphen(record)
    record = doi(record)
    return record

def get_references(filepath):
    with open(filepath) as bibtex_file:
        parser = BibTexParser()
        parser.customization = customizations
        bib_database = bibtexparser.load(bibtex_file, parser=parser)
        # print(bib_database.entries)
        return bib_database

def get_reference_mapping(main_filepath, sub_filepath):
    found_sub = []
    found_main = []
    main_into_sub = []
    main_references = get_references(main_filepath)
    sub_references = get_references(sub_filepath)
    for main_entry in main_references.entries:
        for sub_entry in sub_references.entries:
            # Match the reference ID if 85% similar titles are detected
            lev_ratio = lev.ratio(
                remove_curly_braces(main_entry["title"]).lower(),
                remove_curly_braces(sub_entry["title"]).lower(),
            )
            if lev_ratio > 0.85:
                print(f"lev_ratio={lev_ratio}")
                if main_entry["ID"] != sub_entry["ID"]:
                    print(f'replace: {sub_entry["ID"]} with: {main_entry["ID"]}')
                    main_into_sub.append([main_entry, sub_entry])
                # Keep track of which entries have been found
                found_sub.append(sub_entry)
                found_main.append(main_entry)
    return (
        main_into_sub,
        found_main,
        found_sub,
        main_references.entries,
        sub_references.entries,
    )

def remove_curly_braces(string):
    left = string.replace("{", "")
    right = left.replace("}", "")  # fixed: the original replaced "{" twice
    return right

def replace_references(main_into_sub, directory):
    for pair in main_into_sub:
        main = pair[0]["ID"]
        sub = pair[1]["ID"]
        print(f"replace: {sub} with: {main}")
        # UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
        # findReplace(latex_root_dir, sub, main, "*.tex")
        # findReplace(latex_root_dir, sub, main, "*.bib")

def findReplace(directory, find, replace, filePattern):
    for path, dirs, files in os.walk(os.path.abspath(directory)):
        for filename in fnmatch.filter(files, filePattern):
            filepath = os.path.join(path, filename)
            with open(filepath) as f:
                s = f.read()
            s = s.replace(find, replace)
            with open(filepath, "w") as f:
                f.write(s)

def list_missing(main_references, sub_references):
    for sub in sub_references:
        if sub["ID"] not in list(map(lambda x: x["ID"], main_references)):
            print(f'the following reference has a changed title: {sub["ID"]}')

latex_root_dir = "some_path/"
main_filepath = f"{latex_root_dir}latex/Literature_study/zotero.bib"
sub_filepath = f"{latex_root_dir}latex/Literature_study/references.bib"
(
    main_into_sub,
    found_main,
    found_sub,
    main_references,
    sub_references,
) = get_reference_mapping(main_filepath, sub_filepath)
replace_references(main_into_sub, latex_root_dir)
list_missing(main_references, sub_references)

# For those references which have a Levenshtein ratio below 0.85 you can specify a manual swap:
manual_swap = []  # main into sub
# manual_swap.append(["cantley_impact_2021", "cantley2021impact"])
# manual_swap.append(["widemann_envision_2021", "widemann2020envision"])
for pair in manual_swap:
    main = pair[0]
    sub = pair[1]
    print(f"replace: {sub} with: {main}")
    # UNCOMMENT IF YOU WANT TO ACTUALLY DO THE PRINTED REPLACEMENT
    # findReplace(latex_root_dir, sub, main, "*.tex")
    # findReplace(latex_root_dir, sub, main, "*.bib")