Improve genbank feature addition - performance

I am trying to add more than 70000 new features to a genbank file using biopython.
I have this code:
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
fi = "myoriginal.gbk"
fo = "mynewfile.gbk"
for result in results:
start = 0
end = 0
result = result.split("\t")
start = int(result[0])
end = int(result[1])
for record in SeqIO.parse(original, "gb"):
record.features.append(SeqFeature(FeatureLocation(start, end), type = "misc_feat"))
SeqIO.write(record, fo, "gb")
Results is just a list of lists containing the start and end of each one of the features I need to add to the original gbk file.
This solution is extremely costly for my computer and I do not know how to improve the performance. Any good idea?

You should parse the genbank file just once. Omitting what results contains (I do not know exactly, because there are some missing pieces of code in your example), I would guess something like this would improve performance, modifying your code:
fi = "myoriginal.gbk"
fo = "mynewfile.gbk"
original_records = list(SeqIO.parse(fi, "gb"))
for result in results:
result = result.split("\t")
start = int(result[0])
end = int(result[1])
for record in original_records:
record.features.append(SeqFeature(FeatureLocation(start, end), type = "misc_feat"))
SeqIO.write(record, fo, "gb")

Related

Creating button that clips Entries

I've created a program that is supposed to take the question and entry from a user and copy it to the clipboard. It works fine as a regular program but when I try to adapt it in trying to adapt it to a GUI I am running into an issue. Currently the program is only copying the question and the entries are returning empty strings. I know that if a broke down each entry into its own named variable I could probably fix this issue but a loop seems like a much cleaner solution. Can anyone assist?
import tkinter as tk
from tkinter import *
import pyperclip
system = 'What is the system?'
product = 'What is the product?'
issue = 'What is the issue?'
error = 'Is there an error message?'
screenshot = 'Do you have a screenshot or documentation for this issue?'
impact = 'Is the floor impacted. If so, what is the impact?'
users = 'How many users is this affecting?'
troubleshooting = 'Was there troubleshooting performed?'
changes = 'Are you aware of any changes that may have led up to the issue?'
ticket = 'Do you have an internal ticket number?'
questions = (
system, product, issue, error,
screenshot, impact, users, troubleshooting,
changes, ticket)
entries = []
clip = []
index = 0
index=0
c=0
r=0
root = tk.Tk()
root.title('SysIt4')
top_frame=tk.Frame(root)
bottom_frame=tk.Frame(root)
top_frame.grid(column=0, row=0, sticky=W)
bottom_frame.grid(column=0, row=1)
canvas = tk.Canvas(root, width=600, height=800)
canvas.grid()
while index < 10:
label = tk.Label(top_frame, text=questions[index])
label.grid(columnspan=2, column=c, row=r, sticky=W)
r+=2
index += 1
for r in range(1,20,2):
entry = tk.Entry(top_frame, width=50)
entry.grid(columnspan=2, column=c, row=r, sticky=W, padx=10, pady=5)
entries.append(entry)
def enact_clip(entries, questions):
responses = []
outfile= open('copy.txt', 'w')
for entry in entries:
responses.append(entry.get())
clip = list(zip(questions, responses))
for line in clip:
outfile.write(str(line) + '\n')
outfile.close()
infile = open('copy.txt', 'r')
copy_contents = infile.read()
return pyperclip.copy(copy_contents)
infile.close()
clip_button = Button(bottom_frame, text='Clip', command= enact_clip(entries, questions))
clip_button.grid(column=0, row=1)
root.mainloop()

How can I get the score from Question-Answer Pipeline? Is there a bug when Question-answer pipeline is used?

When I run the following code
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
text = r"""
As checked Dis is not yet on boarded to ARB portal, hence we cannot upload the invoices in portal
"""
questions = [
"Dis asked if it is possible to post the two invoice in ARB.I have not access so I wanted to check if you would be able to do it.",
]
for question in questions:
inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer_start_scores, answer_end_scores = model(**inputs)
answer_start = torch.argmax(
answer_start_scores
) # Get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1 # Get the most likely end of answer with the argmax of the score
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
print(f"Question: {question}")
print(f"Answer: {answer}\n")
The answer that I get here is:
Question: Dis asked if it is possible to post the two invoice in ARB.I have not access so I wanted to check if you would be able to do it.
Answer: dis is not yet on boarded to ARB portal
How do I get a score for this answer? Score here is very similar to what is I get when I run Question-Answer pipeline .
I have to take this approach since Question-Answer pipeline when used is giving me Key Error for the below code
from transformers import pipeline
nlp = pipeline("question-answering")
context = r"""
As checked Dis is not yet on boarded to ARB portal, hence we cannot upload the invoices in portal.
"""
print(nlp(question="Dis asked if it is possible to post the two invoice in ARB?", context=context))
This is my attempt to get the score. It appears that I cannot figure out what feature.p_mask. So I could not remove the non-context indexes that contribute to the softmax at the moment.
# ... assuming imports and question and context
model_name="deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
inputs = tokenizer(question, context,
add_special_tokens=True,
return_tensors='pt')
input_ids = inputs['input_ids'].tolist()[0]
outputs = model(**inputs)
# used to compute score
start = outputs.start_logits.detach().numpy()
end = outputs.end_logits.detach().numpy()
# from source code
# Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
#?? undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask
# Generate mask
undesired_tokens = inputs['attention_mask']
undesired_tokens_mask = undesired_tokens == 0.0
# Make sure non-context indexes in the tensor cannot contribute to the softmax
start_ = np.where(undesired_tokens_mask, -10000.0, start)
end_ = np.where(undesired_tokens_mask, -10000.0, end)
# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))
# Compute the score of each tuple(start, end) to be the real answer
outer = np.matmul(np.expand_dims(start_, -1), np.expand_dims(end_, 1))
# Remove candidate with end < start and end - start > max_answer_len
max_answer_len = 15
candidates = np.tril(np.triu(outer), max_answer_len - 1)
scores_flat = candidates.flatten()
idx_sort = [np.argmax(scores_flat)]
start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
end += 1
score = candidates[0, start, end-1]
start, end, score = start.item(), end.item(), score.item()
print(tokenizer.decode(input_ids[start:end]))
print(score)
See more source code

Extract multiple protein sequences from a Protein Data Bank along with Secondary Structure

I want to extract protein sequences and their corresponding secondary structure from any Protein Data bank, say RCSB. I just need short sequences and their secondary structure. Something like,
ATRWGUVT Helix
It is fine even if the sequences are long, but I want a tag at the end that denotes its secondary structure. Is there any programming tool or anything available for this.
As I've shown above I want only this much minimal information. How can I achieve this?
from Bio.PDB import *
from distutils import spawn
Extract sequence:
def get_seq(pdbfile):
p = PDBParser(PERMISSIVE=0)
structure = p.get_structure('test', pdbfile)
ppb = PPBuilder()
seq = ''
for pp in ppb.build_peptides(structure):
seq += pp.get_sequence()
return seq
Extract secondary structure with DSSP as explained earlier:
def get_secondary_struc(pdbfile):
# get secondary structure info for whole pdb.
if not spawn.find_executable("dssp"):
sys.stderr.write('dssp executable needs to be in folder')
sys.exit(1)
p = PDBParser(PERMISSIVE=0)
ppb = PPBuilder()
structure = p.get_structure('test', pdbfile)
model = structure[0]
dssp = DSSP(model, pdbfile)
count = 0
sec = ''
for residue in model.get_residues():
count = count + 1
# print residue,count
a_key = list(dssp.keys())[count - 1]
sec += dssp[a_key][2]
print sec
return sec
This should print both sequence and secondary structure.
You can use DSSP.
The output of DSSP is explained extensively under 'explanation'. The very short summary of the output is:
H = α-helix
B = residue in isolated β-bridge
E = extended strand, participates in β ladder
G = 3-helix (310 helix)
I = 5 helix (π-helix)
T = hydrogen bonded turn
S = bend

why is Matlab slow in for loop with large number of iterations but fast with a small number of iterations?

I am running a function to extract some information from 100,000+ patient xray dicom files. the files are stored within a veracrypt encryption container for security purposes.
when i run the function on a small sample of files it performs really quickly, however when i run the function on the entire dataset it is very slow in comparison, going from several files per second to 1 file per second (approximately).
i was wandering why this is happening? i have tried storing the data on an ssd and on a normal hard drive and get the same sort of slow down when using a larger dataset compared to a small one.
i have added the code below for reference but haven't commented it fully yet.. this is for my thesis so i will do it once i get the extraction finished..
thanks for any help.
function [ DB, corrupted_files ] = extract_from_dcm( folder_name )
%EXTRACT_FROM_DCM Summary of this function goes here
% Detailed explanation goes here
if nargin == 0
folder_name = 'I:\Find and Treat\MXU Old Backup\2005';
end
Database_Check = strcat(folder_name, '\DataBase.mat');
if exist(Database_Check, 'file')
load(Database_Check);
entry_start = length(DB) + 1;
else
entry_start = 1;
[ found_dicoms ] = recursive_search( folder_name );
end
mat_file_location = strcat(folder_name, '\DataBase.mat');
excel_DB_file = strcat(folder_name, '\DataBase.xlsx');
excel_Corrupted_file = strcat(folder_name, '\Corrupted_Files.xlsx');
% the recursive search creates a struct with the path for each
% dcm file found. the list is then recursivly used to locate
% the image and extract the relevant information from it.
fprintf('---------------------------------------------\n');
fprintf('Start Patient Data Extraction\n');
tic
h = waitbar(0,'','Name','Patient Data Extraction');
entry_end = length(found_dicoms);
if entry_end == 0
% set(handles.info_box, 'String', 'No Dicom Files Found in this Folder or its Subfolders');
else
% set(handles.info_box, 'String', 'Congratulations Dicom Files have been found Look Through the Data Base using the Buttons Below....Press Save Button to save the Database. (Database Save format is EXCEL SpreadSheet and MAT file');
for kk = entry_start : entry_end
progress = kk/entry_end;
progress_percent = round(progress * 100);
waitbar(progress,h, sprintf('%d%% %d/%d of images processed', progress_percent, kk, entry_end));
img_full_path = found_dicoms(kk).name;
% search_path = folder_name;
% img_full_path = strrep(img_full_path, search_path, '');
try %# Attempt to perform some computation
dicom_info = dicominfo(img_full_path); %# The operation you are trying to perform goes here
try %# Attempt to perform some computation
dicom_read = dicomread(dicom_info); %# The operation you are trying to perform goes here
old = dicominfo(img_full_path);
DB(kk).StudyDate = old.StudyDate;
DB(kk).StudyTime = old.StudyTime;
if isfield(old.PatientName, 'FamilyName')
DB(kk).Forename = old.PatientName.FamilyName;
else
DB(kk).Forename = 'NA';
end
if isfield(old.PatientName, 'GivenName')
DB(kk).LastName = old.PatientName.GivenName;
else
DB(kk).LastName = 'NA';
end
if isfield(old, 'PatientSex')
DB(kk).PatientSex = old.PatientSex;
else
DB(kk).PatientSex = 'NA';
end
if isempty(old.PatientBirthDate)
DB(kk).PatientBirthDate = '00000000';
else
DB(kk).PatientBirthDate = old.PatientBirthDate;
end
if strcmp(old.Manufacturer, 'Philips Medical Systems')
DB(kk).Van = '1';
else
DB(kk).Van = '0';% section to represent organising by different vans
end
DB(kk).img_Path = img_full_path;
save(mat_file_location,'DB','found_dicoms');
catch exception %# Catch the exception
fprintf('read - file %d corrupt.\n',kk);
continue %# Pass control to the next loop iteration
end
catch exception %# Catch the exception
fprintf('info - file %d corrupt.\n',kk);
continue %# Pass control to the next loop iteration
end
end
end
[ corrupted_files, DB ] = corruption_check( DB, found_dicoms, folder_name );
toc
fprintf('End Patient Data Extraction\n');
fprintf('---------------------------------------------\n');
fprintf('---------------------------------------------\n');
fprintf('Start Saving Extracted Data \n');
tic
save(mat_file_location,'DB','corrupted_files','found_dicoms');
if isempty(DB)
msg = sprintf('No Dicom Files Found');
msgbox(strcat(msg));
else
DB_table = struct2table(DB);
writetable(DB_table, excel_DB_file);
end
close(h);
toc
fprintf('End Saving Extracted Data \n');
fprintf('---------------------------------------------\n');
end
OK thanks for all the help..
My problem was the saving at the end of each iteration but the biggest problem was the line where i run the dicomread function. i changed the saving to occur for every 20 images processed.
I also removed the preallocation suggested in the comments to see what difference it made without the dicromread and saving a swell. it was considerably slower than with the preallocation.
... i just need to find a solution for dicomread (which i was using as a way to check if the file was corrupt or not).

How to extract string from large file only if specific string appears previous using Ruby?

I am trying to extract information from a large file and cannot figure out how to extract strings from file lines only when a previous line in the same record within the file has been matched by regex. An example of one record in the file is as follows:
*NEW RECORD
RECTYPE = D
MH = Informed Consent
AQ = ES HI LJ PX SN ST
ENTRY = Consent, Informed
MN = N03.706.437.650.312
MN = N03.706.535.489
FX = Disclosure
FX = Mental Competency
FX = Therapeutic Misconception
FX = Treatment Refusal
ST = T058
ST = T078
AN = competency to consent: coordinate IM with MENTAL COMPETENCY (IM)
PI = Jurisprudence (1966-1970)
PI = Physician-Patient Relations (1966-1970)
MS = Voluntary authorization, by a patient or research subject, etc,...
This file contains over 20,000 records like this example. I want to identify a small percent of those records using the "MH" field. In this example, I want to find "Informed Consent", and then use regex to extract the information in the FX, AN, and MS fields only within that record. So far, I have opened the file, accessed the hash that the MH terms are stored in, and been able to extract those terms from the records in the file. I also have a functioning regex that identifies the content in the "FX" field.
File.open('mesh_descriptor.bin').each do |file_line|
file_line = file_line.chomp
# read each key of candidate_descriptor_keys
candidate_descriptor_keys.each do |cand_term|
if file_line =~ /^MH\s=\s(#{cand_term})$/
mesh_header = $1
puts "MH from Mesh Descriptor file is: #{mesh_header}"
if file_line =~ /^FX\s=\s(.*)$/
see_also = $1
puts " See_Also from Descriptor file is: #{see_also}"
end
end
end
end
The hash contains the following MH (keys):
candidate_descriptor_keys = ["Body Weight", "Obesity", "Thinness", "Fetal Weight", "Overweight"]
I had success extracting "FX" when I put the statement outside of the "if" statement to extract "MH", but all of the "FX" from the whole file were retrieved - not what I need. I thought putting the "if" statement for "FX" within the previous "if" statement would restrict the results to only those found when the first statement is true, but I am getting no results (also no errors) with this strategy. What I would like as a result is:
> Informed Consent
> Disclosure
> Mental Competency
> Therapeutic Misconception
> Treatment Refusal
as well as the strings within the "AN" and "MS" fields for only those records matching "MH". Any suggestions would be helpful!
I think this may be what you are looking for, but if not, let me know and I will change it. Look especially at the very end to see if that is the sort of output (for input having two records, both with a "MH" field) you want. I will also add a "explanation" section at the end once I have understood your question correctly.
I have assumed that each record begins
*NEW_RECORD
and you wish to identify all lines beginning "MH" whose field is one of the elements of:
candidate_descriptor_keys =
["Body Weight", "Obesity", "Thinness", "Informed Consent"]
and for each match, you would like to print the contents of the lines for the same record that begin with "FX", "AN" and "MS".
Code
NEW_RECORD_MARKER = "*NEW RECORD"
def getem(fname, candidate_descriptor_keys)
line = 0
found_mh = false
File.open(fname).each do |file_line|
file_line = file_line.strip
case
when file_line == NEW_RECORD_MARKER
puts # space between records
found_mh = false
when found_mh == false
candidate_descriptor_keys.each do |cand_term|
if file_line =~ /^MH\s=\s(#{cand_term})$/
found_mh = true
puts "MH from line #{line} of file is: #{cand_term}"
break
end
end
when found_mh
["FX", "AN", "MS"].each do |des|
if file_line =~ /^#{des}\s=\s(.*)$/
see_also = $1
puts " Line #{line} of file is: #{des}: #{see_also}"
end
end
end
line += 1
end
end
Example
Let's begin be creating a file, starging with a "here document that contains two records":
records =<<_
*NEW RECORD
RECTYPE = D
MH = Informed Consent
AQ = ES HI LJ PX SN ST
ENTRY = Consent, Informed
MN = N03.706.437.650.312
MN = N03.706.535.489
FX = Disclosure
FX = Mental Competency
FX = Therapeutic Misconception
FX = Treatment Refusal
ST = T058
ST = T078
AN = competency to consent
PI = Jurisprudence (1966-1970)
PI = Physician-Patient Relations (1966-1970)
MS = Voluntary authorization
*NEW RECORD
MH = Obesity
AQ = ES HI LJ PX SN ST
ENTRY = Obesity
MN = N03.706.437.650.312
MN = N03.706.535.489
FX = 1st FX
FX = 2nd FX
AN = Only AN
PI = Jurisprudence (1966-1970)
PI = Physician-Patient Relations (1966-1970)
MS = Only MS
_
If you puts records you will see it is just a string. (You'll see that I shortened two of them.) Now write it to a file:
File.write('mesh_descriptor', records)
If you wish to confirm the file contents, you could do this:
puts File.read('mesh_descriptor')
We also need to define define the array candidate_descriptor_keys:
candidate_descriptor_keys =
["Body Weight", "Obesity", "Thinness", "Informed Consent"]
We can now execute the method getem:
getem('mesh_descriptor', candidate_descriptor_keys)
MH from line 2 of file is: Informed Consent
Line 7 of file is: FX: Disclosure
Line 8 of file is: FX: Mental Competency
Line 9 of file is: FX: Therapeutic Misconception
Line 10 of file is: FX: Treatment Refusal
Line 13 of file is: AN: competency to consent
Line 16 of file is: MS: Voluntary authorization
MH from line 18 of file is: Obesity
Line 23 of file is: FX: 1st FX
Line 24 of file is: FX: 2nd FX
Line 25 of file is: AN: Only AN
Line 28 of file is: MS: Only MS

Resources