Spaces:
Running
Running
supercat666
committed on
Commit
·
26e7c05
1
Parent(s):
bc641c8
add vcf
Browse files- app.py +83 -2
- cas9att.py +0 -5
- cas9attvcf.py +9 -18
app.py
CHANGED
@@ -145,8 +145,8 @@ gene_symbol_list = list(gene_annotations.keys()) # List of gene symbols for the
|
|
145 |
if selected_model == 'Cas9':
|
146 |
# Use a radio button to select enzymes, making sure only one can be selected at a time
|
147 |
target_selection = st.radio(
|
148 |
-
"Select either on-target or off-target:",
|
149 |
-
('on-target', 'off-target'),
|
150 |
key='target_selection'
|
151 |
)
|
152 |
if 'current_gene_symbol' not in st.session_state:
|
@@ -319,6 +319,87 @@ if selected_model == 'Cas9':
|
|
319 |
file_name=f"{gene_symbol}_files.zip",
|
320 |
mime="application/zip"
|
321 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
|
323 |
elif target_selection == 'off-target':
|
324 |
ENTRY_METHODS = dict(
|
|
|
145 |
if selected_model == 'Cas9':
|
146 |
# Use a radio button to select enzymes, making sure only one can be selected at a time
|
147 |
target_selection = st.radio(
|
148 |
+
"Select either on-target, on-target with mutation or off-target:",
|
149 |
+
('on-target', 'mutation', 'off-target'),
|
150 |
key='target_selection'
|
151 |
)
|
152 |
if 'current_gene_symbol' not in st.session_state:
|
|
|
319 |
file_name=f"{gene_symbol}_files.zip",
|
320 |
mime="application/zip"
|
321 |
)
|
322 |
+
elif target_selection == 'mutation':
|
323 |
+
# Prediction button
|
324 |
+
predict_button = st.button('Predict on-target')
|
325 |
+
vcf_reader =...
|
326 |
+
|
327 |
+
if 'exons' not in st.session_state:
|
328 |
+
st.session_state['exons'] = []
|
329 |
+
|
330 |
+
# Process predictions
|
331 |
+
if predict_button and gene_symbol:
|
332 |
+
with st.spinner('Predicting... Please wait'):
|
333 |
+
predictions, gene_sequence, exons = cas9attvcf.process_gene(gene_symbol, cas9att_path)
|
334 |
+
full_predictions = sorted(predictions, key=lambda x: x[8], reverse=True)
|
335 |
+
sorted_predictions = sorted(predictions, key=lambda x: x[8], reverse=True)[:10]
|
336 |
+
st.session_state['full_results'] = full_predictions
|
337 |
+
st.session_state['on_target_results'] = sorted_predictions
|
338 |
+
st.session_state['gene_sequence'] = gene_sequence # Save gene sequence in session state
|
339 |
+
st.session_state['exons'] = exons # Store exon data
|
340 |
+
|
341 |
+
# Notify the user once the process is completed successfully.
|
342 |
+
st.success('Prediction completed!')
|
343 |
+
st.session_state['prediction_made'] = True
|
344 |
+
|
345 |
+
if 'on_target_results' in st.session_state and st.session_state['on_target_results']:
|
346 |
+
ensembl_id = gene_annotations.get(gene_symbol, 'Unknown') # Get Ensembl ID or default to 'Unknown'
|
347 |
+
col1, col2, col3 = st.columns(3)
|
348 |
+
with col1:
|
349 |
+
st.markdown("**Genome**")
|
350 |
+
st.markdown("Homo sapiens")
|
351 |
+
with col2:
|
352 |
+
st.markdown("**Gene**")
|
353 |
+
st.markdown(f"{gene_symbol} : {ensembl_id} (primary)")
|
354 |
+
with col3:
|
355 |
+
st.markdown("**Nuclease**")
|
356 |
+
st.markdown("SpCas9")
|
357 |
+
# Include "Target" in the DataFrame's columns
|
358 |
+
try:
|
359 |
+
df = pd.DataFrame(st.session_state['on_target_results'],
|
360 |
+
columns=["Gene Symbol", "Chr", "Strand", "Target Start", "Transcript", "Exon",
|
361 |
+
"Target",
|
362 |
+
"gRNA", "Prediction", "Is Mutation"])
|
363 |
+
df_full = pd.DataFrame(st.session_state['full_results'],
|
364 |
+
columns=["Gene Symbol", "Chr", "Strand", "Target Start", "Transcript",
|
365 |
+
"Exon", "Target",
|
366 |
+
"gRNA", "Prediction", "Is Mutation"])
|
367 |
+
st.dataframe(df)
|
368 |
+
except ValueError as e:
|
369 |
+
st.error(f"DataFrame creation error: {e}")
|
370 |
+
# Optionally print or log the problematic data for debugging:
|
371 |
+
print(st.session_state['on_target_results'])
|
372 |
+
|
373 |
+
if 'gene_sequence' in st.session_state and st.session_state['gene_sequence']:
|
374 |
+
gene_symbol = st.session_state['current_gene_symbol']
|
375 |
+
gene_sequence = st.session_state['gene_sequence']
|
376 |
+
|
377 |
+
# Define file paths
|
378 |
+
genbank_file_path = f"{gene_symbol}_crispr_targets.gb"
|
379 |
+
bed_file_path = f"{gene_symbol}_crispr_targets.bed"
|
380 |
+
csv_file_path = f"{gene_symbol}_crispr_predictions.csv"
|
381 |
+
plot_image_path = f"{gene_symbol}_gtracks_plot.png"
|
382 |
+
|
383 |
+
# Generate files
|
384 |
+
cas9att.generate_genbank_file_from_df(df_full, gene_sequence, gene_symbol, genbank_file_path)
|
385 |
+
cas9att.create_bed_file_from_df(df_full, bed_file_path)
|
386 |
+
cas9att.create_csv_from_df(df_full, csv_file_path)
|
387 |
+
|
388 |
+
# Prepare an in-memory buffer for the ZIP file
|
389 |
+
zip_buffer = io.BytesIO()
|
390 |
+
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
|
391 |
+
# For each file, add it to the ZIP file
|
392 |
+
zip_file.write(genbank_file_path)
|
393 |
+
zip_file.write(bed_file_path)
|
394 |
+
zip_file.write(csv_file_path)
|
395 |
+
|
396 |
+
# Display the download button for the ZIP file
|
397 |
+
st.download_button(
|
398 |
+
label="Download GenBank, BED, CSV files as ZIP",
|
399 |
+
data=zip_buffer.getvalue(),
|
400 |
+
file_name=f"{gene_symbol}_files.zip",
|
401 |
+
mime="application/zip"
|
402 |
+
)
|
403 |
|
404 |
elif target_selection == 'off-target':
|
405 |
ENTRY_METHODS = dict(
|
cas9att.py
CHANGED
@@ -224,11 +224,6 @@ def process_gene(gene_symbol, model_path):
|
|
224 |
else:
|
225 |
print("Failed to retrieve transcripts.")
|
226 |
|
227 |
-
output = []
|
228 |
-
for result in results:
|
229 |
-
for item in result:
|
230 |
-
output.append(item)
|
231 |
-
|
232 |
# Return the sorted output, combined gene sequences, and all exons
|
233 |
return results, all_gene_sequences, all_exons
|
234 |
|
|
|
224 |
else:
|
225 |
print("Failed to retrieve transcripts.")
|
226 |
|
|
|
|
|
|
|
|
|
|
|
227 |
# Return the sorted output, combined gene sequences, and all exons
|
228 |
return results, all_gene_sequences, all_exons
|
229 |
|
cas9attvcf.py
CHANGED
@@ -325,16 +325,8 @@ def process_gene(gene_symbol, vcf_reader, model_path):
|
|
325 |
else:
|
326 |
print("Failed to retrieve transcripts.")
|
327 |
|
328 |
-
output = []
|
329 |
-
for result in results:
|
330 |
-
for item in result:
|
331 |
-
output.append(item)
|
332 |
-
|
333 |
-
# Sort results based on prediction score (assuming score is at the 8th index)
|
334 |
-
sorted_results = sorted(output, key=lambda x: x[8], reverse=True)
|
335 |
-
|
336 |
# Return the sorted output, combined gene sequences, and all exons
|
337 |
-
return
|
338 |
|
339 |
|
340 |
def create_genbank_features(data):
|
@@ -351,22 +343,22 @@ def create_genbank_features(data):
|
|
351 |
for row in formatted_data:
|
352 |
try:
|
353 |
start = int(row[1])
|
354 |
-
end =
|
355 |
except ValueError as e:
|
356 |
print(f"Error converting start/end to int: {row[1]}, {row[2]} - {e}")
|
357 |
continue
|
358 |
|
359 |
-
strand = 1 if row[3] == '
|
360 |
location = FeatureLocation(start=start, end=end, strand=strand)
|
|
|
361 |
feature = SeqFeature(location=location, type="misc_feature", qualifiers={
|
362 |
'label': row[7], # Use gRNA as the label
|
363 |
-
'note': f"Prediction: {row[8]}" # Include the prediction score
|
364 |
})
|
365 |
features.append(feature)
|
366 |
|
367 |
return features
|
368 |
|
369 |
-
|
370 |
def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
|
371 |
# Ensure gene_sequence is a string before creating Seq object
|
372 |
if not isinstance(gene_sequence, str):
|
@@ -381,22 +373,21 @@ def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
|
|
381 |
record.annotations["molecule_type"] = "DNA"
|
382 |
SeqIO.write(record, output_path, "genbank")
|
383 |
|
384 |
-
|
385 |
def create_bed_file_from_df(df, output_path):
|
386 |
with open(output_path, 'w') as bed_file:
|
387 |
for index, row in df.iterrows():
|
388 |
chrom = row["Chr"]
|
389 |
-
start = int(row["Start
|
390 |
-
end =
|
391 |
strand = '+' if row["Strand"] == '1' else '-'
|
392 |
gRNA = row["gRNA"]
|
393 |
score = str(row["Prediction"])
|
|
|
394 |
# transcript_id is not typically part of the standard BED columns but added here for completeness
|
395 |
transcript_id = row["Transcript"]
|
396 |
|
397 |
# Writing only standard BED columns; additional columns can be appended as needed
|
398 |
-
bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\n")
|
399 |
-
|
400 |
|
401 |
def create_csv_from_df(df, output_path):
|
402 |
df.to_csv(output_path, index=False)
|
|
|
325 |
else:
|
326 |
print("Failed to retrieve transcripts.")
|
327 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
# Return the sorted output, combined gene sequences, and all exons
|
329 |
+
return results, all_gene_sequences, all_exons
|
330 |
|
331 |
|
332 |
def create_genbank_features(data):
|
|
|
343 |
for row in formatted_data:
|
344 |
try:
|
345 |
start = int(row[1])
|
346 |
+
end = start + len(row[6]) # Calculate the end position based on the target sequence length
|
347 |
except ValueError as e:
|
348 |
print(f"Error converting start/end to int: {row[1]}, {row[2]} - {e}")
|
349 |
continue
|
350 |
|
351 |
+
strand = 1 if row[3] == '1' else -1
|
352 |
location = FeatureLocation(start=start, end=end, strand=strand)
|
353 |
+
is_mutation = 'Yes' if row[9] else 'No'
|
354 |
feature = SeqFeature(location=location, type="misc_feature", qualifiers={
|
355 |
'label': row[7], # Use gRNA as the label
|
356 |
+
'note': f"Prediction: {row[8]}, Mutation: {is_mutation}" # Include the prediction score and mutation status
|
357 |
})
|
358 |
features.append(feature)
|
359 |
|
360 |
return features
|
361 |
|
|
|
362 |
def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
|
363 |
# Ensure gene_sequence is a string before creating Seq object
|
364 |
if not isinstance(gene_sequence, str):
|
|
|
373 |
record.annotations["molecule_type"] = "DNA"
|
374 |
SeqIO.write(record, output_path, "genbank")
|
375 |
|
|
|
376 |
def create_bed_file_from_df(df, output_path):
    """Write one BED-style line per gRNA target in ``df`` to ``output_path``.

    Every line holds seven tab-separated fields: chrom, start, end, gRNA,
    prediction score, strand and a ``Yes``/``No`` mutation flag. The end
    position is ``start + len(target sequence)``.

    Args:
        df: DataFrame providing the columns ``Chr``, ``Target Start``,
            ``Target``, ``Strand``, ``gRNA``, ``Prediction`` and
            ``Is Mutation``.
        output_path: Path of the file to create or overwrite.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: If a ``Target Start`` value cannot be converted to int.
    """
    # The strand may arrive as the integer 1 or as text ('1', '+1', '+').
    # Comparing the raw value to the string '1' would classify an integer 1
    # as reverse strand, so normalise to text before testing.
    plus_strand_values = {"1", "+1", "+"}

    # Build every line before touching the file so that a malformed row
    # raises without leaving a truncated, half-written output behind.
    bed_lines = []
    for _, row in df.iterrows():
        chrom = row["Chr"]
        start = int(row["Target Start"])
        end = start + len(row["Target"])
        strand = '+' if str(row["Strand"]).strip() in plus_strand_values else '-'
        gRNA = row["gRNA"]
        score = str(row["Prediction"])
        is_mutation = 'Yes' if row["Is Mutation"] else 'No'
        bed_lines.append(
            f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\t{is_mutation}\n"
        )

    with open(output_path, 'w') as bed_file:
        bed_file.writelines(bed_lines)
|
|
|
391 |
|
392 |
def create_csv_from_df(df, output_path):
|
393 |
df.to_csv(output_path, index=False)
|