Spaces:

NiniCat
/

CRISPRTool

Sleeping

App Files Files Community

supercat666 committed on Mar 24

Commit

4fa4501

•

1 Parent(s): 24d7d26

fix

Browse files

Files changed (2) hide show

app.py +34 -51
cas9on.py +123 -131

app.py CHANGED Viewed

@@ -144,24 +144,20 @@ if selected_model == 'Cas9':
         if 'exons' not in st.session_state:
             st.session_state['exons'] = []
-        if 'cds' not in st.session_state:
-            st.session_state['cds'] = []
         # Process predictions
         if predict_button and gene_symbol:
             with st.spinner('Predicting... Please wait'):
-                predictions, gene_sequence, exons, cds = cas9on.process_gene(gene_symbol, cas9on_path)
                 sorted_predictions = sorted(predictions, key=lambda x: x[-1], reverse=True)[:10]
                 st.session_state['on_target_results'] = sorted_predictions
                 st.session_state['gene_sequence'] = gene_sequence  # Save gene sequence in session state
                 st.session_state['exons'] = exons  # Store exon data
-                st.session_state['cds'] = cds  # Store CDS data
             # Notify the user once the process is completed successfully.
             st.success('Prediction completed!')
             st.session_state['prediction_made'] = True
             if 'on_target_results' in st.session_state and st.session_state['on_target_results']:
                 ensembl_id = gene_annotations.get(gene_symbol, 'Unknown')  # Get Ensembl ID or default to 'Unknown'
                 col1, col2, col3 = st.columns(3)
@@ -177,7 +173,7 @@ if selected_model == 'Cas9':
                 # Include "Target" in the DataFrame's columns
                 try:
                     df = pd.DataFrame(st.session_state['on_target_results'],
-                                      columns=["Chr", "Start Pos", "End Pos", "Strand", "Transcript", "Target", "gRNA", "Prediction"])
                     st.dataframe(df)
                 except ValueError as e:
                     st.error(f"DataFrame creation error: {e}")
@@ -189,7 +185,6 @@ if selected_model == 'Cas9':
                 EXON_BASE = 0  # Base position for exons and CDS on the Y axis
                 EXON_HEIGHT = 0.02  # How 'tall' the exon markers should appear
-                CDS_HEIGHT = 0.04  # How 'tall' the CDS markers should appear
                 # Plot Exons as small markers on the X-axis
                 for exon in st.session_state['exons']:
@@ -203,18 +198,6 @@ if selected_model == 'Cas9':
                         name='Exon'
                     ))
-                # Plot CDS in a similar manner
-                for cds in st.session_state['cds']:
-                    cds_start, cds_end = cds['start'], cds['end']
-                    fig.add_trace(go.Bar(
-                        x=[(cds_start + cds_end) / 2],
-                        y=[CDS_HEIGHT],
-                        width=[cds_end - cds_start],
-                        base=[EXON_BASE],
-                        marker_color='rgba(0, 0, 255, 1)',
-                        name='CDS'
-                    ))
                 VERTICAL_GAP = 0.2  # Gap between different ranks
                 # Define max and min Y values based on strand and rank
@@ -254,38 +237,38 @@ if selected_model == 'Cas9':
                 # Display the plot
                 st.plotly_chart(fig)
-                if 'gene_sequence' in st.session_state and st.session_state['gene_sequence']:
-                    gene_symbol = st.session_state['current_gene_symbol']
-                    gene_sequence = st.session_state['gene_sequence']
-                    # Define file paths
-                    genbank_file_path = f"{gene_symbol}_crispr_targets.gb"
-                    bed_file_path = f"{gene_symbol}_crispr_targets.bed"
-                    csv_file_path = f"{gene_symbol}_crispr_predictions.csv"
-                    # Generate files
-                    cas9on.generate_genbank_file_from_df(df, gene_sequence, gene_symbol, genbank_file_path)
-                    cas9on.create_bed_file_from_df(df, bed_file_path)
-                    cas9on.create_csv_from_df(df, csv_file_path)
-                    # Prepare an in-memory buffer for the ZIP file
-                    zip_buffer = io.BytesIO()
-                    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
-                        # For each file, add it to the ZIP file
-                        zip_file.write(genbank_file_path, arcname=genbank_file_path.split('/')[-1])
-                        zip_file.write(bed_file_path, arcname=bed_file_path.split('/')[-1])
-                        zip_file.write(csv_file_path, arcname=csv_file_path.split('/')[-1])
-                    # Important: move the cursor to the beginning of the BytesIO buffer before reading it
-                    zip_buffer.seek(0)
-                    # Display the download button for the ZIP file
-                    st.download_button(
-                        label="Download genbank,.bed,csv files as ZIP",
-                        data=zip_buffer.getvalue(),
-                        file_name=f"{gene_symbol}_files.zip",
-                        mime="application/zip"
-                    )
     elif target_selection == 'off-target':
         ENTRY_METHODS = dict(

         if 'exons' not in st.session_state:
             st.session_state['exons'] = []
         # Process predictions
         if predict_button and gene_symbol:
             with st.spinner('Predicting... Please wait'):
+                predictions, gene_sequence, exons  = cas9on.process_gene(gene_symbol, cas9on_path)
                 sorted_predictions = sorted(predictions, key=lambda x: x[-1], reverse=True)[:10]
                 st.session_state['on_target_results'] = sorted_predictions
                 st.session_state['gene_sequence'] = gene_sequence  # Save gene sequence in session state
                 st.session_state['exons'] = exons  # Store exon data
             # Notify the user once the process is completed successfully.
             st.success('Prediction completed!')
             st.session_state['prediction_made'] = True
             if 'on_target_results' in st.session_state and st.session_state['on_target_results']:
                 ensembl_id = gene_annotations.get(gene_symbol, 'Unknown')  # Get Ensembl ID or default to 'Unknown'
                 col1, col2, col3 = st.columns(3)
                 # Include "Target" in the DataFrame's columns
                 try:
                     df = pd.DataFrame(st.session_state['on_target_results'],
+                                      columns=["Chr", "Start Pos", "End Pos", "Strand", "Transcript", "Exon", "Target", "gRNA", "Prediction"])
                     st.dataframe(df)
                 except ValueError as e:
                     st.error(f"DataFrame creation error: {e}")
                 EXON_BASE = 0  # Base position for exons and CDS on the Y axis
                 EXON_HEIGHT = 0.02  # How 'tall' the exon markers should appear
                 # Plot Exons as small markers on the X-axis
                 for exon in st.session_state['exons']:
                         name='Exon'
                     ))
                 VERTICAL_GAP = 0.2  # Gap between different ranks
                 # Define max and min Y values based on strand and rank
                 # Display the plot
                 st.plotly_chart(fig)
+                # if 'gene_sequence' in st.session_state and st.session_state['gene_sequence']:
+                #     gene_symbol = st.session_state['current_gene_symbol']
+                #     gene_sequence = st.session_state['gene_sequence']
+                #
+                #     # Define file paths
+                #     genbank_file_path = f"{gene_symbol}_crispr_targets.gb"
+                #     bed_file_path = f"{gene_symbol}_crispr_targets.bed"
+                #     csv_file_path = f"{gene_symbol}_crispr_predictions.csv"
+                #
+                #     # Generate files
+                #     cas9on.generate_genbank_file_from_df(df, gene_sequence, gene_symbol, genbank_file_path)
+                #     cas9on.create_bed_file_from_df(df, bed_file_path)
+                #     cas9on.create_csv_from_df(df, csv_file_path)
+                #
+                #     # Prepare an in-memory buffer for the ZIP file
+                #     zip_buffer = io.BytesIO()
+                #     with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+                #         # For each file, add it to the ZIP file
+                #         zip_file.write(genbank_file_path, arcname=genbank_file_path.split('/')[-1])
+                #         zip_file.write(bed_file_path, arcname=bed_file_path.split('/')[-1])
+                #         zip_file.write(csv_file_path, arcname=csv_file_path.split('/')[-1])
+                #
+                #     # Important: move the cursor to the beginning of the BytesIO buffer before reading it
+                #     zip_buffer.seek(0)
+                #
+                #     # Display the download button for the ZIP file
+                #     st.download_button(
+                #         label="Download genbank,.bed,csv files as ZIP",
+                #         data=zip_buffer.getvalue(),
+                #         file_name=f"{gene_symbol}_files.zip",
+                #         mime="application/zip"
+                #     )
     elif target_selection == 'off-target':
         ENTRY_METHODS = dict(

cas9on.py CHANGED Viewed

@@ -39,167 +39,159 @@ class DCModelOntar:
         yp = self.model.predict(x)
         return yp.ravel()
-# Function to predict on-target efficiency and format output
-def format_prediction_output(targets, model_path):
-    dcModel = DCModelOntar(model_path)
-    formatted_data = []
-    for target in targets:
-        # Encode the gRNA sequence
-        encoded_seq = get_seqcode(target[0]).reshape(-1,4,1,23)
-        # Predict on-target efficiency using the model
-        prediction = dcModel.ontar_predict(encoded_seq)
-        # Format output
-        sgRNA = target[1]
-        chr = target[2]
-        start = target[3]
-        end = target[4]
-        strand = target[5]
-        transcript_id = target[6]
-        formatted_data.append([chr, start, end, strand, transcript_id, target[0], sgRNA, prediction[0]])
-    return formatted_data
 def fetch_ensembl_transcripts(gene_symbol):
-    headers = {"Content-Type": "application/json"}
-    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1"
-    response = requests.get(url, headers=headers)
     if response.status_code == 200:
         gene_data = response.json()
-        return gene_data.get('Transcript', [])
     else:
         print(f"Error fetching gene data from Ensembl: {response.text}")
         return None
 def fetch_ensembl_sequence(transcript_id):
-    headers = {"Content-Type": "application/json"}
-    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}"
-    response = requests.get(url, headers=headers)
     if response.status_code == 200:
         sequence_data = response.json()
-        return sequence_data.get('seq', '')
     else:
-        print(f"Error fetching sequence data from Ensembl for transcript {transcript_id}: {response.text}")
         return None
-def fetch_ensembl_exons(transcript_id):
-    headers = {"Content-Type": "application/json"}
-    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=exon"
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        print(f"Error fetching exon data from Ensembl for transcript {transcript_id}: {response.text}")
-        return None
-def fetch_ensembl_cds(transcript_id):
-    headers = {"Content-Type": "application/json"}
-    url = f"https://rest.ensembl.org/overlap/id/{transcript_id}?feature=cds"
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        print(f"Error fetching CDS data from Ensembl for transcript {transcript_id}: {response.text}")
-        return None
-def find_crispr_targets(sequence, chr, start, strand, transcript_id, pam="NGG", target_length=20):
     targets = []
     len_sequence = len(sequence)
     complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
     if strand == -1:
-        sequence = ''.join([complement[base] for base in reversed(sequence)])
     for i in range(len_sequence - len(pam) + 1):
         if sequence[i + 1:i + 3] == pam[1:]:
             if i >= target_length:
                 target_seq = sequence[i - target_length:i + 3]
                 tar_start = start + i - target_length
                 tar_end = start + i + 3
-                sgRNA = sequence[i - target_length:i]
-                targets.append([target_seq, sgRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id])
     return targets
 def process_gene(gene_symbol, model_path):
     transcripts = fetch_ensembl_transcripts(gene_symbol)
-    all_data = []
     if transcripts:
-        cdslist = fetch_ensembl_cds(transcripts[0].get('id'))
-        for transcript in transcripts:
-            transcript_id = transcript.get('id')
-            chr = transcript.get('seq_region_name', 'unknown')
-            start = transcript.get('start', 0)
-            strand = transcript.get('strand', 'unknown')
-            # Fetch the gene sequence for each transcript
-            gene_sequence = fetch_ensembl_sequence(transcript_id) or ''
-            # Fetch exon and CDS information is not directly used here but you may need it elsewhere
-            exons = fetch_ensembl_exons(transcript_id)
-            if gene_sequence:
-                # Now correctly passing transcript_id as an argument
-                gRNA_sites = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id)
-                if gRNA_sites:
-                    formatted_data = format_prediction_output(gRNA_sites, model_path)
-                    all_data.extend(formatted_data)
-    # Return the data and potentially any other information as needed
-    return all_data, gene_sequence, exons, cdslist
-def create_genbank_features(formatted_data):
-    features = []
-    for data in formatted_data:
-        # Strand conversion to Biopython's convention
-        strand = 1 if data[3] == '+' else -1
-        location = FeatureLocation(start=int(data[1]), end=int(data[2]), strand=strand)
-        feature = SeqFeature(location=location, type="misc_feature", qualifiers={
-            'label': data[5],  # Use gRNA as the label
-            'target': data[4],  # Include the target sequence
-            'note': f"Prediction: {data[6]}"  # Include the prediction score
-        })
-        features.append(feature)
-    return features
-def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
-    features = []
-    for index, row in df.iterrows():
-        # Use 'Transcript ID' if it exists, otherwise use a default value like 'Unknown'
-        transcript_id = row.get("Transcript ID", "Unknown")
-        # Make sure to use the correct column names for Start Pos, End Pos, and Strand
-        location = FeatureLocation(start=int(row["Start Pos"]),
-                                   end=int(row["End Pos"]),
-                                   strand=1 if row["Strand"] == '+' else -1)
-        feature = SeqFeature(location=location, type="gene", qualifiers={
-            'locus_tag': transcript_id,  # Now using the variable that holds the safe value
-            'note': f"gRNA: {row['gRNA']}, Prediction: {row['Prediction']}"
-        })
-        features.append(feature)
-    # The rest of the function remains unchanged
-    record = SeqRecord(Seq(gene_sequence), id=gene_symbol, name=gene_symbol,
-                       description=f'CRISPR Cas9 predicted targets for {gene_symbol}', features=features)
-    record.annotations["molecule_type"] = "DNA"
-    SeqIO.write(record, output_path, "genbank")
-def create_bed_file_from_df(df, output_path):
-    with open(output_path, 'w') as bed_file:
-        for index, row in df.iterrows():
-            # Adjust field names based on your actual formatted data
-            chrom = row["Chr"]
-            start = int(row["Start Pos"])
-            end = int(row["End Pos"])
-            strand = '+' if row["Strand"] == '+' else '-'  # Ensure strand is correctly interpreted
-            gRNA = row["gRNA"]
-            score = str(row["Prediction"])  # Ensure score is converted to string if not already
-            transcript_id = row["Transcript"]  # Extract transcript ID
-            bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\t{transcript_id}\n")  # Include transcript ID in BED output
-def create_csv_from_df(df, output_path):
-    df.to_csv(output_path, index=False)

         yp = self.model.predict(x)
         return yp.ravel()
 def fetch_ensembl_transcripts(gene_symbol):
+    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
+    response = requests.get(url)
     if response.status_code == 200:
         gene_data = response.json()
+        if 'Transcript' in gene_data:
+            return gene_data['Transcript']
+        else:
+            print("No transcripts found for gene:", gene_symbol)
+            return None
     else:
         print(f"Error fetching gene data from Ensembl: {response.text}")
         return None
 def fetch_ensembl_sequence(transcript_id):
+    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?content-type=application/json"
+    response = requests.get(url)
     if response.status_code == 200:
         sequence_data = response.json()
+        if 'seq' in sequence_data:
+            return sequence_data['seq']
+        else:
+            print("No sequence found for transcript:", transcript_id)
+            return None
     else:
+        print(f"Error fetching sequence data from Ensembl: {response.text}")
         return None
def find_crispr_targets(sequence, chr, start, strand, transcript_id, exon_id, pam="NGG", target_length=20):
    """Scan a DNA sequence for PAM sites and return the candidate targets before them.

    Args:
        sequence: DNA sequence to scan (upper-case A/C/G/T expected).
        chr: Chromosome / sequence-region name, copied into each result.
        start: Coordinate of ``sequence[0]``; offsets are added to it.
        strand: ``-1`` triggers base-wise complementing of ``sequence``
            before scanning; any other value scans it unchanged.
        transcript_id: Copied into each result.
        exon_id: Copied into each result.
        pam: PAM pattern; ``N`` matches any base, other letters must match
            exactly.
        target_length: Number of bases taken immediately before the PAM.

    Returns:
        A list of ``[target_seq, gRNA, chr, start, end, strand,
        transcript_id, exon_id]`` entries, where ``target_seq`` is the
        protospacer plus PAM, ``gRNA`` is the protospacer with T replaced
        by U, and start/end/strand are strings. Windows containing any
        character other than A/C/G/T are skipped instead of raising
        ``KeyError``.
    """
    complement = str.maketrans("ATCG", "TAGC")
    dna_to_rna = str.maketrans("T", "U")
    valid_bases = frozenset("ACGT")
    pam_length = len(pam)

    if strand == -1:
        # NOTE(review): the minus-strand input is complemented but not
        # reversed, and coordinates below are still start + offset; confirm
        # this matches the orientation of the sequence callers pass in.
        # translate() leaves non-ACGT characters untouched instead of raising.
        sequence = sequence.translate(complement)

    targets = []
    # A site needs target_length bases before it, so start scanning there.
    for i in range(target_length, len(sequence) - pam_length + 1):
        site = sequence[i:i + pam_length]
        if any(expected != "N" and expected != base for expected, base in zip(pam, site)):
            continue
        target_seq = sequence[i - target_length:i + pam_length]
        if not valid_bases.issuperset(target_seq):
            # Ambiguous bases (e.g. N) cannot be converted to a guide; skip.
            continue
        tar_start = start + i - target_length
        tar_end = start + i + pam_length
        gRNA = sequence[i - target_length:i].translate(dna_to_rna)
        targets.append([target_seq, gRNA, chr, str(tar_start), str(tar_end), str(strand), transcript_id, exon_id])
    return targets
def format_prediction_output(targets, model_path):
    """Score every candidate target and arrange the fields into output rows.

    Args:
        targets: Iterable of entries laid out as ``[target_seq, gRNA, chr,
            start, end, strand, transcript_id, exon_id]``.
        model_path: Passed to ``DCModelOntar`` to build the predictor.

    Returns:
        A list of rows ``[chr, start, end, strand, transcript_id, exon_id,
        target_seq, gRNA, score]``, one per input entry, in input order.
    """
    predictor = DCModelOntar(model_path)
    rows = []
    for entry in targets:
        target_seq, guide, chrom, begin, finish, strand, transcript_id, exon_id = entry[:8]
        # The predictor is fed the encoded target reshaped to (-1, 4, 1, 23).
        encoded = get_seqcode(target_seq).reshape(-1, 4, 1, 23)
        scores = predictor.ontar_predict(encoded)
        rows.append([chrom, begin, finish, strand, transcript_id, exon_id, target_seq, guide, scores[0]])
    return rows
 def process_gene(gene_symbol, model_path):
     transcripts = fetch_ensembl_transcripts(gene_symbol)
+    results = []
     if transcripts:
+        for i in range(len(transcripts)):
+            Exons = transcripts[i]['Exon']
+            transcript_id = transcripts[i]['id']
+            for j in range(len(Exons)):
+                exon_id = Exons[j]['id']
+                gene_sequence = fetch_ensembl_sequence(exon_id)
+                if gene_sequence:
+                    start = Exons[j]['start']
+                    strand = Exons[j]['strand']
+                    chr = Exons[j]['seq_region_name']
+                    targets = find_crispr_targets(gene_sequence, chr, start, strand, transcript_id, exon_id)
+                    if not targets:
+                        print("No gRNA sites found in the gene sequence.")
+                    else:
+                        # Predict on-target efficiency for each gRNA site
+                        formatted_data = format_prediction_output(targets,model_path)
+                        results.append(formatted_data)
+                    # for data in formatted_data:
+                    #    print(f"Chr: {data[0]}, Start: {data[1]}, End: {data[2]}, Strand: {data[3]}, gRNA: {data[4]}, pred_Score: {data[5]}")
+                else:
+                    print("Failed to retrieve gene sequence.")
+    else:
+        print("Failed to retrieve transcripts.")
+    return results, gene_sequence, Exons
+# def create_genbank_features(formatted_data):
+#     features = []
+#     for data in formatted_data:
+#         # Strand conversion to Biopython's convention
+#         strand = 1 if data[3] == '+' else -1
+#         location = FeatureLocation(start=int(data[1]), end=int(data[2]), strand=strand)
+#         feature = SeqFeature(location=location, type="misc_feature", qualifiers={
+#             'label': data[5],  # Use gRNA as the label
+#             'target': data[4],  # Include the target sequence
+#             'note': f"Prediction: {data[6]}"  # Include the prediction score
+#         })
+#         features.append(feature)
+#     return features
+#
+# def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
+#     features = []
+#     for index, row in df.iterrows():
+#         # Use 'Transcript ID' if it exists, otherwise use a default value like 'Unknown'
+#         transcript_id = row.get("Transcript ID", "Unknown")
+#
+#         # Make sure to use the correct column names for Start Pos, End Pos, and Strand
+#         location = FeatureLocation(start=int(row["Start Pos"]),
+#                                    end=int(row["End Pos"]),
+#                                    strand=1 if row["Strand"] == '+' else -1)
+#         feature = SeqFeature(location=location, type="gene", qualifiers={
+#             'locus_tag': transcript_id,  # Now using the variable that holds the safe value
+#             'note': f"gRNA: {row['gRNA']}, Prediction: {row['Prediction']}"
+#         })
+#         features.append(feature)
+#
+#     # The rest of the function remains unchanged
+#     record = SeqRecord(Seq(gene_sequence), id=gene_symbol, name=gene_symbol,
+#                        description=f'CRISPR Cas9 predicted targets for {gene_symbol}', features=features)
+#     record.annotations["molecule_type"] = "DNA"
+#     SeqIO.write(record, output_path, "genbank")
+#
+#
+# def create_bed_file_from_df(df, output_path):
+#     with open(output_path, 'w') as bed_file:
+#         for index, row in df.iterrows():
+#             # Adjust field names based on your actual formatted data
+#             chrom = row["Chr"]
+#             start = int(row["Start Pos"])
+#             end = int(row["End Pos"])
+#             strand = '+' if row["Strand"] == '+' else '-'  # Ensure strand is correctly interpreted
+#             gRNA = row["gRNA"]
+#             score = str(row["Prediction"])  # Ensure score is converted to string if not already
+#             transcript_id = row["Transcript"]  # Extract transcript ID
+#             bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\t{transcript_id}\n")  # Include transcript ID in BED output
+#
+#
+# def create_csv_from_df(df, output_path):
+#     df.to_csv(output_path, index=False)