Spaces:
Sleeping
Sleeping
| import os | |
import subprocess
import sys

# Best-effort install of the runtime dependencies before they are imported
# below. Invoking pip through the running interpreter guarantees the packages
# land in the environment this script actually uses, and passing an argument
# list avoids going through a shell. A failed install is not fatal here; the
# imports that follow will raise if a package is genuinely missing.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "streamlit", "pandas", "xlsxwriter", "openpyxl", "pymongo",
    ],
    check=False,
)
| import pandas as pd | |
| import random | |
| from collections import defaultdict | |
| from pymongo import MongoClient | |
| import streamlit as st | |
# MongoDB connection string. It is read from the MONGODB_URI environment
# variable so that credentials are never stored in the source file.
# NOTE: any credentials that were ever committed with this file must be
# considered leaked and should be rotated.
MONGODB_URI = os.environ.get("MONGODB_URI")
if not MONGODB_URI:
    raise RuntimeError(
        "MONGODB_URI is not set; provide the MongoDB connection string "
        "through the environment before starting the app."
    )

client = MongoClient(MONGODB_URI)
db = client['BTP_DB']
results_collection = db['protein_results']
# Function to fragment the protein sequence into chunks of max length 1000
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive, non-overlapping pieces.

    Args:
        sequence: The sequence to cut (anything supporting ``len`` and slicing).
        max_length: Maximum size of each piece; the last piece may be shorter.

    Returns:
        A list of pieces in their original order; empty when ``sequence`` is
        empty.
    """
    pieces = []
    for start in range(0, len(sequence), max_length):
        pieces.append(sequence[start:start + max_length])
    return pieces
# Function to find repeating amino acid sequences
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences may overlap, e.g. ``"AAAA"`` contains ``"AA"`` three times.

    Args:
        sequence: The sequence to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences,
        ordered by substring length and then by first occurrence.
    """
    repeats = {}
    for length in range(2, len(sequence) + 1):
        counts = defaultdict(int)
        for start in range(len(sequence) - length + 1):
            counts[sequence[start:start + length]] += 1

        repeated = {sub: n for sub, n in counts.items() if n > 1}
        if not repeated:
            # Any repeated substring of length L + 1 has a repeated prefix of
            # length L, so once one length yields no repeats no longer length
            # can. Stopping here avoids building every long substring.
            break
        repeats.update(repeated)
    return repeats
# Function to check and update repeats at boundaries
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add occurrences that straddle a fragment boundary to known repeats.

    Counting repeats fragment by fragment cannot see an occurrence that starts
    in one fragment and ends in the next. For every pair of adjacent fragments
    this joins the last ``overlap`` items of the left fragment with the first
    ``overlap`` items of the right one and, for each substring that crosses
    the join and is already a key of ``final_repeats``, adds one to its count.

    Only occurrences that physically cross the join are counted, so
    occurrences lying entirely inside one fragment are not counted a second
    time, and substrings not yet in ``final_repeats`` are left alone.

    Args:
        fragments: Consecutive pieces of the original sequence, in order.
        final_repeats: Mapping of substring -> count; updated in place.
        overlap: How many items on each side of a boundary to inspect.

    Returns:
        The same ``final_repeats`` object, after the update.
    """
    if overlap <= 0:
        # Nothing on either side of the boundary to inspect.
        return final_repeats

    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        left_tail = left_fragment[max(0, len(left_fragment) - overlap):]
        right_head = right_fragment[:overlap]
        joined = left_tail + right_head
        split = len(left_tail)

        # start < split < end means the slice uses at least one item from each
        # side of the join (and is therefore at least two items long).
        for start in range(split):
            for end in range(split + 1, len(joined) + 1):
                candidate = joined[start:end]
                if candidate in final_repeats:
                    final_repeats[candidate] += 1
    return final_repeats
# Function to find new repeats that only appear at fragmentation points
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect repeats seen around fragment boundaries that are not yet known.

    For each pair of adjacent fragments, the last ``overlap`` items of the
    left fragment are joined with the first ``overlap`` items of the right
    one (a fragment shorter than ``overlap`` is used whole) and the joined
    region is scanned with ``find_hetero_amino_acid_repeats``. A repeat is
    reported when at least one of its characters occurs in each side of the
    region and it is not already a key of ``final_repeats``.

    Args:
        fragments: Consecutive pieces of the original sequence, in order.
        final_repeats: Mapping of already-known repeats; only read, never
            modified.
        overlap: How many items on each side of a boundary to inspect.

    Returns:
        A ``defaultdict(int)`` mapping each newly found substring to its
        count, summed over all boundaries.
    """
    discovered = defaultdict(int)
    for idx in range(1, len(fragments)):
        prev_fragment, next_fragment = fragments[idx - 1], fragments[idx]
        tail = prev_fragment[-overlap:] if len(prev_fragment) >= overlap else prev_fragment
        head = next_fragment[:overlap] if len(next_fragment) >= overlap else next_fragment

        region_repeats = find_hetero_amino_acid_repeats(tail + head)
        for candidate, occurrences in region_repeats.items():
            touches_tail = any(residue in tail for residue in candidate)
            touches_head = any(residue in head for residue in candidate)
            if not (touches_tail and touches_head):
                continue
            if candidate in final_repeats:
                continue
            discovered[candidate] += occurrences
    return discovered
# Main function to process the protein sequence
def process_protein_sequence(sequence, overlap=50):
    """Count repeated substrings in ``sequence`` using fragment-wise scanning.

    The sequence is cut with ``fragment_protein_sequence`` (default fragment
    size), each fragment is scanned with ``find_hetero_amino_acid_repeats``,
    and the per-fragment counts are summed. The totals are then passed through
    ``check_boundary_repeats`` and extended with whatever
    ``find_new_boundary_repeats`` reports.

    Args:
        sequence: The protein sequence to analyse.
        overlap: Width of the window inspected on each side of a fragment
            boundary; forwarded to both boundary helpers.

    Returns:
        A ``defaultdict(int)`` mapping substring -> accumulated count.
    """
    pieces = fragment_protein_sequence(sequence)

    # Sum the repeats found independently inside every fragment.
    totals = defaultdict(int)
    for piece in pieces:
        for motif, occurrences in find_hetero_amino_acid_repeats(piece).items():
            totals[motif] += occurrences

    # Adjust the totals for what happens around fragment boundaries.
    totals = check_boundary_repeats(pieces, totals, overlap)

    # Fold in repeats that only show up around fragment boundaries.
    boundary_only = find_new_boundary_repeats(pieces, totals, overlap)
    for motif, occurrences in boundary_only.items():
        totals[motif] += occurrences

    return totals
# Streamlit UI for uploading and processing the Excel file
st.title("Protein Sequence Repeat Finder from Excel")

# Columns the uploaded workbook must contain for processing to work.
REQUIRED_COLUMNS = ("Entry", "Entry Name", "Sequence")

# Step 1: Upload the Excel file
uploaded_file = st.file_uploader("Upload Excel file containing Protein Sequences", type=["xlsx"])

if uploaded_file is not None:
    # Step 2: Read the Excel file using Pandas
    df = pd.read_excel(uploaded_file)

    # Show the first few rows of the uploaded data for preview
    st.write("Preview of Uploaded Data:")
    st.write(df.head())

    # Fail early with a readable message instead of a KeyError mid-loop.
    missing_columns = [name for name in REQUIRED_COLUMNS if name not in df.columns]
    if missing_columns:
        st.error(
            "Uploaded file is missing required column(s): "
            + ", ".join(missing_columns)
        )
    # Step 3: Process each protein sequence
    elif st.button("Process Protein Sequences"):
        results = []
        skipped_rows = []

        for index, row in df.iterrows():
            protein_id = row["Entry"]
            protein_name = row["Entry Name"]
            sequence = row["Sequence"]  # The protein sequence is read from the 'Sequence' column

            # Empty spreadsheet cells are read as NaN (a float), which cannot
            # be sliced; skip such rows rather than aborting the whole run.
            if not isinstance(sequence, str) or not sequence.strip():
                skipped_rows.append(index)
                continue

            # Process the protein sequence
            repeats = process_protein_sequence(sequence)

            # Prepare data for MongoDB
            result_data = {
                "protein_id": protein_id,
                "protein_name": protein_name,
                "protein_sequence": sequence,
                "calculated_repeats": repeats,
            }

            # Insert results into MongoDB
            results_collection.insert_one(result_data)

            # Add results to display
            results.append({
                "Entry": protein_id,
                "Entry Name": protein_name,
                "Repeats": repeats,
            })

        if skipped_rows:
            st.warning(
                f"Skipped {len(skipped_rows)} row(s) with a missing or non-text sequence."
            )

        # Step 4: Display the results
        st.subheader("Protein Sequences Processed")
        st.write(results)
        st.success("Protein sequences processed and results stored in MongoDB.")