Homo_hetero / app.py
Jayesh13's picture
Update app.py
0ca1e97 verified
import os
# NOTE(review): this runs a shell `pip install` every time the module is
# executed, and the exit status returned by os.system() is ignored, so a
# failed install only surfaces later as an ImportError. Consider declaring
# these packages in a requirements file instead -- TODO confirm how this app
# is deployed before removing.
os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
import pandas as pd
import random
from collections import defaultdict
from pymongo import MongoClient
import streamlit as st
# MongoDB connection. The full connection URI (including credentials) is read
# from the MONGODB_URI environment variable so that no secret lives in source
# control. NOTE: any credential that was previously committed in this file
# should be considered leaked and rotated.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )
client = MongoClient(mongodb_uri)
db = client['BTP_DB']
results_collection = db['protein_results']
# Function to fragment the protein sequence into chunks of max length 1000
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive, non-overlapping chunks.

    Args:
        sequence: The sequence to split (any sliceable value, e.g. ``str``).
        max_length: Maximum chunk length; must be at least 1.

    Returns:
        A list of slices of ``sequence`` in their original order. Every chunk
        has ``max_length`` items except possibly the last one, which holds the
        remainder. An empty ``sequence`` yields an empty list.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        # A zero step makes range() fail with an unrelated-looking message and
        # a negative step silently produces no chunks, dropping the input.
        raise ValueError(f"max_length must be >= 1, got {max_length}")
    return [
        sequence[start:start + max_length]
        for start in range(0, len(sequence), max_length)
    ]
# Function to find repeating amino acid sequences
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences may overlap (``"AAAA"`` contains ``"AA"`` three times).

    Args:
        sequence: The string to scan.

    Returns:
        A plain ``dict`` mapping each repeated substring to its number of
        occurrences, ordered by substring length and then by first occurrence.
        Substrings that occur only once are omitted.
    """
    repeats = {}
    seq_len = len(sequence)
    for length in range(2, seq_len + 1):
        # Count per length so that only one length's substrings are held in
        # memory at a time.
        counts = defaultdict(int)
        for start in range(seq_len - length + 1):
            counts[sequence[start:start + length]] += 1
        repeated = {sub: n for sub, n in counts.items() if n > 1}
        if not repeated:
            # If nothing of this length repeats, nothing longer can either:
            # a repeated longer substring would have a repeated prefix of
            # this length. Stopping here does not change the result.
            break
        repeats.update(repeated)
    return repeats
# Function to check and update repeats at boundaries
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add repeats found around fragment junctions to ``final_repeats``.

    For every pair of adjacent fragments, the last ``overlap`` residues of the
    left fragment and the first ``overlap`` residues of the right fragment are
    joined into one window, and that window is scanned with
    ``find_hetero_amino_acid_repeats``.

    Args:
        fragments: Ordered list of sequence fragments.
        final_repeats: Mapping of substring -> count. It is updated in place
            and may be a plain ``dict`` or a ``defaultdict``.
        overlap: Number of residues taken from each side of a junction.

    Returns:
        The same ``final_repeats`` object, after the update.
    """
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        # Slicing clamps to the fragment length, so fragments shorter than
        # ``overlap`` are used whole.
        left_overlap = left_fragment[-overlap:]
        right_overlap = right_fragment[:overlap]
        boundary_repeats = find_hetero_amino_acid_repeats(left_overlap + right_overlap)
        for substring, count in boundary_repeats.items():
            # NOTE(review): this is a per-character membership test -- it
            # passes when any single residue of the substring appears on each
            # side, not only when the substring spans the junction. Window
            # repeats that were already counted inside a fragment can
            # therefore be counted again here. Kept as-is; confirm the
            # intended counting rule before changing it.
            in_left = any(aa in left_overlap for aa in substring)
            if in_left and any(aa in right_overlap for aa in substring):
                # .get() keeps this working for plain dicts as well as
                # defaultdicts.
                final_repeats[substring] = final_repeats.get(substring, 0) + count
    return final_repeats
# Function to find new repeats that only appear at fragmentation points
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect junction-window repeats that are absent from ``final_repeats``.

    The window around each junction is built from the last ``overlap``
    residues of the left fragment and the first ``overlap`` residues of the
    right fragment, then scanned with ``find_hetero_amino_acid_repeats``.

    Args:
        fragments: Ordered list of sequence fragments.
        final_repeats: Mapping of already known repeats; only membership is
            checked and it is not modified.
        overlap: Number of residues taken from each side of a junction.

    Returns:
        A ``defaultdict(int)`` mapping each newly found substring to its count
        summed over all junctions.
    """
    new_repeats = defaultdict(int)
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        # Slicing clamps to the fragment length, so fragments shorter than
        # ``overlap`` are used whole.
        left_overlap = left_fragment[-overlap:]
        right_overlap = right_fragment[:overlap]
        boundary_repeats = find_hetero_amino_acid_repeats(left_overlap + right_overlap)
        for substring, count in boundary_repeats.items():
            if substring in final_repeats:
                continue
            # NOTE(review): per-character membership test -- it passes when
            # any single residue of the substring appears on each side, not
            # only when the substring spans the junction.
            in_left = any(aa in left_overlap for aa in substring)
            if in_left and any(aa in right_overlap for aa in substring):
                new_repeats[substring] += count
    return new_repeats
# Main function to process the protein sequence
def process_protein_sequence(sequence, overlap=50):
    """Count repeated substrings in ``sequence``, fragment by fragment.

    The sequence is split with ``fragment_protein_sequence`` (default chunk
    size), each fragment is scanned with ``find_hetero_amino_acid_repeats``,
    and the per-fragment counts are summed. Repeats found in the windows
    around fragment junctions are then added.

    Args:
        sequence: The protein sequence to analyse.
        overlap: Number of residues taken from each side of a fragment
            junction when scanning the junction windows.

    Returns:
        A ``defaultdict(int)`` mapping each repeated substring to its
        accumulated count.
    """
    fragments = fragment_protein_sequence(sequence)
    final_repeats = defaultdict(int)

    # Sum the repeats found independently inside each fragment.
    for fragment in fragments:
        for substring, count in find_hetero_amino_acid_repeats(fragment).items():
            final_repeats[substring] += count

    # Add the repeats found in the windows around fragment junctions.
    final_repeats = check_boundary_repeats(fragments, final_repeats, overlap)

    # NOTE(review): check_boundary_repeats has already inserted every
    # substring that passes the junction filter, and the call below applies
    # the same filter to the same windows, so it appears to always return an
    # empty mapping here. Kept to preserve the existing pipeline -- confirm
    # whether this step was meant to run before the update above.
    new_repeats = find_new_boundary_repeats(fragments, final_repeats, overlap)
    for substring, count in new_repeats.items():
        final_repeats[substring] += count

    return final_repeats
# Streamlit UI for uploading and processing the Excel file
st.title("Protein Sequence Repeat Finder from Excel")

# Step 1: Upload the Excel file
uploaded_file = st.file_uploader("Upload Excel file containing Protein Sequences", type=["xlsx"])

if uploaded_file is not None:
    # Step 2: Read the Excel file using Pandas
    df = pd.read_excel(uploaded_file)

    # Show the first few rows of the uploaded data for preview
    st.write("Preview of Uploaded Data:")
    st.write(df.head())

    # The processing loop reads these three columns by name; report a missing
    # column up front instead of failing with a KeyError mid-run.
    required_columns = ("Entry", "Entry Name", "Sequence")
    missing_columns = [name for name in required_columns if name not in df.columns]

    if missing_columns:
        st.error("Uploaded file is missing required column(s): " + ", ".join(missing_columns))
    elif st.button("Process Protein Sequences"):
        # Step 3: Process each protein sequence
        results = []
        skipped_rows = []
        for index, row in df.iterrows():
            protein_id = row["Entry"]
            protein_name = row["Entry Name"]
            sequence = row["Sequence"]  # The protein sequence is read from the 'Sequence' column

            # Cells that are not text (e.g. NaN for an empty cell) cannot be
            # scanned for repeats; skip them rather than abort the whole run.
            if not isinstance(sequence, str):
                skipped_rows.append(index)
                continue

            # Process the protein sequence
            repeats = process_protein_sequence(sequence)

            # Prepare data for MongoDB
            result_data = {
                "protein_id": protein_id,
                "protein_name": protein_name,
                "protein_sequence": sequence,
                "calculated_repeats": repeats,
            }

            # Insert results into MongoDB
            results_collection.insert_one(result_data)

            # Add results to display
            results.append({
                "Entry": protein_id,
                "Entry Name": protein_name,
                "Repeats": repeats,
            })

        if skipped_rows:
            st.warning(f"Skipped {len(skipped_rows)} row(s) without a text sequence: {skipped_rows}")

        # Step 4: Display the results
        st.subheader("Protein Sequences Processed")
        st.write(results)
        st.success("Protein sequences processed and results stored in MongoDB.")