import streamlit as st
import requests
from io import StringIO
from Bio import SeqIO
import os
import time
import pandas as pd
from run_domain2go_app import *
def convert_df(df):
    """Serialize *df* to CSV (index column omitted) and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
# Disclaimer shown at the very top of the page, above the result tabs.
_DISCLAIMER_MD = """
Disclaimer
This program is designed to generate predictions for a single protein due to the extended runtime of InterProScan. If you need predictions for multiple UniProtKB/Swiss-Prot proteins, we recommend utilizing our comprehensive protein function prediction dataset available in our Github repository.
"""
st.markdown(_DISCLAIMER_MD, unsafe_allow_html=True)

# Two result tabs; the sidebar and the tab sections further down write into them.
domain_tab, pred_tab = st.tabs(['Domains', 'Function predictions'])

# Calling the element on the tab container renders it inside that tab,
# exactly like wrapping an st.* call in `with domain_tab:`.
domain_tab.header('Domains in sequence')
# Sidebar: collects the query (protein sequence and e-mail address) and
# defines `name`, `sequence`, `email` and `domains_submitted`, which the tab
# sections further down the script read.
with st.sidebar:
    st.title("Domain2GO: Mutual Annotation-Based Prediction of Protein Domain Functions")
    st.write("[![arXiv](https://img.shields.io/badge/bioRxiv-2022.11.03.514980-b31b1b.svg)](https://www.biorxiv.org/content/10.1101/2022.11.03.514980v1) [![github-repository](https://img.shields.io/badge/GitHub-black?logo=github)](https://github.com/HUBioDataLab/Domain2GO)")

    # Remember across reruns whether the example sequence should be pre-filled.
    if 'example_seq_button' not in st.session_state:
        st.session_state.example_seq_button = False

    def click_button():
        """Toggle the 'use example sequence' flag kept in session state."""
        st.session_state.example_seq_button = not st.session_state.example_seq_button

    # Always bind both names. Without these defaults, choosing
    # 'Upload FASTA file' and not uploading anything (or uploading a file with
    # no FASTA records) left them undefined, and the checks below as well as
    # the download-button file names raised NameError.
    name = ''
    sequence = ''

    input_type = st.radio('Select input type', ['Enter sequence', 'Upload FASTA file'])

    if input_type == 'Enter sequence':
        if st.session_state.example_seq_button:
            sequence = st.text_area('Enter protein sequence in FASTA format.',
                                    value='>sp|O18783|PLMN_NOTEU\n'
                                          'MEYGKVIFLFLLFLKSGQGESLENYIKTEGASLSNSQKKQFVASSTEECEALCEKETEFVCRSFEHYNKEQKCVIMSENSKTSSVERKRDVVLFEKRIYLSDCKSGNGRNYRGTLSKTKSGITCQKWSDLSPHVPNYAPSKYPDAGLEKNYCRNPDDDVKGPWCYTTNPDIRYEYCDVPECEDECMHCSGENYRGTISKTESGIECQPWDSQEPHSHEYIPSKFPSKDLKENYCRNPDGEPRPWCFTSNPEKRWEFCNIPRCSSPPPPPGPMLQCLKGRGENYRGKIAVTKSGHTCQRWNKQTPHKHNRTPENFPCRGLDENYCRNPDGELEPWCYTTNPDVRQEYCAIPSCGTSSPHTDRVEQSPVIQECYEGKGENYRGTTSTTISGKKCQAWSSMTPHQHKKTPDNFPNADLIRNYCRNPDGDKSPWCYTMDPTVRWEFCNLEKCSGTGSTVLNAQTTRVPSVDTTSHPESDCMYGSGKDYRGKRSTTVTGTLCQAWTAQEPHRHTIFTPDTYPRAGLEENYCRNPDGDPNGPWCYTTNPKKLFDYCDIPQCVSPSSFDCGKPRVEPQKCPGRIVGGCYAQPHSWPWQISLRTRFGEHFCGGTLIAPQWVLTAAHCLERSQWPGAYKVILGLHREVNPESYSQEIGVSRLFKGPLAADIALLKLNRPAAINDKVIPACLPSQDFMVPDRTLCHVTGWGDTQGTSPRGLLKQASLPVIDNRVCNRHEYLNGRVKSTELCAGHLVGRGDSCQGDSGGPLICFEDDKYVLQGVTSWGLGCARPNKPGVYVRVSRYISWIEDVMKNN')
        else:
            sequence = st.text_input('Enter protein sequence in FASTA format.')
        # The first line of the input, with '>' stripped, is used as the query name.
        name = sequence.split('\n')[0].strip('>')
        st.button('Use example sequence', on_click=click_button)
    else:
        protein_input = st.file_uploader('Choose file')
        # file_uploader yields a falsy value until a file has been provided.
        if protein_input:
            protein_input_stringio = StringIO(protein_input.getvalue().decode("utf-8"))
            fasta_sequences = SeqIO.parse(protein_input_stringio, 'fasta')
            # If the file holds several records, the last one is the one kept.
            for fasta in fasta_sequences:
                name, sequence = fasta.id, str(fasta.seq)

    email = st.text_input('Enter your email for InterProScan query: ')

    # Only treat the query as submitted when both an e-mail address and a
    # sequence are present; otherwise tell the user what is missing.
    domains_submitted = False
    if st.button('Find domains'):
        if email and sequence:
            domains_submitted = True
            # NOTE(review): this flag is stored but not read in the visible code.
            st.session_state.disabled = True
        else:
            st.warning('Please enter your email and protein sequence first.')
    else:
        with domain_tab:
            st.warning('Please enter your query and click "Find domains" to see domains in sequence.')
# "Domains" tab: runs InterProScan for a newly submitted query and shows the
# domain table currently held in session state.
with domain_tab:
    # Outcome flags read by the "Function predictions" tab to choose its message.
    no_domains = False
    error_in_interproscan = False

    if domains_submitted:
        with st.spinner('Finding domains in sequence using InterProScan. This may take a while...'):
            result = find_domains(email, sequence, name)
        result_text = result[0]

        # A new query invalidates anything an earlier query left behind.
        # Without this, a failed or empty search kept displaying the previous
        # query's domains and predictions, and the flags above never took
        # effect in the prediction tab.
        for stale_key in ('domain_df', 'pred_df'):
            if stale_key in st.session_state:
                del st.session_state[stale_key]

        if result_text == 'Domains found.':
            # Leading space keeps the two sentences from running together.
            st.success(result_text + ' You can now see function predictions for the sequence in the "Function predictions" tab.')
            st.session_state['domain_df'] = result[1]
        elif result_text == 'No domains found.':
            st.warning(result_text)
            no_domains = True
        else:
            # Any other status is reported as an error together with the job
            # id and response returned alongside it.
            st.error(result_text)
            st.write(f'InterProScan job id: {result[1]}')
            st.write(f'InterProScan job response: {result[2]}')
            error_in_interproscan = True

    # Session state survives reruns, so the table stays visible after the
    # button click that produced it.
    if 'domain_df' in st.session_state:
        with st.expander('Show domains in sequence'):
            st.write(st.session_state.domain_df)
        domains_csv = convert_df(st.session_state.domain_df)
        st.download_button(
            label="Download domains in sequence as CSV",
            data=domains_csv,
            file_name=f"{name}_domains.csv",
            mime="text/csv",
        )
# "Function predictions" tab: derives predictions from the domain table in
# session state, or explains why none can be shown.
with pred_tab:
    st.header('Function predictions')

    if 'domain_df' not in st.session_state:
        # No domain table available: pick the message that matches what
        # happened in the "Domains" tab during this run.
        if no_domains:
            st.warning('No domains found. Please find domains in sequence first.')
        elif error_in_interproscan:
            st.error('Error in InterProScan. Please check InterProScan job id and response.')
        else:
            st.warning('Please find domains in sequence first.')
    else:
        with st.spinner('Generating function predictions...'):
            cwd = os.getcwd()
            # Built from the part of the working directory that precedes the
            # first "Domain2GO"; it only points at the data folder when the
            # app is started from a path containing that name.
            mapping_path = "{}Domain2GO/data".format(cwd.split("Domain2GO")[0])
            pred_results = generate_function_predictions(st.session_state.domain_df, mapping_path)
        pred_result_text = pred_results[0]

        if pred_result_text == 'Function predictions found.':
            st.success(pred_result_text)
            st.session_state['pred_df'] = pred_results[1]
        else:
            # Drop predictions left over from an earlier query so they are not
            # shown (and offered for download) as if they belonged to the
            # current domain table.
            if 'pred_df' in st.session_state:
                del st.session_state['pred_df']
            if pred_result_text == 'No function predictions found.':
                st.warning(pred_result_text)

        if 'pred_df' in st.session_state:
            with st.expander('Show function predictions'):
                st.write(st.session_state.pred_df)
            pred_csv = convert_df(st.session_state.pred_df)
            st.download_button(
                label="Download function predictions as CSV",
                data=pred_csv,
                file_name=f"{name}_function_predictions.csv",
                mime="text/csv",
            )