| | import os |
| | import streamlit as st |
| | import pandas as pd |
| | import csv |
| | import collections |
| | import joblib |
| |
|
| | root = os.path.dirname(os.path.abspath(__file__)) |
| |
|
| | FREQUENT_CUTOFF = 40 |
| | MEDIUM_CUTOFF = 10 |
| |
|
| | st.set_page_config( |
| | page_title="Ligand Disovery 2: Explore Protein-sets", |
| | page_icon=":home:", |
| | layout="wide", |
| | initial_sidebar_state="expanded" |
| | ) |
| |
|
| | st.markdown(""" |
| | <style> |
| | .css-13sdm1b.e16nr0p33 { |
| | margin-top: -75px; |
| | } |
| | </style> |
| | """, unsafe_allow_html=True) |
| |
|
| | hide_streamlit_style = """ |
| | <style> |
| | #MainMenu {visibility: hidden;} |
| | footer {visibility: hidden;} |
| | #header {visibility: hidden;} |
| | </style> |
| | """ |
| | st.markdown(hide_streamlit_style, unsafe_allow_html=True) |
| |
|
| |
|
| | |
| |
|
| | @st.cache_data() |
| | def load_screening_hits(): |
| | db = pd.read_csv(os.path.join(root, "./screening_hits.tsv"), sep="\t") |
| | return db |
| |
|
| | @st.cache_data() |
| | def load_human_proteome(): |
| | human_proteome = pd.read_csv(os.path.join(root, "./human_proteome_with_gene_names.tab"), sep="\t") |
| | return human_proteome |
| |
|
| | @st.cache_data() |
| | def load_hek_proteome(): |
| | hek_proteome = [] |
| | with open(os.path.join(root, "./hek293t_core.tsv"), "r") as f: |
| | reader = csv.reader(f) |
| | for r in reader: |
| | hek_proteome += [r[0]] |
| | hek_proteome = set(hek_proteome) |
| | return hek_proteome |
| |
|
| | @st.cache_data() |
| | def load_pid2name_primary(): |
| | return joblib.load(os.path.join(root, "./pid2name_primary.joblib")) |
| |
|
| | @st.cache_data() |
| | def convert_df(df): |
| | return df.to_csv(index=False).encode('utf-8') |
| |
|
| | @st.cache_data() |
| | def convert_df_no_header(df): |
| | return df.to_csv(index=False, header=False).encode('utf-8') |
| |
|
| | @st.cache_data() |
| | def example_input_load(): |
| | pids = [] |
| | with open(os.path.join(root, "./example_input.csv"), "r") as f: |
| | reader = csv.reader(f) |
| | for r in reader: |
| | pids += [r[0]] |
| | return pids |
| |
|
| | db = load_screening_hits() |
| | hek_proteome = load_hek_proteome() |
| | pid2name_primary = load_pid2name_primary() |
| | human_proteome = set(pid2name_primary.keys()) |
| | example_input = example_input_load() |
| |
|
| | any2pid = {} |
| | for k,v in pid2name_primary.items(): |
| | any2pid[v] = k |
| | any2pid[k] = k |
| |
|
| | pid2fid = collections.defaultdict(list) |
| | fid2pid = collections.defaultdict(list) |
| | for r in db[["Accession", "FragID"]].values: |
| | pid2fid[r[0]] += [r[1]] |
| | fid2pid[r[1]] += [r[0]] |
| |
|
| | frequent_hitters = set() |
| | normal_hitters = set() |
| | specific_hitters = set() |
| | for k,v in pid2fid.items(): |
| | if len(v) >= FREQUENT_CUTOFF: |
| | frequent_hitters.update([k]) |
| | continue |
| | if len(v) >= MEDIUM_CUTOFF: |
| | normal_hitters.update([k]) |
| | continue |
| | specific_hitters.update([k]) |
| |
|
| | options = sorted([x for k,v in pid2name_primary.items() for x in [k,v]]) |
| |
|
| | |
| |
|
| | st.sidebar.title("Ligand Discovery 2: Explore Protein-sets") |
| | st.sidebar.write("We screened 407 fully-functionalized small molecule fragments ('Ligands') in HEK293t cells. For {0} of the Ligands, we found at least one protein enriched. In total, we enriched {1} proteins at least once. Query your protein sets of interest and explore them in light of our dataset!".format(len(fid2pid), len(pid2fid))) |
| |
|
| | manual_input = st.sidebar.multiselect(label="Input proteins manually", options = [""] + sorted(options), default=[], help="Select proteins by UniProt Accession code or Gene Symbol") |
| | user_pids = {} |
| | user_input = [] |
| | for i in manual_input: |
| | user_pids[i] = any2pid[i] |
| | user_input += [i] |
| |
|
| | st.sidebar.subheader("OR") |
| |
|
| | fids = sorted(set(db["FragID"])) |
| | fid_input = st.sidebar.selectbox(label="Select pre-screened Ligand by identifier", options = [""] + fids, help="Select an already profiled Ligand in our primary screening (page Interactions). Use the Ligand identifier (example, C001)") |
| | if fid_input != "": |
| | user_input = fid2pid[fid_input] |
| | user_pids = dict((r,r) for r in user_input) |
| |
|
| | st.sidebar.subheader("OR") |
| |
|
| | example_file = db |
| | file_input = st.sidebar.file_uploader(label="Upload a file", help="Provide a file containing one UniProt Accession code or Gene Symbol per row.") |
| | if file_input: |
| | user_input = list(pd.read_csv(file_input, header=None)[0]) |
| | for i in user_input: |
| | user_pids[i] = any2pid[i] |
| |
|
| | st.sidebar.download_button(label="Download example file", data=convert_df_no_header(pd.DataFrame({"uniprot_ac": example_input})), file_name="protein_profile_example.csv", mime="text/csv") |
| |
|
| | |
| |
|
| | if not manual_input: |
| | manual_input = None |
| |
|
| | if not fid_input: |
| | fid_input = None |
| |
|
| | if not file_input: |
| | file_input = None |
| |
|
| | if not manual_input and not file_input and not fid_input: |
| | st.sidebar.info("Use any of the options above to explore a protein profile...") |
| | query_is_available = False |
| | else: |
| | c = 0 |
| | for x in [manual_input, fid_input, file_input]: |
| | if x is not None: |
| | c += 1 |
| | if c > 1: |
| | st.sidebar.error("More than one input type has been provided! Please only choose one of the options, i.e. input proteins manually, or select a pre-screened Ligand, or upload a file. Refresh this window to get started again.") |
| | query_is_available = False |
| | else: |
| | query_is_available = True |
| |
|
| |
|
| | def serialize_s(cat, r): |
| | s = [cat] + r[:-1] + [" ".join(r[-1])] |
| | return s |
| |
|
| | if query_is_available: |
| | columns = st.columns([0.5, 0.5]) |
| |
|
| | done = set() |
| |
|
| | col = columns[0] |
| | cat_name = "Frequently enriched" |
| | S = [] |
| | R = [] |
| | for r in user_input: |
| | pid = user_pids[r] |
| | if pid in frequent_hitters: |
| | R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), pid2fid[pid]]] |
| | S += [serialize_s(cat_name, R[-1])] |
| | done.update([r]) |
| | df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"]) |
| | col.markdown("**{0} (Low specificity)** : {1}".format(cat_name, df.shape[0])) |
| | col.dataframe(df, use_container_width=True) |
| |
|
| | col = columns[1] |
| | cat_name = "Medium specificity" |
| | R = [] |
| | for r in user_input: |
| | pid = user_pids[r] |
| | if pid in normal_hitters: |
| | R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), sorted(pid2fid[pid])]] |
| | S += [serialize_s(cat_name, R[-1])] |
| | done.update([r]) |
| | df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"]) |
| | col.markdown("**{0}** : {1}".format(cat_name, df.shape[0])) |
| | col.dataframe(df, use_container_width=True) |
| |
|
| | st.divider() |
| | columns = st.columns([0.5, 0.25, 0.25]) |
| | |
| | col = columns[0] |
| | cat_name = "High specificity" |
| | R = [] |
| | for r in user_input: |
| | pid = user_pids[r] |
| | if pid in specific_hitters: |
| | R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), sorted(pid2fid[pid])]] |
| | S += [serialize_s(cat_name, R[-1])] |
| | done.update([r]) |
| | df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"]) |
| | col.markdown("**{0}** : {1}".format(cat_name, df.shape[0])) |
| | col.dataframe(df, use_container_width=True) |
| |
|
| | col = columns[1] |
| | cat_name = "Never enriched" |
| | R = [] |
| | for r in user_input: |
| | if r in done: |
| | continue |
| | pid = user_pids[r] |
| | if pid in hek_proteome: |
| | R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), sorted(pid2fid[pid])]] |
| | S += [serialize_s(cat_name, R[-1])] |
| | done.update([r]) |
| | df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"]) |
| | col.markdown("**{0}** : {1}".format(cat_name, df.shape[0])) |
| | col.dataframe(df[["UniProt", "GeneName"]], use_container_width=True) |
| |
|
| | col = columns[2] |
| | cat_name = "Not in HEK293t" |
| | R = [] |
| | for r in user_input: |
| | if r in done: |
| | continue |
| | pid = user_pids[r] |
| | if pid in human_proteome: |
| | fids_ = sorted(pid2fid[pid]) |
| | R += [[pid, pid2name_primary[pid], len(pid2fid[pid]), fids_]] |
| | S += [serialize_s(cat_name, R[-1])] |
| | df = pd.DataFrame(R, columns=["UniProt", "GeneName", "Hits", "Fragments"]) |
| | col.markdown("**{0}** : {1}".format(cat_name, df.shape[0])) |
| | col.dataframe(df[["UniProt", "GeneName"]], use_container_width=True) |
| | |
| | data = pd.DataFrame(S, columns = ["Category", "UniProt", "GeneName", "Hits", "Fragments"]) |
| | data = data.sort_values(by=["Hits", "GeneName", "Category"], ascending=[False, True, True]).reset_index(drop=True) |
| | data = convert_df(data) |
| | st.download_button(label="Download search results", data=data, file_name="ligand_discovery_search_results.csv", mime="text/csv") |