import streamlit as st
import time
import concurrent.futures
import threading
import re
import urllib.request
import random

import pandas as pd
import plotly.express as px
import requests
from gensim.models import Word2Vec
st.set_page_config(
    page_title="Abstractalytics",
    page_icon=":microscope:",
    layout="wide",  # or "centered"
    initial_sidebar_state="auto",
    menu_items={
        'About': "Abstractalytics is a Natural Language Processing (NLP) web app that harnesses Word2Vec to mine"
                 " insights from PubMed abstracts. Created by Jimmie E. Fata, PhD"
    }
)
# Style the sidebar and the main app background
st.markdown("""
    <style>
      [data-testid=stSidebar] {
        background-color: #99CCFF;
      }
      body, .stApp {
        background-color: #CCFFFF;
      }
    </style>
    """, unsafe_allow_html=True)
st.header(":red[*Abstractalytics*]") | |
st.subheader("*A web app designed to explore :red[*PubMed abstracts*] for deeper understanding and fresh insights, driven " | |
"by Natural Language Processing (NLP) techniques.*") | |
def custom_subheader(text, identifier, font_size): | |
st.markdown(f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>", unsafe_allow_html=True) | |
custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden " | |
"within PubMed abstracts! To begin, simply select a corpus that interests you. Next, enter a single keyword " | |
"you wish to explore within the corpus. Abstractalytics powerful Natural Language " | |
"Processing (NLP) algorithms will analyze the chosen corpus and present you with a list of top words, " | |
"genes, drugs, phytochemicals, and compounds that are contextually and semantically related " | |
"to your input. This advanced text-mining technique enables you to explore and understand complex " | |
"relationships, uncovering new discoveries and connections in your field of research across a massive " | |
"amount of abstracts. Dive in and enjoy the exploration! More oncology-related corpora comming soon.", "unique-id", 18) | |
st.markdown("---") | |
opt = st.sidebar.radio("Select a PubMed Corpus",
                       options=('Breast Cancer corpus', 'Lung Cancer corpus'))

if opt == "Breast Cancer corpus":
    model_used = "pubmed_model_breast_cancer2"
    num_abstracts = 290320
    database_name = "Breast_cancer"
elif opt == "Lung Cancer corpus":
    model_used = "lung_cancer_pubmed_model"
    num_abstracts = 210320
    database_name = "Lung_cancer"
st.header(f":blue[{database_name} Pubmed corpus.]") | |
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus") | |
query = text_input_value | |
query = query.lower() | |
query = re.sub("[,.?!&*;:]", "", query) | |
query = re.sub(" ", "-", query) | |
# matches = [" "] | |
# if any([x in query for x in matches]): | |
# st.write("Please only enter one term or a term without spaces") | |
# # query = input ("Enter your keyword(s):") | |
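# Example of the normalization above: a raw input of "Breast Cancer." becomes
# "breast-cancer", which is assumed to match the hyphen-joined multi-word tokens
# used when the Word2Vec corpora were built.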
if query:
    bar = st.progress(0)
    time.sleep(.05)
    st.caption(f"Searching {num_abstracts} {database_name} PubMed abstracts covering 1990-2022")
    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)

    model = Word2Vec.load(model_used)
    # Fail fast with a friendly message if the term never made it into the vocabulary
    if query not in model.wv.key_to_index:
        st.error("Term occurrence is too low - please try another term")
        st.stop()
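    # Optional sketch, not wired in: reloading the Word2Vec model from disk on
    # every Streamlit rerun is slow. Assuming Streamlit >= 1.18 (where
    # st.cache_resource exists), a cached loader like this could replace the
    # Word2Vec.load call above and keep one model instance alive across reruns.
    @st.cache_resource
    def load_w2v(path):
        # Load a gensim Word2Vec model once and reuse it across sessions
        return Word2Vec.load(path)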
    def get_compound_id(compound_name):
        # Query the KEGG REST API for the first compound entry matching the name
        url = f"https://rest.kegg.jp/find/compound/{compound_name}"
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            result = response.text.split('\n')
            if result[0]:
                compound_id = result[0].split('\t')[0]
                return compound_id
        return None

    def get_compound_ids(compound_names):
        # Fetch KEGG IDs concurrently; executor.map preserves the input order
        with concurrent.futures.ThreadPoolExecutor() as executor:
            compound_ids = list(executor.map(get_compound_id, compound_names))
        return compound_ids
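    # For reference, a KEGG /find/compound response is tab-separated plain text,
    # one entry per line (the entry below is illustrative):
    #   cpd:C00031    D-Glucose; Grape sugar; Dextrose
    # so result[0].split('\t')[0] yields the "cpd:..." identifier.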
    st.markdown("---")
    # Rank the 10,000 words most similar to the query (cosmul similarity)
    table = model.wv.most_similar_cosmul(query, topn=10000)
    table = pd.DataFrame(table)
    table.index.name = 'Rank'
    table.columns = ['Word', 'SIMILARITY']
    pd.set_option('display.max_rows', None)
    table2 = table.copy()
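    # Note: most_similar_cosmul implements the multiplicative combination
    # objective of Levy & Goldberg (2014); with a single positive term and no
    # negatives it is monotonic in cosine similarity, so plain most_similar
    # would be expected to produce the same ranking here.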
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize " | |
# f"<span style='color:red; font-style: italic;'>words</span> contextually " | |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> " | |
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>", | |
# unsafe_allow_html=True) | |
# Set the max number of words to display | |
value_word = min(100, len(table2)) | |
st.markdown( | |
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} " | |
f"</span>words contextually and semantically similar to " | |
f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. " | |
f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>", | |
unsafe_allow_html=True) | |
    # Weight each treemap tile by reciprocal rank (rank 1 -> 10, rank 2 -> 5, ...)
    short_table = table2.head(value_word).round(2)
    short_table.index += 1
    short_table.index = (1 / short_table.index) * 10
    sizes = short_table.index.tolist()
    short_table.set_index('Word', inplace=True)

    table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
    df = short_table
    try:
        df['text'] = short_table.index
        # Each tile links to a corpus-restricted PubMed search and to Wikipedia
        df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                      '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
        df['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in short_table.index]
        df.loc[:, 'database'] = database_name

        fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                         hover_name=(table2.head(value_word)['SIMILARITY']))
        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig.update_annotations(visible=False)
        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                          texttemplate="<br><span style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
                                       "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                       "<a href='%{customdata[3]}'>Wikipedia</a></span>")
        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
        st.plotly_chart(fig, use_container_width=True)

        csv = table2.head(value_word).to_csv().encode('utf-8')
        st.download_button(label=f"download top {value_word} words (csv)", data=csv,
                           file_name=f'{database_name}_words.csv', mime='text/csv')
    except Exception:
        st.warning(
            f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
# st.markdown("---") | |
# # st.write(short_table) | |
# # | |
# | |
# # print() | |
# # print("Human genes similar to " + str(query)) | |
# df1 = table.copy() | |
# df2 = pd.read_csv('Human Genes.csv') | |
# m = df1.Word.isin(df2.symbol) | |
# df1 = df1[m] | |
# df1.rename(columns={'Word': 'Human Gene'}, inplace=True) | |
# df1["Human Gene"] = df1["Human Gene"].str.upper() | |
# # print(df1.head(50)) | |
# # print() | |
# # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False) | |
# # time.sleep(2) | |
# # Create the slider with increments of 5 up to 100 | |
# | |
# # Set the maximum number of genes to display up to 100 | |
# value_gene = min(len(df1), 100) | |
# | |
# if value_gene > 0: | |
# # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Treemap visualization of " | |
# # f"<span style='color:red; font-style: italic;'>genes</span> contextually " | |
# # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> " | |
# # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>", | |
# # unsafe_allow_html=True) | |
# | |
# st.markdown( | |
# f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} " | |
# f"</span>genes contextually and semantically similar to " | |
# f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. " | |
# f"Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>", | |
# unsafe_allow_html=True) | |
# | |
# df10 = df1.head(value_gene).copy() | |
# df10.index = (1 / df10.index) * 100000 | |
# sizes = df10.index.tolist() | |
# df10.set_index('Human Gene', inplace=True) | |
# | |
# df3 = df1.copy() | |
# df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str) | |
# df3.reset_index(inplace=True) | |
# df3 = df3.rename(columns={'Human Gene': 'symbol2'}) | |
# # Use df.query to get a subset of df1 based on ids in df2 | |
# subset = df3.head(value_gene).query('symbol2 in @df2.symbol2') | |
# # Use merge to join the two DataFrames on id | |
# result = pd.merge(subset, df2, on='symbol2') | |
# # Show the result | |
# # print(result) | |
# # label = df10.index.tolist() | |
# # df2 = df10 | |
# # print(df2) | |
# try: | |
# # Define the `text` column for labels and `href` column for links | |
# df10['text'] = df10.index | |
# df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ | |
# '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']] | |
# df10['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df10['text']] | |
# | |
# df10['name'] = [c for c in result['Approved name']] | |
# assert isinstance(df10, object) | |
# df10.loc[:, 'database'] = database_name | |
# | |
# # print(df['name']) | |
# | |
# # Create the treemap using `px.treemap` | |
# fig = px.treemap(df10, path=[df10['text']], values=sizes, | |
# custom_data=['href', 'name', 'database', 'href2', 'text'], | |
# hover_name=(df3.head(value_gene)['SIMILARITY'])) | |
# | |
# fig.update(layout_coloraxis_showscale=False) | |
# fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) | |
# fig.update_annotations(visible=False) | |
# fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, | |
# hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", | |
# texttemplate="<br><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}<br><br>" | |
# "%{customdata[1]}<br><br>" | |
# "<a href='%{customdata[0]}'>PubMed" | |
# "</a><br><br><a href='%{customdata[3]}'>GeneCard" | |
# "</span></a>") | |
# fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"]) | |
# # # display the treemap in Streamlit | |
# # with treemap2: | |
# | |
# # st.pyplot(fig2) | |
# st.plotly_chart(fig, use_container_width=True) | |
# | |
# st.caption( | |
# "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/") | |
# st.caption("Gene designation add in exceptions [p21, p53, her2, her3]") | |
# st.caption("Gene information provided by GeneCards: https://www.genecards.org//") | |
# | |
# csv = df1.head(value_gene).to_csv().encode('utf-8') | |
# st.download_button(label=f"download top {value_gene} genes (csv)", data=csv, | |
# file_name=f'{database_name}_genes.csv', mime='text/csv') | |
# | |
# | |
# except: | |
# st.warning(f"No similar genes related to {query} within the {database_name} corpus were found.") | |
st.markdown("---") | |
df1 = table.copy() | |
df2 = pd.read_csv('Human Genes.csv') | |
m = df1.Word.isin(df2.symbol) | |
df1 = df1[m] | |
df1.rename(columns={'Word': 'Genes'}, inplace=True) | |
df_len = len(df1) | |
print(len(df1)) | |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize " | |
# f"<span style='color:red; font-style: italic;'>proteins</span> contextually " | |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> " | |
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>", | |
# unsafe_allow_html=True) | |
# Set the number of proteins to display | |
value_gene = min(df_len, 100) | |
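    # Assumption: corpus tokens are lowercased, so the `symbol` column in
    # 'Human Genes.csv' is expected to hold lowercase symbols (plus the add-in
    # exceptions noted in the captions below, e.g. p21, p53, her2, her3) for the
    # isin() match to succeed.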
    st.markdown(
        f"<b><p style='font-family: Arial; font-size: 20px;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
        f"</span>human genes contextually and semantically similar to "
        f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name} </span>corpus. "
        f"Click on the squares to expand them, and follow the PubMed and GeneCards links for more gene information.</p></b>",
        unsafe_allow_html=True)
    # Weight tiles by reciprocal rank; the filtered rows keep their rank from `table`
    df11 = df1.head(value_gene).copy()
    df11.index += 1  # shift ranks to start at 1 so the reciprocal below is finite
    df11.index = (1 / df11.index) * 10000
    sizes = df11.index.tolist()
    df11.set_index('Genes', inplace=True)

    df4 = df1.copy()
    df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str)
    df4.reset_index(inplace=True)
    if value_gene <= df_len:
        # Define the `text` column for labels and `href` columns for links
        df11['text'] = df11.index
        df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
        df11['href2'] = ['https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
        df11['database'] = database_name

        # Create the treemap using `px.treemap`
        fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
                         hover_name=(df4.head(value_gene)['SIMILARITY']))
        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig.update_annotations(visible=False)
        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                       "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                       "<a href='%{customdata[2]}'>GeneCards</a></span>")
        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
        st.plotly_chart(fig, use_container_width=True)

        st.caption("Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
        st.caption("Gene designation add-in exceptions: [p21, p53, her2, her3]")
        st.caption("Gene information provided by GeneCards: https://www.genecards.org/")

        csv = df1.head(value_gene).to_csv().encode('utf-8')
        st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
                           file_name=f'{database_name}_genes.csv', mime='text/csv')
    else:
        st.warning(
            f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus, please choose a lower number")
st.markdown("---") | |
# print() | |
# print("Human genes similar to " + str(query)) | |
df1 = table.copy() | |
df2 = pd.read_csv('kegg_drug_list_lowercase.csv') | |
m = df1.Word.isin(df2.drugs) | |
df1 = df1[m] | |
df1.rename(columns={'Word': 'Drugs'}, inplace=True) | |
df_len = len(df1) | |
# print(len(df1)) | |
# df1["Human Gene"] = df1["Human Gene"].str.upper() | |
# print(df1.head(50)) | |
# print() | |
# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False) | |
# time.sleep(2) | |
# Create the slider with increments of 5 up to 100 | |
# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100 | |
value_drug = min(df1.shape[0], 100) | |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize " | |
# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually " | |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> " | |
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>", | |
# unsafe_allow_html=True) | |
st.markdown( | |
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_drug} " | |
f"</span>Drugs contextually and semantically similar to " | |
f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>", | |
unsafe_allow_html=True) | |
    df13 = df1.head(value_drug).copy()
    df13.index += 1  # shift ranks to start at 1 so the reciprocal below is finite
    df13.index = (1 / df13.index) * 10000
    sizes = df13.index.tolist()
    df13.set_index('Drugs', inplace=True)

    df6 = df1.copy()
    df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str)
    df6.reset_index(inplace=True)
    if value_drug <= df_len:
        # Replace hyphens with spaces for display, then use the names as labels
        df13.reset_index(inplace=True)
        df13['Drugs'] = df13['Drugs'].str.replace('-', ' ')
        df13.set_index('Drugs', inplace=True)

        df13['text'] = df13.index
        df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
        df13['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df13['text']]
        df13['database'] = database_name

        # Create the treemap using `px.treemap`
        fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
                         hover_name=(df6.head(value_drug)['SIMILARITY']))
        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig.update_annotations(visible=False)
        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                       "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                       "<a href='%{customdata[2]}'>Wikipedia</a></span>")
        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
        st.plotly_chart(fig, use_container_width=True)

        st.caption("Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")

        csv = df1.head(value_drug).to_csv().encode('utf-8')
        st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
                           file_name=f'{database_name}_drugs.csv', mime='text/csv')
    else:
        st.warning(
            f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")

    st.markdown("---")
    # Phytochemical treemap: keep only similar words that match the PhytoHub list
    df1 = table.copy()
    df2 = pd.read_csv('phytochemicals.csv')
    m = df1.Word.isin(df2.phyto)
    df1 = df1[m]
    df1.rename(columns={'Word': 'Phytochemical'}, inplace=True)
    df_len = len(df1)

    # Cap the treemap at the top 100 phytochemicals (or fewer if the table is smaller)
    value_phyto = min(df1.shape[0], 100)
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize " | |
# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually " | |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> " | |
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>", | |
# unsafe_allow_html=True) | |
st.markdown( | |
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_phyto} " | |
f"</span>Phytochemicals contextually and semantically similar to " | |
f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. " | |
f"Click on the squares to expand and also the Pubmed and Wikipedia links for more compound information</span></p></b>", | |
unsafe_allow_html=True) | |
    df15 = df1.head(value_phyto).copy()
    df15.index += 1  # shift ranks to start at 1 so the reciprocal below is finite
    df15.index = (1 / df15.index) * 10000
    sizes = df15.index.tolist()
    df15.set_index('Phytochemical', inplace=True)

    df8 = df1.copy()
    df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str)
    df8.reset_index(inplace=True)
    if value_phyto <= df_len:
        # Replace hyphens with spaces for display, then use the names as labels
        df15.reset_index(inplace=True)
        df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ')
        df15.set_index('Phytochemical', inplace=True)

        df15['text'] = df15.index
        df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']]
        df15['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df15['text']]
        df15['database'] = database_name

        # Create the treemap using `px.treemap`
        fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
                         hover_name=(df8.head(value_phyto)['SIMILARITY']))
        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig.update_annotations(visible=False)
        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                       "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                       "<a href='%{customdata[2]}'>Wikipedia</a></span>")
        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightSeaGreen"])
        st.plotly_chart(fig, use_container_width=True)

        st.caption("Phytochemical designation and database provided by PhytoHub: https://phytohub.eu/")

        csv = df1.head(value_phyto).to_csv().encode('utf-8')
        st.download_button(label=f"download top {value_phyto} phytochemicals (csv)", data=csv,
                           file_name=f'{database_name}_phytochemicals.csv', mime='text/csv')
    else:
        st.warning(
            f"This selection exceeds the number of similar phytochemicals related to {query} within the {database_name} corpus, please choose a lower number")

    st.markdown("---")
    # Compound treemap: keep only similar words that match the KEGG compound list
    df1 = table.copy()
    df2 = pd.read_csv('kegg_compounds_lowercase.csv')
    m = df1.Word.isin(df2.compound)
    df1 = df1[m]
    df1.rename(columns={'Word': 'Compounds'}, inplace=True)
    df_len = len(df1)

    # Cap the treemap at the top 100 compounds (or fewer if the table is smaller)
    value_compound = min(df1.shape[0], 100)
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize " | |
# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually " | |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> " | |
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>", | |
# unsafe_allow_html=True) | |
st.markdown( | |
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_compound} " | |
f"</span>Compounds contextually and semantically similar to " | |
f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. " | |
f"Click on the squares to expand and the Pubmed, Wikipedia, and KEGG links for more compound information (may take time to load)</span></p></b>", | |
unsafe_allow_html=True) | |
    df12 = df1.head(value_compound).copy()
    df12.index += 1  # shift ranks to start at 1 so the reciprocal below is finite
    df12.index = (1 / df12.index) * 10000
    sizes = df12.index.tolist()
    df12.set_index('Compounds', inplace=True)

    df5 = df1.copy()
    df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str)
    df5.reset_index(inplace=True)
    if value_compound <= df_len:
        # Replace hyphens with spaces for display, then use the names as labels
        df12.reset_index(inplace=True)
        df12['Compounds'] = df12['Compounds'].str.replace('-', ' ')
        df12.set_index('Compounds', inplace=True)

        df12['text'] = df12.index
        df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']]
        df12['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df12['text']]
        # One KEGG lookup per compound, resolved concurrently
        df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in get_compound_ids(df12['text'])]
        df12['database'] = database_name

        # Create the treemap using `px.treemap`
        fig = px.treemap(df12, path=[df12['text']], values=sizes,
                         custom_data=['href', 'database', 'href2', 'text', 'href3'],
                         hover_name=(df5.head(value_compound)['SIMILARITY']))
        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig.update_annotations(visible=False)
        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                       "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                       "<a href='%{customdata[2]}'>Wikipedia</a><br><br>"
                                       "<a href='%{customdata[4]}'>KEGG Compound Page</a></span>")
        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightYellow"])
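        # Note: get_compound_ids issues one KEGG request per compound (up to 100,
        # resolved concurrently by the thread pool), which is the main reason this
        # treemap can take time to load. Caching lookups across reruns, e.g. with
        # st.cache_data (available in Streamlit >= 1.18), is one way to soften that.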
        st.plotly_chart(fig, use_container_width=True)

        st.caption("Compound designation and database provided by KEGG: https://www.kegg.jp/kegg/compound/")

        csv = df1.head(value_compound).to_csv().encode('utf-8')
        st.download_button(label=f"download top {value_compound} compounds (csv)", data=csv,
                           file_name=f'{database_name}_compounds.csv', mime='text/csv')
    else:
        st.warning(
            f"This selection exceeds the number of similar compounds related to {query} within the {database_name} corpus, please choose a lower number")

    st.markdown("---")
def save_comment(comment):
    # Append the comment to a local text file
    with open('comments.txt', 'a') as f:
        f.write(f'{comment}\n')

def save_comment_threaded(comment):
    # Write in a background thread so the UI does not block on file I/O
    t = threading.Thread(target=save_comment, args=(comment,))
    t.start()
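# A minimal hardening sketch, not wired in: if several sessions submit feedback
# at once, concurrent appends could interleave. Serializing writes with a
# module-level lock keeps each comment on its own line.
_comment_lock = threading.Lock()

def save_comment_locked(comment):
    with _comment_lock:
        with open('comments.txt', 'a') as f:
            f.write(f'{comment}\n')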
st.title("Abstractalytics Web App") | |
st.write("We appreciate your feedback!") | |
user_comment = st.text_area("Please send us your anonymous remarks/suggestions about the Abstractalytics Web App: " | |
"(app will pause while we save your comments)") | |
if st.button("Submit"): | |
if user_comment: | |
save_comment_threaded(user_comment) | |
st.success("Your comment has been saved. Thank you for your feedback!") | |
else: | |
st.warning("Please enter a comment before submitting.") | |
st.markdown("---") | |
st.subheader("Cancer-related videos") | |
if query: | |
idlist = [] | |
search_keyword = {query} | |
html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer") | |
html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer") | |
html3 = urllib.request.urlopen("https://www.youtube.com/@NorthwesternMedicine/search?query=cancer") | |
html4 = urllib.request.urlopen("https://www.youtube.com/@TEDEd/search?query=cancer") | |
html5 = urllib.request.urlopen("https://www.youtube.com/@CancerResearchUK/search?query=cancer") | |
video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode()) | |
video_ids2 = re.findall(r"watch\?v=(\S{11})", html2.read().decode()) | |
video_ids3 = re.findall(r"watch\?v=(\S{11})", html3.read().decode()) | |
video_ids4 = re.findall(r"watch\?v=(\S{11})", html4.read().decode()) | |
video_ids5 = re.findall(r"watch\?v=(\S{11})", html5.read().decode()) | |
for i in video_ids2: | |
video_ids.append(i) | |
for i in video_ids3: | |
video_ids.append(i) | |
for i in video_ids4: | |
video_ids.append(i) | |
for i in video_ids5: | |
video_ids.append(i) | |
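    # Drop duplicate IDs while preserving order; a channel page can surface the
    # same video more than once, and duplicates would skew the random picks below.
    video_ids = list(dict.fromkeys(video_ids))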
    # Show three randomly chosen videos side by side
    random.shuffle(video_ids)
    c1, c2, c3 = st.columns(3)
    with c1:
        st.video("https://www.youtube.com/watch?v=" + video_ids[0])
    with c2:
        st.video("https://www.youtube.com/watch?v=" + video_ids[1])
    with c3:
        st.video("https://www.youtube.com/watch?v=" + video_ids[2])

    st.markdown("---")