import streamlit as st
import time
import concurrent.futures
import re
import random
import urllib.request

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import requests
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from streamlit.components.v1 import html  # used by the (disabled) analytics snippet below
st.set_page_config(page_title="OncoDigger", page_icon=":microscope:", layout="wide",
                   initial_sidebar_state="auto", menu_items={
                       'About': "OncoDigger is a Natural Language Processing (NLP) web app that harnesses Word2Vec to mine"
                                " insights from PubMed abstracts. Created by Jimmie E. Fata, PhD, fata4science@gmail.com"})
# analytics_code = '''
# <head>
# <!-- Google tag (gtag.js) -->
# <script async src="https://www.googletagmanager.com/gtag/js?id=G-EKFSW65C2P"></script>
# <script>
#   window.dataLayer = window.dataLayer || [];
#   function gtag(){dataLayer.push(arguments);}
#   gtag('js', new Date());
#
#   gtag('config', 'G-EKFSW65C2P');
# </script>
# </head>
# '''
#
# html(analytics_code, height=0)
# from google_analytics_component.google_analytics import google_analytics
#
# google_analytics()
# Page styling: light-blue sidebar, pale-cyan app background
st.markdown("""
<style>
[data-testid=stSidebar] {
    background-color: #99CCFF;
}
body, .stApp {
    background-color: #CCFFFF;
}
</style>
""", unsafe_allow_html=True)
st.header(":red[*O*]nco:red[*D*]igger") | |
st.subheader( | |
"A web app designed to explore massive amounts of :red[*PubMed Cancer abstracts*] for a deeper understanding of your research. Results are driven " | |
"by Machine Learning and Natural Language Processing algorithms, which allow you to scan and mine information from millions of abstracts in seconds.") | |
def custom_subheader(text, identifier, font_size):
    st.markdown(f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>", unsafe_allow_html=True)


custom_subheader("To begin, simply select a cancer corpus from the left sidebar and enter a keyword "
                 "you wish to explore within the corpus. OncoDigger will determine the top words, "
                 "genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
                 "to your input, both directly and indirectly. Dive in and enjoy the exploration!", "unique-id", 18)
st.markdown("---")
# Define the correct password
# CORRECT_PASSWORD = "123"

# Define a function to check if the password is correct
# def authenticate(password):
#     return password == CORRECT_PASSWORD
#
# # Create a Streamlit input field for the password
# password = st.text_input("Enter password:", type="password")
#
# # If the password is correct, show the app content
# if authenticate(password):
opt = st.sidebar.radio("Select a PubMed Corpus (1990-2022)", options=(
    'Breast Cancer corpus', 'Skin Cancer corpus', 'Lung Cancer corpus', 'Head and Neck Cancer corpus',
    'Colorectal Cancer corpus', 'Lymphoma Cancer corpus', 'Leukemia Cancer corpus', 'Brain Cancer corpus',
    'Prostate Cancer corpus', 'Liver Cancer corpus', 'Myeloma Cancer corpus', 'Uterine Cancer corpus',
    'Urinary Cancer corpus', 'Kidney Cancer corpus', 'Stomach Cancer corpus', 'Bone Cancer corpus',
    'Cervical Cancer corpus', 'Ovarian Cancer corpus', 'Bladder Cancer corpus', 'Pancreas Cancer corpus',
    'Esophageal Cancer corpus', 'Thyroid Cancer corpus', 'Neuroblastoma Cancer corpus', 'Testicular Cancer corpus',
    'Laryngeal Cancer corpus'))
# if opt == "Clotting corpus": | |
# model_used = ("pubmed_model_clotting") | |
# num_abstracts = 45493 | |
# database_name = "Clotting" | |
# if opt == "Neuroblastoma corpus": | |
# model_used = ("pubmed_model_neuroblastoma") | |
# num_abstracts = 29032 | |
# database_name = "Neuroblastoma" | |
# if opt == "Cancer corpus": | |
# model_used = ("cancer_pubmed_model") | |
# num_abstracts = 1909197 | |
# database_name = "Cancer" | |
# st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Breast Cancer corpus": | |
model_used = ("breast_cancer_pubmed_model") | |
num_abstracts = 204414 | |
database_name = "Breast_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Lung Cancer corpus": | |
model_used = ("lung_cancer_pubmed_model") | |
num_abstracts = 143916 | |
database_name = "Lung_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Colorectal Cancer corpus": | |
model_used = ("colorectal_cancer_pubmed_model") | |
num_abstracts = 138679 | |
database_name = "Colorectal_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Prostate Cancer corpus": | |
model_used = ("prostate_cancer_pubmed_model") | |
num_abstracts = 89791 | |
database_name = "Prostate_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Skin Cancer corpus": | |
model_used = ("skin_cancer_pubmed_model") | |
num_abstracts = 176587 | |
database_name = "Skin_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Lymphoma Cancer corpus": | |
model_used = ("lymphoma_cancer_pubmed_model") | |
num_abstracts = 102603 | |
database_name = "Lymphoma_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Urinary Cancer corpus": | |
model_used = ("urinary_cancer_pubmed_model") | |
num_abstracts = 60876 | |
database_name = "Urinary_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Kidney Cancer corpus": | |
model_used = ("kidney_cancer_pubmed_model") | |
num_abstracts = 39016 | |
database_name = "Kidney_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Uterine Cancer corpus": | |
model_used = ("uterine_cancer_pubmed_model") | |
num_abstracts = 72634 | |
database_name = "Uterine_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Cervical Cancer corpus": | |
model_used = ("cervical_cancer_pubmed_model") | |
num_abstracts = 43327 | |
database_name = "Cervical_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Pancreas Cancer corpus": | |
model_used = ("pancreas_cancer_pubmed_model") | |
num_abstracts = 50023 | |
database_name = "Pancreas_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Leukemia Cancer corpus": | |
model_used = ("leukemia_cancer_pubmed_model") | |
num_abstracts = 107145 | |
database_name = "Leukemia_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Brain Cancer corpus": | |
model_used = ("brain_cancer_pubmed_model") | |
num_abstracts = 84483 | |
database_name = "Brain_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Thyroid Cancer corpus": | |
model_used = ("thyroid_cancer_pubmed_model") | |
num_abstracts = 30992 | |
database_name = "Thyroid_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Ovarian Cancer corpus": | |
model_used = ("ovarian_cancer_pubmed_model") | |
num_abstracts = 53164 | |
database_name = "Ovarian_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Liver Cancer corpus": | |
model_used = ("liver_cancer_pubmed_model") | |
num_abstracts = 107234 | |
database_name = "Liver_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Stomach Cancer corpus": | |
model_used = ("stomach_cancer_pubmed_model") | |
num_abstracts = 53249 | |
database_name = "stomach_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Neuroblastoma Cancer corpus": | |
model_used = ("neuroblastoma_cancer_pubmed_model") | |
num_abstracts = 19820 | |
database_name = "Neuroblastoma_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Head and Neck Cancer corpus": | |
model_used = ("head_and_neck_cancer_pubmed_model") | |
num_abstracts = 169171 | |
database_name = "Head_and_Neck_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Bone Cancer corpus": | |
model_used = ("bone_cancer_pubmed_model") | |
num_abstracts = 55781 | |
database_name = "Bone_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Bladder Cancer corpus": | |
model_used = ("bladder_cancer_pubmed_model") | |
num_abstracts = 29468 | |
database_name = "Bladder_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Myeloma Cancer corpus": | |
model_used = ("myeloma_cancer_pubmed_model") | |
num_abstracts = 26345 | |
database_name = "Myeloma_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Testicular Cancer corpus": | |
model_used = ("testicular_cancer_pubmed_model") | |
num_abstracts = 9601 | |
database_name = "Testicular_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Esophageal Cancer corpus": | |
model_used = ("esophageal_cancer_pubmed_model") | |
num_abstracts = 30660 | |
database_name = "Esophageal_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
if opt == "Laryngeal Cancer corpus": | |
model_used = ("laryngeal_cancer_pubmed_model") | |
num_abstracts = 10218 | |
database_name = "Laryngeal_cancer" | |
st.sidebar.markdown(f"**Number of abstracts in this selection:** {num_abstracts}") | |
st.header(f":blue[{database_name} Pubmed corpus.]") | |
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus") | |
query = text_input_value | |
query = query.lower() | |
query = query.strip() # This line will remove any leading or trailing spaces | |
query = re.sub("[,.?!&*;:]", "", query) | |
query = re.sub(" ", "-", query) | |
# matches = [" "] | |
# if any([x in query for x in matches]): | |
# st.write("Please only enter one term or a term without spaces") | |
# # query = input ("Enter your keyword(s):") | |
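# Example of the normalization above (illustrative input, assuming hyphenated
# multiword tokens in the trained vocabulary):
#     " Tumor Necrosis Factor! "  ->  "tumor-necrosis-factor"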
if query:
    bar = st.progress(0)
    time.sleep(.05)
    st.caption(
        f"Searching {num_abstracts} {database_name} PubMed primary abstracts covering 1990-2022 (Reviews not included)")
    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)
    try:
        model = Word2Vec.load(f"{model_used}")
        words = list(model.wv.key_to_index)
        X = model.wv[model.wv.key_to_index]  # full embedding matrix, one row per vocabulary word
        _ = model.wv[query]                  # raises KeyError early if the query is not in the vocabulary
        df = pd.DataFrame(X)
        def get_compound_ids(compound_names):
            # KEGG lookups are I/O bound, so a thread pool fetches them concurrently
            with concurrent.futures.ThreadPoolExecutor() as executor:
                compound_ids = list(executor.map(get_compound_id, compound_names))
            return compound_ids

        def get_compound_id(compound_name):
            url = f"http://rest.kegg.jp/find/compound/{compound_name}"
            response = requests.get(url)
            if response.status_code == 200:
                result = response.text.split('\n')
                if result[0]:
                    compound_id = result[0].split('\t')[0]
                    return compound_id
            return None
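        # Illustrative KEGG response (format assumed from the parsing above): a hit for
        # "glucose" begins with a tab-separated line such as
        #     cpd:C00031	D-Glucose; Grape sugar; Dextrose; ...
        # so get_compound_id returns the first field, here "cpd:C00031".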
st.markdown("---") | |
try: | |
table = model.wv.most_similar_cosmul(query, topn=10000) | |
table = (pd.DataFrame(table)) | |
table.index.name = 'Rank' | |
table.columns = ['Word', 'SIMILARITY'] | |
pd.set_option('display.max_rows', None) | |
table2 = table.copy() | |
st.markdown(f"<h2 style='text-align: center; font-family: Arial; font-size: 20px; font-weight: bold;'>" | |
f"Top <span style='color:red; font-style: italic;'>500</span> words in a dimension-reduced embedding map showing similarity to <span style='color:red; font-style: italic;'>{query}</span> in <span style='color:red; font-style: italic;'>{database_name}</span> " | |
f"corpus</span></h2>", | |
unsafe_allow_html=True) | |
            # Cap the number of words shown in the table below (falls back to 100 if the
            # block below fails before resetting it to 50)
            value_word = min(100, len(table2))
            try:
                value_word = min(50, len(table2))
                # Get the top 500 words most similar to the query
                top_words = model.wv.most_similar_cosmul(query, topn=500)
                words = [word for word, sim in top_words]
                words = [word.replace(' ', '-') for word in words]
                sims = [sim for word, sim in top_words]
                X = model.wv[words]
                # Add the query to the list of words and the embeddings array
                words_with_query = [query] + words
                X_with_query = np.vstack((model.wv[[query]], X))
                # Perform t-SNE
                tsne = TSNE(n_components=2, random_state=42)
                X_tsne = tsne.fit_transform(X_with_query)
                # Extract the t-SNE-transformed coordinates of the query and the top words
                query_tsne = X_tsne[0]
                X_top = X_tsne[1:]
                # Similarities between the query and the top 500 words
                sims_query_top = sims
            except Exception as e:
                print("Error:", e)
            # Generate a 2D scatter plot of word embeddings using Plotly
            fig = px.scatter(x=X_top[:, 0], y=X_top[:, 1], color=sims, color_continuous_scale="RdYlGn")
            # Match the plot background and axes to the app's pale-cyan theme
            fig.update_layout(plot_bgcolor='#CCFFFF')
            fig.update_layout(xaxis=dict(gridcolor='#CCFFFF', color='blue'),
                              yaxis=dict(gridcolor='#CCFFFF', color='blue'))
            fig.update_coloraxes(colorbar_title=f"Similarity with {query}")
            # Mark the query itself as a black diamond at its t-SNE coordinates
            fig.add_trace(go.Scatter(x=[query_tsne[0]], y=[query_tsne[1]], mode='markers',
                                     marker=dict(size=7, color='black', symbol='diamond'), name=query,
                                     hovertext=query, showlegend=False))
            # Label the query above the diamond
            fig.add_trace(go.Scatter(x=[query_tsne[0]], y=[query_tsne[1]], mode='text', text=[query],
                                     textposition='top right', textfont=dict(color='blue', size=12),
                                     hoverinfo='none', showlegend=False))
            # Circles for the top 500 similar words, colored by similarity
            fig.add_trace(go.Scatter(x=X_top[:, 0], y=X_top[:, 1], mode='markers',
                                     marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
                                     text=words, customdata=sims, name=''))
            fig.update(layout_coloraxis_showscale=True)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            st.plotly_chart(fig, use_container_width=True)
            st.markdown(
                f"<b><p style='font-family: Arial; font-size: 20px;'>Top <span style='color:red; font-style: italic;'>{value_word} "
                f"</span>words contextually and semantically similar to "
                f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
                f"Click the squares to expand them, and use the PubMed and Wikipedia links for more word information</p></b>",
                unsafe_allow_html=True)
            short_table = table2.head(value_word).round(2)
            short_table.index += 1
            short_table.index = (1 / short_table.index) * 10
            sizes = short_table.index.tolist()
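            # Treemap areas are rank-weighted: the word ranked r gets area proportional
            # to 1/r, so rank 1 draws at 10.0, rank 2 at 5.0, rank 10 at 1.0, and the
            # highest-similarity words dominate the plot.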
            short_table.set_index('Word', inplace=True)
            table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
            rank_num = list(short_table.index.tolist())
            df = short_table
            df['text'] = short_table.index
            df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                          '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in
                          short_table.index]
            df['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in short_table.index]
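            # Decoded, each PubMed link searches: <corpus>[mh] NOT review[pt] AND
            # english[la] AND hasabstract AND 1990:2022[dp] AND <word>, i.e. non-review
            # English abstracts from 1990-2022 indexed under the corpus MeSH term.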
            df.loc[:, 'database'] = database_name
            fig = px.treemap(df, path=[short_table.index], values=sizes,
                             custom_data=['href', 'text', 'database', 'href2'],
                             hover_name=(table2.head(value_word)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="<br><span style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
                                           "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                           "<a href='%{customdata[3]}'>Wikipedia</a></span>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
            st.plotly_chart(fig, use_container_width=True)
            csv = table2.head(value_word).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value_word} words (csv)", data=csv,
                               file_name=f'{database_name}_words.csv', mime='text/csv')
        except Exception:
            st.warning(
                f"This selection exceeds the number of words similar to {query} within the {database_name} corpus, please choose a lower number")
    except KeyError:
        st.warning(
            "This word was not found in the corpus. It may be misspelled, or it may not occur often enough in the corpus to be learned. Please try another term.")
st.markdown("---") | |
try: | |
df1 = table.copy() | |
df2 = pd.read_csv('Human Genes.csv') | |
m = df1.Word.isin(df2.symbol) | |
df1 = df1[m] | |
df1.rename(columns={'Word': 'Genes'}, inplace=True) | |
df1.reset_index(drop=True, inplace=True) # Reset the index here | |
df_len = len(df1) | |
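        # The isin() mask keeps only similar words whose token matches an HGNC gene
        # symbol, turning the generic similarity table into a gene-only ranking; the
        # same filter pattern is reused below for drugs, phytochemicals, and compounds.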
        # Cap the number of genes to display
        value_gene = min(df_len, 100)
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
            f"</span>human genes contextually and semantically similar to "
            f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name} </span>corpus. "
            f"Click the squares to expand them, and use the PubMed and GeneCards links for more gene information</p></b>",
            unsafe_allow_html=True)
        df11 = df1.head(value_gene).copy()
        df11.index = (1 / (df11.index + 1)) * 10000  # rank-weighted treemap areas (add 1: the index is 0-based)
        sizes = df11.index.tolist()
        df11.set_index('Genes', inplace=True)
        df4 = df1.copy()
        df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str)
        df4.reset_index(inplace=True)
        if value_gene <= df_len:
            # Define the `text` column for labels and `href` columns for links
            df11['text'] = df11.index
            df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
            df11['href2'] = ['https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
            df11['database'] = database_name
            # Create the treemap using `px.treemap`
            fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
                             hover_name=(df4.head(value_gene)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                           "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                           "<a href='%{customdata[2]}'>GeneCards</a></span>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
            st.plotly_chart(fig, use_container_width=True)
            st.caption(
                "Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
            st.caption("Gene designations include the manually added exceptions [p21, p53, her2, her3]")
            st.caption("Gene information provided by GeneCards: https://www.genecards.org/")
            st.caption(
                "In some cases a listed gene may be an abbreviation of another word rather than a gene; use the PubMed link to confirm the output is a gene")
            csv = df1.head(value_gene).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
                               file_name=f'{database_name}_genes.csv', mime='text/csv')
        else:
            st.warning(
                f"This selection exceeds the number of genes similar to {query} within the {database_name} corpus, please choose a lower number")
        value_gene = min(df_len, 50)
        st.markdown(
            f"<h2 style='text-align: center; font-family: Arial; font-size: 20px; font-weight: bold;'>3D interactive "
            f"gene embedding map for the <span style='color:red; font-style: italic;'>{value_gene}</span> genes most similar "
            f"to <span style='color:red; font-style: italic;'>{query}</span> in the <span style='color:red; font-style: italic;'>{database_name}</span> PubMed corpus</h2>",
            unsafe_allow_html=True)
        try:
            # Get the top genes (already ranked above) and their embeddings
            value_gene = min(df_len, 50)
            words = df11.head(value_gene).index
            words = [word.replace(' ', '-') for word in words]
            sims = df4.head(value_gene)["SIMILARITY"].tolist()
            X_top = model.wv[words]
        except Exception as e:
            print("Error:", e)
        # Recover the numeric score from the "Similarity Score X" strings built above
        sims_query_top = [float(sim.split()[-1]) for sim in sims]
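        # e.g. "Similarity Score 0.87" -> 0.87. Note the 3D map plots the first three
        # raw embedding dimensions directly (no t-SNE here, unlike the 2D word map).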
        # Generate a 3D scatter plot of gene embeddings using Plotly
        fig2 = px.scatter_3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], text=words, color=sims_query_top,
                             color_continuous_scale="RdYlGn", hover_name=words, hover_data={"color": sims_query_top})
        # Match the scene to the app's pale-cyan theme
        fig2.update_layout(scene=dict(bgcolor='#CCFFFF'))
        fig2.update_layout(scene=dict(xaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
                                      yaxis=dict(backgroundcolor='#CCFFFF', color='blue'),
                                      zaxis=dict(backgroundcolor='#CCFFFF', color='blue')))
        fig2.update_traces(hovertemplate='<b>%{hovertext}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>')
        fig2.update_layout(
            title=dict(text="", x=0.5, y=0.95, xanchor='center', yanchor='top', font=dict(color='black')),
            scene=dict(xaxis_title="Dimension 1", yaxis_title="Dimension 2", zaxis_title="Dimension 3"))
        fig2.update_coloraxes(colorbar_title=f"Similarity with {query}")
        # Mark the query as a large black diamond
        fig2.add_trace(
            go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='markers',
                         marker=dict(size=7, color='black', symbol='diamond'), name=query, hovertext=query,
                         showlegend=False))
        # Label the query next to the diamond
        fig2.add_trace(go.Scatter3d(x=[model.wv[query][0]], y=[model.wv[query][1]], z=[model.wv[query][2]], mode='text',
                                    text=[query], textposition='bottom center', textfont=dict(color='blue', size=10),
                                    hoverinfo='none', showlegend=False))
        # Circles for the most similar genes, colored by similarity (numeric scores are
        # passed as customdata so the :.2f hover format works)
        fig2.add_trace(go.Scatter3d(x=X_top[:, 0], y=X_top[:, 1], z=X_top[:, 2], mode='markers',
                                    marker=dict(size=2, color=sims_query_top, colorscale='RdYlGn', symbol='circle'),
                                    hovertemplate='<b>%{text}</b><br>Similarity score: %{customdata[0]:.2f}<extra></extra>',
                                    text=words, customdata=sims_query_top, name=''))
        fig2.update(layout_coloraxis_showscale=True)
        fig2.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
        fig2.update_annotations(visible=False)
        st.plotly_chart(fig2, use_container_width=True)
st.markdown("---") | |
# print() | |
# print("Human genes similar to " + str(query)) | |
df1 = table.copy() | |
df2 = pd.read_csv('kegg_drug_list_lowercase.csv') | |
m = df1.Word.isin(df2.drugs) | |
df1 = df1[m] | |
df1.rename(columns={'Word': 'Drugs'}, inplace=True) | |
df1.reset_index(drop=True, inplace=True) | |
df_len = len(df1) | |
# print(len(df1)) | |
# df1["Human Gene"] = df1["Human Gene"].str.upper() | |
# print(df1.head(50)) | |
# print() | |
# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False) | |
# time.sleep(2) | |
# Create the slider with increments of 5 up to 100 | |
# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100 | |
value_drug = min(df1.shape[0], 100) | |
        st.markdown(
            f"<b><p style='font-family: Arial; font-size: 20px;'>Top <span style='color:red; font-style: italic;'>{value_drug} "
            f"</span>drugs contextually and semantically similar to "
            f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
            f"Click the squares to expand them, and use the PubMed and Wikipedia links for more drug information</p></b>",
            unsafe_allow_html=True)
        df13 = df1.head(value_drug).copy()
        df13.index = (1 / (df13.index + 1)) * 10000  # rank-weighted treemap areas
        sizes = df13.index.tolist()
        df13.set_index('Drugs', inplace=True)
        df6 = df1.copy()
        df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str)
        df6.reset_index(inplace=True)
        if value_drug <= df_len:
            # Reset the index so the drug names can be cleaned up
            df13.reset_index(inplace=True)
            # Build the ClinicalTrials.gov condition string; database_name may contain
            # any number of underscores (e.g. Head_and_Neck_cancer), so replace them all
            condition = database_name.replace('_', '%20')
            # Replace hyphens with spaces in the drug names
            df13['Drugs'] = df13['Drugs'].str.replace('-', ' ')
            # Set the cleaned names back as the index
            df13.set_index('Drugs', inplace=True)
            df13['text'] = df13.index
            df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
            df13['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df13['text']]
            df13['href3'] = [f'https://beta.clinicaltrials.gov/search?cond={condition}&term={c}&viewType=Table' for c in df13['text']]
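            # Assumed ClinicalTrials.gov URL scheme: cond= is the condition (the corpus
            # name with %20 for spaces), term= is the drug, and viewType=Table asks for
            # tabular results, e.g. .../search?cond=Breast%20cancer&term=tamoxifen&viewType=Table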
            df13['database'] = database_name
            # Create the treemap using `px.treemap`
            fig = px.treemap(df13, path=[df13['text']], values=sizes,
                             custom_data=['href', 'database', 'href2', 'text', 'href3'],
                             hover_name=(df6.head(value_drug)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                           "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                           "<a href='%{customdata[2]}'>Wikipedia</a><br><br>"
                                           "<a href='%{customdata[4]}'>ClinicalTrials.gov</a></span>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
            st.plotly_chart(fig, use_container_width=True)
            st.caption("Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
            csv = df1.head(value_drug).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
                               file_name=f'{database_name}_drugs.csv', mime='text/csv')
        else:
            st.warning(
                f"This selection exceeds the number of drugs similar to {query} within the {database_name} corpus, please choose a lower number")
st.markdown("---") | |
# print() | |
# print("Human genes similar to " + str(query)) | |
df1 = table.copy() | |
df2 = pd.read_csv('phytochemicals.csv') | |
m = df1.Word.isin(df2.phyto) | |
df1 = df1[m] | |
df1.rename(columns={'Word': 'Phytochemical'}, inplace=True) | |
df1.reset_index(drop=True, inplace=True) | |
df_len = len(df1) | |
# print(len(df1)) | |
# df1["Human Gene"] = df1["Human Gene"].str.upper() | |
# print(df1.head(50)) | |
# print() | |
# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False) | |
# time.sleep(2) | |
# Create the slider with increments of 5 up to 100 | |
# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100 | |
value_phyto = min(df1.shape[0], 100) | |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize " | |
# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually " | |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> " | |
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>", | |
# unsafe_allow_html=True) | |
st.markdown( | |
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_phyto} " | |
f"</span>Phytochemicals contextually and semantically similar to " | |
f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. " | |
f"Click on the squares to expand and also the Pubmed and Wikipedia links for more compound information</span></p></b>", | |
unsafe_allow_html=True) | |
df15 = df1.head(value_phyto).copy() | |
df15.index = (1 / (df15.index + 1)) * 10000 | |
sizes = df15.index.tolist() | |
df15.set_index('Phytochemical', inplace=True) | |
df8 = df1.copy() | |
# print(df4.head(10)) | |
df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str) | |
df8.reset_index(inplace=True) | |
# df4 = df4.rename(columns={'Protein': 'symbol2'}) | |
# print(df4) | |
# # Use df.query to get a subset of df1 based on ids in df2 | |
# subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2') | |
# # Use merge to join the two DataFrames on id | |
# result = pd.merge(subset, df2b, on='symbol2') | |
# print(result) | |
        if value_phyto <= df_len:
            # Reset the index so the phytochemical names can be cleaned up
            df15.reset_index(inplace=True)
            # Replace hyphens with spaces in the phytochemical names
            df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ')
            # Set the cleaned names back as the index
            df15.set_index('Phytochemical', inplace=True)
            df15['text'] = df15.index
            df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']]
            df15['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df15['text']]
            df15['database'] = database_name
            # Create the treemap using `px.treemap`
            fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
                             hover_name=(df8.head(value_phyto)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                           "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                           "<a href='%{customdata[2]}'>Wikipedia</a></span>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightSeaGreen"])
            st.plotly_chart(fig, use_container_width=True)
            st.caption("Phytochemical designation and database provided by PhytoHub: https://phytohub.eu/")
            csv = df1.head(value_phyto).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value_phyto} phytochemicals (csv)", data=csv,
                               file_name=f'{database_name}_phytochemicals.csv', mime='text/csv')
        else:
            st.warning(
                f"This selection exceeds the number of phytochemicals similar to {query} within the {database_name} corpus, please choose a lower number")
st.markdown("---") | |
# print() | |
# print("Human genes similar to " + str(query)) | |
df1 = table.copy() | |
df2 = pd.read_csv('kegg_compounds_lowercase.csv') | |
m = df1.Word.isin(df2.compound) | |
df1 = df1[m] | |
df1.rename(columns={'Word': 'Compounds'}, inplace=True) | |
df1.reset_index(drop=True, inplace=True) | |
df_len = len(df1) | |
# df1["Human Gene"] = df1["Human Gene"].str.upper() | |
# print(df1.head(50)) | |
# print() | |
# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False) | |
# time.sleep(2) | |
# Create the slider with increments of 5 up to 100 | |
# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100 | |
value_compound = min(df1.shape[0], 100) | |
# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize " | |
# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually " | |
# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> " | |
# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>", | |
# unsafe_allow_html=True) | |
st.markdown( | |
f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_compound} " | |
f"</span>Compounds contextually and semantically similar to " | |
f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. " | |
f"Click on the squares to expand and the Pubmed, Wikipedia, and KEGG links for more compound information (may take time to load)</span></p></b>", | |
unsafe_allow_html=True) | |
df12 = df1.head(value_compound).copy() | |
df12.index = (1 / (df12.index + 1)) * 10000 | |
sizes = df12.index.tolist() | |
df12.set_index('Compounds', inplace=True) | |
df5 = df1.copy() | |
# print(df4.head(10)) | |
df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str) | |
df5.reset_index(inplace=True) | |
# df4 = df4.rename(columns={'Protein': 'symbol2'}) | |
# print(df4) | |
# # Use df.query to get a subset of df1 based on ids in df2 | |
# subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2') | |
# # Use merge to join the two DataFrames on id | |
# result = pd.merge(subset, df2b, on='symbol2') | |
# print(result) | |
        if value_compound <= df_len:
            # Reset the index so the compound names can be cleaned up
            df12.reset_index(inplace=True)
            # Replace hyphens with spaces in the compound names
            df12['Compounds'] = df12['Compounds'].str.replace('-', ' ')
            # Set the cleaned names back as the index
            df12.set_index('Compounds', inplace=True)
            df12['text'] = df12.index
            df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D'
                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']]
            df12['href2'] = ['https://en.wikipedia.org/wiki/' + c for c in df12['text']]
            df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in
                             get_compound_ids(df12['text'])]
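            # get_compound_ids fires one KEGG request per compound name (in parallel via
            # the thread pool defined above), which is why this section can take time to
            # load; names KEGG cannot resolve come back as None and yield a dead link.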
            df12['database'] = database_name
            # Create the treemap using `px.treemap`
            fig = px.treemap(df12, path=[df12['text']], values=sizes,
                             custom_data=['href', 'database', 'href2', 'text', 'href3'],
                             hover_name=(df5.head(value_compound)['SIMILARITY']))
            fig.update(layout_coloraxis_showscale=False)
            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
            fig.update_annotations(visible=False)
            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                           "<a href='%{customdata[0]}'>PubMed</a><br><br>"
                                           "<a href='%{customdata[2]}'>Wikipedia</a><br><br>"
                                           "<a href='%{customdata[4]}'>KEGG Compound Page</a></span>")
            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightYellow"])
            st.plotly_chart(fig, use_container_width=True)
            st.caption("Compound designation and database provided by KEGG: https://www.kegg.jp/kegg/compound/")
            csv = df1.head(value_compound).to_csv().encode('utf-8')
            st.download_button(label=f"download top {value_compound} compounds (csv)", data=csv,
                               file_name=f'{database_name}_compounds.csv', mime='text/csv')
        else:
            st.warning(
                f"This selection exceeds the number of compounds similar to {query} within the {database_name} corpus, please choose a lower number")
        # import os
        # from datasets import Dataset
        # # Check if the comments directory exists
        # if os.path.exists('comments'):
        #     # Load the dataset from disk
        #     dataset = Dataset.load_from_disk('comments')
        # else:
        #     # Create a new dataset
        #     dataset = Dataset.from_dict({'id': [], 'text': []})
        #
        # def save_comment(comment):
        #     # Load the dataset if it exists, otherwise start a new one
        #     if os.path.exists('comments'):
        #         dataset = Dataset.load_from_disk('comments')
        #     else:
        #         dataset = Dataset.from_dict({'id': [], 'text': []})
        #     # Append the new comment to the dataset
        #     new_comment = {'id': len(dataset), 'text': comment}
        #     dataset = dataset.concatenate(Dataset.from_dict(new_comment))
        #     # Save the dataset to disk
        #     dataset.save_to_disk('comments')
        #     print('Comment saved to dataset.')
        #
        # st.title("Abstractalytics Web App")
        # st.write("We appreciate your feedback!")
        # user_comment = st.text_area("Please send us your anonymous remarks/suggestions about the Abstractalytics Web App: "
        #                             "(app will pause while we save your comments)")
        # if st.button("Submit"):
        #     if user_comment:
        #         save_comment(user_comment)
        #         st.success("Your comment has been saved. Thank you for your feedback!")
        #     else:
        #         st.warning("Please enter a comment before submitting.")
        #
        # # Load the comments dataset from disk
        # if os.path.exists('comments'):
        #     dataset = Dataset.load_from_disk('comments')
        # else:
        #     dataset = Dataset.from_dict({'id': [], 'text': []})
        # # Access the text column of the dataset
        # comments = dataset['text']
        # # Define the password
        # PASSWORD = 'ram100pass'
        # # Prompt the user for the password
        # password = st.text_input('Password:', type='password')
        # # Display the comments if the password is correct
        # if password == PASSWORD:
        #     st.title('Comments')
        #     for comment in comments:
        #         st.write(comment)
        # else:
        #     st.warning('Incorrect password')
st.markdown("---") | |
except: | |
st.warning("") | |
st.subheader("Cancer-related videos") | |
if query: | |
idlist = [] | |
search_keyword = {query} | |
html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer") | |
html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer") | |
html3 = urllib.request.urlopen("https://www.youtube.com/@NorthwesternMedicine/search?query=cancer") | |
html4 = urllib.request.urlopen("https://www.youtube.com/@TEDEd/search?query=cancer") | |
html5 = urllib.request.urlopen("https://www.youtube.com/@CancerResearchUK/search?query=cancer") | |
video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode()) | |
video_ids2 = re.findall(r"watch\?v=(\S{11})", html2.read().decode()) | |
video_ids3 = re.findall(r"watch\?v=(\S{11})", html3.read().decode()) | |
video_ids4 = re.findall(r"watch\?v=(\S{11})", html4.read().decode()) | |
video_ids5 = re.findall(r"watch\?v=(\S{11})", html5.read().decode()) | |
for i in video_ids2: | |
video_ids.append(i) | |
for i in video_ids3: | |
video_ids.append(i) | |
for i in video_ids4: | |
video_ids.append(i) | |
for i in video_ids5: | |
video_ids.append(i) | |
random.shuffle(video_ids) | |
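    # The regex watch\?v=(\S{11}) captures the 11-character video ID from each
    # "watch?v=..." link in the channel page HTML (this assumes YouTube IDs stay
    # 11 characters); shuffling means three random picks are shown on each run.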
    c1, c2, c3 = st.columns(3)
    with c1:
        st.video("https://www.youtube.com/watch?v=" + video_ids[0])
    with c2:
        st.video("https://www.youtube.com/watch?v=" + video_ids[1])
    with c3:
        st.video("https://www.youtube.com/watch?v=" + video_ids[2])
st.markdown("---") | |
# Add a section header for useful resources | |
st.header("Learn More About Word2Vec the algorithm behind OncoDigger") | |
# Add links to videos and webpages | |
# Add links to videos and webpages | |
st.markdown(""" | |
Here are some useful resources to help you learn more about Word2Vec: | |
1. [Word2Vec Tutorial - The Skip-Gram Model](http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/) - A blog post by Chris McCormick providing a detailed explanation of the skip-gram model used in Word2Vec. | |
2. [Word2Vec Tutorial Part 2 - Negative Sampling](http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/) - A follow-up blog post by Chris McCormick discussing negative sampling in Word2Vec. | |
3. [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf) - The original research paper by Mikolov et al. that introduced the Word2Vec algorithm. | |
4. [Word2Vec Tutorial: Vector Representation of Words](https://www.youtube.com/watch?v=64qSgA66P-8) - A YouTube video by Sentdex explaining the Word2Vec algorithm and its implementation in Python. | |
5. [Word2Vec: How to Implement Word2Vec in Python](https://www.youtube.com/watch?v=ISPId9Lhc1g&t=6s) - A YouTube video by Data Talks demonstrating how to implement Word2Vec in Python using the Gensim library. | |
6. [Cosine Similarity Calculator](https://www.omnicalculator.com/math/cosine-similarity) - A calculator for computing cosine similarity, a common metric used in measuring similarity between vectors. | |
""") | |
# else:
#     st.error("The password you entered is incorrect.")