# NOTE: page-banner residue, not part of the script -- "Spaces: Runtime error"
import streamlit as st | |
import time | |
import json | |
from gensim.models import Word2Vec | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import squarify | |
import numpy as np | |
# Browser-tab title/icon, wide layout, an expanded sidebar and the text of the
# "About" menu entry, gathered in one mapping and applied in a single call.
_PAGE_SETTINGS = {
    "page_title": "FATA4 Science",
    "page_icon": ":microscope:",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
    "menu_items": {
        'About': "FATA4 Science is a Natural Language Processing (NLP) that ....",
    },
}
st.set_page_config(**_PAGE_SETTINGS)
# Inject a small stylesheet that gives the page and the Streamlit app container
# the same light-blue background.  The disabled text-colour rules are kept as
# proper CSS comments: CSS has no `#` comment syntax, so the previous
# `# color: ...;` lines were invalid declarations rather than comments.
st.markdown("""
<style>
body {
background-color: #EBF5FB;
/* color: #ffffff; */
}
.stApp {
background-color: #EBF5FB;
/* color: #ffffff; */
}
</style>
""", unsafe_allow_html=True)
# Each sidebar choice maps to (saved model name, number of abstracts, display name).
_CORPORA = {
    'Clotting corpus': ("pubmed_model_clotting", 45493, "Clotting"),
    'Neuroblastoma corpus': ("pubmed_model_neuroblastoma", 29032, "Neuroblastoma"),
}
opt = st.sidebar.radio("Select a PubMed Corpus", options=tuple(_CORPORA))
model_used, num_abstracts, database_name = _CORPORA[opt]
# Page heading plus the single-term search box for the selected corpus.
st.title(":red[Fast Acting Text Analysis (FATA) 4 Science]")
st.markdown("---")
st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
st.header(f"{database_name} Pubmed corpus.")
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus", max_chars=50)
# The vocabulary lookup further down is an exact match, so besides
# lower-casing the term we also drop surrounding whitespace; otherwise a stray
# space would be reported as "term occurrence is too low".
query = text_input_value.strip().lower()
def _render_treemap(labels, ranks, colormap, fontsize):
    """Draw a treemap of *labels* on its own figure and show it in Streamlit.

    Each tile's area is proportional to ``1 / (rank + 1)``: the best match
    (rank 0) gets the largest tile, and a rank of 0 cannot cause a division
    by zero.

    Args:
        labels: Tile captions, ordered from most to least similar.
        ranks: Zero-based similarity ranks, one per label.
        colormap: Matplotlib colormap (e.g. ``plt.cm.Greens``); it is sampled
            between 0.05 and 0.5, i.e. from its light to its medium shades.
        fontsize: Font size used for the tile captions.
    """
    sizes = [1 / (rank + 1) for rank in ranks]
    colors = list(colormap(np.linspace(0.05, 0.5, len(sizes))))
    # Use a dedicated figure instead of pyplot's implicit "current figure":
    # that figure is global state, and tiles left on it by an earlier chart
    # would otherwise show up underneath the next one.
    fig, ax = plt.subplots()
    squarify.plot(
        sizes=sizes,
        label=list(labels),
        color=colors,
        edgecolor="#EBF5FB",
        text_kwargs={'fontsize': fontsize},
        ax=ax,
    )
    ax.axis('off')
    fig.patch.set_facecolor('#EBF5FB')
    st.pyplot(fig)
    plt.close(fig)


# Results section: runs only once the user has entered a term.  It loads the
# Word2Vec model of the selected corpus, ranks the vocabulary by similarity to
# the term, and shows two treemaps (closest words, closest human genes), each
# with a CSV download of its top 100 rows.
if query:
    # Purely cosmetic progress bar shown before the model is loaded.
    bar = st.progress(0)
    time.sleep(.2)
    st.caption(f":LightSkyBlue[searching {num_abstracts} {database_name} PubMed abstracts] covering 1990-2022")
    for step in range(1, 11):
        bar.progress(step * 10)
        time.sleep(.1)

    # A model that cannot be read and a term that is missing from the
    # vocabulary are different problems, so they are reported separately
    # instead of being folded into one catch-all handler.
    try:
        model = Word2Vec.load(model_used)
    except OSError:
        st.error(f"Could not load the {database_name} model - please try again later")
        st.stop()
    if query not in model.wv.key_to_index:
        st.error("Term occurrence is too low - please try another term")
        st.stop()

    # Similarity ranking of the vocabulary against the query; the zero-based
    # row index is the rank (0 = most similar).
    table = pd.DataFrame(
        model.wv.most_similar_cosmul(query, topn=10000),
        columns=['Word', 'SIMILARITY'],
    )
    table.index.name = 'Rank'

    # Server-side log of the top matches.  to_string() prints every row
    # without changing pandas' global display options.
    print()
    print("Similarity to " + str(query))
    print(table.head(50).to_string())

    st.subheader(f"Top 10 Words closely related to {query}")
    top_words = table.head(10)
    _render_treemap(top_words['Word'].tolist(), top_words.index.tolist(), plt.cm.Greens, fontsize=10)
    words_csv = table.head(100).to_csv().encode('utf-8')
    st.download_button(label="download top 100 words (csv)", data=words_csv,
                       file_name=f'{database_name}_words.csv', mime='text/csv')

    # Keep only the ranked words that appear in the `symbol` column of the
    # gene list.  The rows keep their rank from `table`, and the explicit
    # copy makes the column edits below safe (no chained assignment).
    print()
    print("Human genes similar to " + str(query))
    gene_symbols = pd.read_csv('Human_Genes.csv')
    gene_table = table.loc[table['Word'].isin(gene_symbols['symbol'])].copy()
    gene_table['Word'] = gene_table['Word'].str.upper()
    gene_table = gene_table.rename(columns={'Word': 'Human Gene'})
    print(gene_table.head(50).to_string())
    print()

    st.subheader(f"Top 10 Genes closely related to {query}")
    top_genes = gene_table.head(10)
    if top_genes.empty:
        # Nothing to plot or download; say so instead of failing in the plot.
        st.info(f"No human genes were found among the terms related to {query}")
    else:
        # Tile sizes follow each gene's rank in the full similarity table.
        _render_treemap(top_genes['Human Gene'].tolist(), top_genes.index.tolist(), plt.cm.Blues, fontsize=12)
        genes_csv = gene_table.head(100).to_csv().encode('utf-8')
        st.download_button(label="download top 100 genes (csv)", data=genes_csv,
                           file_name=f'{database_name}_genes.csv', mime='text/csv')
# Embed a video in the middle of three columns; the middle column's share of
# the page width (in percent) is chosen with a sidebar slider.
DEFAULT_WIDTH = 80
# NOTE(review): this URL is a channel search page rather than a single video;
# confirm that st.video can actually play it.
VIDEO_DATA = "https://www.youtube.com/@NCIgov/search?query=cancer"

# The slider can return 0, so both column weights are floored at 0.01 to keep
# the values handed to st.columns strictly positive.
width = max(
    st.sidebar.slider(
        label="Width", min_value=0, max_value=100, value=DEFAULT_WIDTH, format="%d%%"
    ),
    0.01,
)
side = max((100 - width) / 2, 0.01)
column_weights = [side, width, side]
_, container, _ = st.columns(column_weights)
container.video(data=VIDEO_DATA)
# model = gensim.models.KeyedVectors.load_word2vec_format('pubmed_model_clotting', binary=True) | |
# similar_words = model.most_similar(word) | |
# output = json.dumps({"word": word, "similar_words": similar_words}) | |
# st.write(output) | |