import streamlit as st
import time
import json
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import squarify
import numpy as np
import re
import urllib.request

st.set_page_config(
    page_title="FATA4 Science",
    page_icon=":microscope:",
    layout="wide",
    initial_sidebar_state="auto",
    menu_items={
        'About': "FATA4 Science is a Natural Language Processing (NLP) that ...."
    }
)

# Define the HTML and CSS styles
st.markdown(""" """, unsafe_allow_html=True)
st.markdown(""" """, unsafe_allow_html=True)

opt = st.sidebar.radio("Select a PubMed Corpus",
                       options=('Clotting corpus', 'Neuroblastoma corpus'))
if opt == "Clotting corpus":
    model_used = "pubmed_model_clotting"
    num_abstracts = 45493
    database_name = "Clotting"
elif opt == "Neuroblastoma corpus":
    model_used = "pubmed_model_neuroblastoma"
    num_abstracts = 29032
    database_name = "Neuroblastoma"

st.title(":red[Fast Acting Text Analysis (FATA) 4 Science]")
st.markdown("---")
st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
st.header(f"{database_name} PubMed corpus")

text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
query = text_input_value.lower()
query = re.sub("[,.?!&*;:]", "", query)
if " " in query:
    st.write("Please enter only one term, or a term without spaces")
# query = input("Enter your keyword(s):")

if query:
    bar = st.progress(0)
    time.sleep(.2)
    st.caption(f":blue[searching {num_abstracts} {database_name} PubMed abstracts] covering 1990-2022")
    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)

    try:
        model = Word2Vec.load(model_used)  # training could be continued with the loaded model
        words = list(model.wv.key_to_index)
        X = model.wv[model.wv.key_to_index]
        model2 = model.wv[query]  # raises KeyError if the query is not in the vocabulary
        df = pd.DataFrame(X)
    except Exception:
        st.error("Term occurrence is too low - please try another term")
        st.stop()

    # def findRelationships(query, df):

    # Rank the 10,000 nearest words to the query by cosine-multiplication similarity
    table = model.wv.most_similar_cosmul(query, topn=10000)
    table = pd.DataFrame(table)
    table.index.name = 'Rank'
    table.columns = ['Word', 'SIMILARITY']
    print()
    print("Similarity to " + str(query))
    pd.set_option('display.max_rows', None)
    print(table.head(50))
    # table.head(10).to_csv("clotting_sim1.csv", index=True)
    # short_table = table.head(50)
    # print(table)

    st.subheader(f"Top 10 words closely related to {query}")
    # Calculate the sizes of the squares in the treemap (1/rank, so rank 1 is largest)
    short_table = table.head(10).copy()
    short_table.index += 1
    short_table.index = 1 / short_table.index
    sizes = short_table.index.tolist()
    cmap = plt.cm.Greens(np.linspace(0.05, .5, len(sizes)))
    color = [cmap[i] for i in range(len(sizes))]
    short_table.set_index('Word', inplace=True)
    squarify.plot(sizes=sizes, label=short_table.index.tolist(), color=color,
                  edgecolor="#EBF5FB", text_kwargs={'fontsize': 10})
    # Plot the treemap using matplotlib
    plt.axis('off')
    fig = plt.gcf()
    fig.patch.set_facecolor('#CCFFFF')
    # Display the treemap in Streamlit
    st.pyplot(fig)
    plt.clf()

    csv = table.head(100).to_csv().encode('utf-8')
    st.download_button(label="download top 100 words (csv)", data=csv,
                       file_name=f'{database_name}_words.csv', mime='text/csv')
    # st.write(short_table)

    print()
    print("Human genes similar to " + str(query))
    # Keep only the related words that match a human gene symbol
    df1 = table
    df2 = pd.read_csv('Human_Genes.csv')
    m = df1.Word.isin(df2.symbol)
    df1 = df1[m].copy()
    df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
    df1["Human Gene"] = df1["Human Gene"].str.upper()
    print(df1.head(50))
    print()
    # df1.head(50).to_csv("clotting_sim2.csv", index=True,
    #                    header=False)
    # time.sleep(2)

    st.subheader(f"Top 10 genes closely related to {query}")
    df10 = df1.head(10).copy()
    # Size the squares by inverse rank; +1 avoids division by zero when a gene sits at rank 0
    df10.index = 1 / (df10.index + 1)
    sizes = df10.index.tolist()
    cmap2 = plt.cm.Blues(np.linspace(0.05, .5, len(sizes)))
    color2 = [cmap2[i] for i in range(len(sizes))]
    df10.set_index('Human Gene', inplace=True)
    squarify.plot(sizes=sizes, label=df10.index.tolist(), color=color2,
                  edgecolor="#EBF5FB", text_kwargs={'fontsize': 12})
    # Plot the treemap using matplotlib
    plt.axis('off')
    fig2 = plt.gcf()
    fig2.patch.set_facecolor('#CCFFFF')
    # plt.show()
    # Display the treemap in Streamlit
    st.pyplot(fig2)

    csv = df1.head(100).to_csv().encode('utf-8')
    st.download_button(label="download top 100 genes (csv)", data=csv,
                       file_name=f'{database_name}_genes.csv', mime='text/csv')

if query:
    # Embed the first YouTube result for the search term
    search_keyword = query
    html = urllib.request.urlopen(f"https://www.youtube.com/results?search_query={search_keyword}")
    video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
    # st.video("https://www.youtube.com/watch?v=" + video_ids[0])
    VIDEO_DATA = "https://www.youtube.com/watch?v=" + video_ids[0]
    width = 80
    side = 10
    _, container, _ = st.columns([side, width, side])
    container.video(data=VIDEO_DATA)

# model = gensim.models.KeyedVectors.load_word2vec_format('pubmed_model_clotting', binary=True)
# similar_words = model.most_similar(word)
# output = json.dumps({"word": word, "similar_words": similar_words})
# st.write(output)
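# The commented-out block above sketches a JSON export of a term's nearest
# neighbours. A minimal, hypothetical version reusing the already-loaded
# `model` and `query` might look like the following (kept commented out, like
# the original; `topn=10` and the explicit float() cast are assumptions, not
# part of the original app):
# if query:
#     similar_words = [(w, float(s)) for w, s in model.wv.most_similar(query, topn=10)]
#     st.json(json.dumps({"word": query, "similar_words": similar_words}))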