"""Streamlit app for exploring a Word2Vec model of clotting-related PubMed text.

The user enters a keyword; the app looks up the most similar vocabulary
entries in the saved model, renders the top hits as a treemap, then filters
the hits down to the symbols listed in ``Human_Genes.csv`` and renders those
as a second treemap. The top rows of both tables are also exported to CSV
and echoed to the server console.
"""
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import squarify
import streamlit as st
from gensim.models import Word2Vec

MODEL_PATH = "pubmed_model_clotting"
GENES_CSV_PATH = "Human_Genes.csv"
BACKGROUND_COLOR = "#EBF5FB"
MAX_NEIGHBOURS = 10000  # how many similar words to request from the model
TREEMAP_ROWS = 10       # rows shown in each treemap
PREVIEW_ROWS = 50       # rows echoed to the console / exported for genes


def _draw_treemap(labels, ranks, colormap):
    """Render one treemap in the Streamlit page.

    Args:
        labels: Text shown in each tile, best match first.
        ranks: 0-based similarity ranks matching ``labels``.
        colormap: A matplotlib colormap (e.g. ``plt.cm.Greens``) used to
            shade the tiles.
    """
    # Tile area is the reciprocal of the 1-based rank. The +1 matters: rank 0
    # would otherwise divide by zero and produce an infinite tile size.
    sizes = (1.0 / (np.asarray(ranks, dtype=float) + 1.0)).tolist()
    colors = list(colormap(np.linspace(0.05, 0.5, len(sizes))))

    # Use a dedicated figure instead of pyplot's implicit global one so the
    # two treemaps can never draw over each other.
    fig, ax = plt.subplots()
    squarify.plot(
        sizes=sizes,
        label=list(labels),
        color=colors,
        edgecolor=BACKGROUND_COLOR,
        text_kwargs={"fontsize": 12},
        ax=ax,
    )
    ax.axis("off")
    fig.patch.set_facecolor(BACKGROUND_COLOR)
    st.pyplot(fig)
    plt.close(fig)  # release the figure; Streamlit reruns the script often


def _show_results(query):
    """Look up ``query`` in the model and render both result sections.

    Args:
        query: Lower-cased, non-empty keyword entered by the user.
    """
    # NOTE(review): the model is reloaded on every rerun; caching it (e.g.
    # with Streamlit's resource cache) may be worthwhile - confirm the
    # Streamlit version in use before adding that.
    model = Word2Vec.load(MODEL_PATH)

    # Indexing the vectors with an unknown key raises KeyError, so check the
    # vocabulary first and tell the user instead of crashing the page.
    if query not in model.wv.key_to_index:
        st.warning(f"'{query}' is not in the model vocabulary.")
        return

    neighbours = model.wv.most_similar_cosmul(query, topn=MAX_NEIGHBOURS)
    table = pd.DataFrame(neighbours, columns=["Word", "SIMILARITY"])
    table.index.name = "Rank"

    print()
    print("Similarity to " + str(query))
    pd.set_option("display.max_rows", None)
    print(table.head(PREVIEW_ROWS))
    table.head(TREEMAP_ROWS).to_csv("clotting_sim1.csv", index=True)

    st.subheader(f"Similar Words to {query}")
    top_words = table.head(TREEMAP_ROWS)
    if top_words.empty:
        st.info("No similar words found.")
    else:
        _draw_treemap(top_words["Word"], top_words.index, plt.cm.Greens)

    print()
    print("Human genes similar to " + str(query))
    genes = pd.read_csv(GENES_CSV_PATH)
    # Copy the filtered rows so the edits below act on an independent frame
    # rather than on a slice of ``table``.
    gene_table = table.loc[table["Word"].isin(genes["symbol"])].copy()
    gene_table["Word"] = gene_table["Word"].str.upper()
    gene_table = gene_table.rename(columns={"Word": "Human Gene"})
    print(gene_table.head(PREVIEW_ROWS))
    print()
    gene_table.head(PREVIEW_ROWS).to_csv(
        "clotting_sim2.csv", index=True, header=False
    )

    st.subheader(f"Similar Genes to {query}")
    top_genes = gene_table.head(TREEMAP_ROWS)
    if top_genes.empty:
        # Nothing to plot; squarify cannot lay out an empty list of sizes.
        st.info("No human genes found among the similar words.")
        return
    # The index still holds each gene's 0-based rank in the full table.
    _draw_treemap(top_genes["Human Gene"], top_genes.index, plt.cm.Blues)


# Placeholder for custom HTML/CSS styles; the markup is currently only
# whitespace.
st.markdown(""" """, unsafe_allow_html=True)

st.header("Word2Vec App for Clotting Pubmed Database.")

text_input_value = st.text_input("Enter some text")
query = text_input_value.strip().lower()

if query:
    _show_results(query)