Spaces:

jfataphd
/

OncoDigger

Runtime error

File size: 4,010 Bytes

1699569
 
 
 
 
e5a12b8
 
 
1699569
 
a6d026f
 
 
f192d73
3c93bf0
3559da9
f192d73
afb8bf9
3c93bf0
3559da9
afb8bf9
a6d026f
 
 
 
2bba935
3c93bf0
1699569
70d1c6a
2bba935
b2912c4
e5a12b8
1699569
 
c85c4ca
8715634
67703fc
c85c4ca
b6e45c4
c85c4ca
03f26bd
1699569
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5a12b8
f658f80
e5a12b8
 
3c93bf0
e5a12b8
 
50cfb9e
e5a12b8
 
 
 
 
 
 
 
c11ca30
e5a12b8
 
 
6dc9e75
e5a12b8
 
 
 
f658f80
 
 
 
 
 
 
e5a12b8
1699569
4b2cc15
1699569
 
 
4b2cc15
1699569
 
 
e5a12b8
 
1699569
f658f80
e5a12b8
3c93bf0
e5a12b8
f658f80
 
 
e5a12b8
 
 
 
 
80e99ff
e5a12b8
 
 
 
 
6dc9e75
e5a12b8
 
 
 
 
f658f80
 
 
 
 
 
 
8715634
7183901
e5a12b8
 
 
 
 
 
1699569
65ce061
2f21339
5ba2c0e
e5a12b8
 
 
 
1699569

import streamlit as st
import time
import json
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import squarify
import numpy as np

# Page-wide styling: paint the document body and the Streamlit app container
# with the same light-blue background that the treemap figures use.
# NOTE(review): the "# color: ..." lines inside the CSS are not valid CSS
# comments (CSS uses /* ... */); they are kept verbatim here.
st.markdown(
    """
    <style>
    body {
        background-color: #EBF5FB;
        # color: #ffffff;
    }
    .stApp {
        background-color: #EBF5FB;
        # color: #ffffff;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.header("Word2Vec App for Clotting Pubmed Database.")

text_input_value = st.text_input("Enter one term to search within the Clotting database")
# Normalise the search term: trim surrounding whitespace so that input such as
# "thrombin " still matches a vocabulary entry (and whitespace-only input is
# treated as "no query"), then lower-case it as before.
query = text_input_value.strip().lower()
# query = input ("Enter your keyword(s):")

def _csv_bytes(frame, **to_csv_kwargs):
    """Serialise ``frame`` to UTF-8 encoded CSV bytes.

    ``st.download_button`` needs str/bytes/file-like data; a DataFrame passed
    directly is rejected at runtime.
    """
    return frame.to_csv(**to_csv_kwargs).encode('utf-8')


def _rank_sizes(frame):
    """Return treemap tile sizes ``1 / (rank + 1)`` from ``frame``'s index.

    The index holds 0-based similarity ranks, so the +1 keeps the best hit
    (rank 0) from producing a division by zero.
    """
    return (1 / (frame.index + 1)).tolist()


def _render_treemap(sizes, labels, colormap, fontsize):
    """Draw a treemap on its own figure and display it in the Streamlit page.

    Args:
        sizes: Positive tile weights, one per label.
        labels: Text drawn on each tile, in the same order as ``sizes``.
        colormap: Matplotlib colormap callable, e.g. ``plt.cm.Greens``.
        fontsize: Font size used for the tile labels.
    """
    # One shade per tile, sampled from the 0.05-0.5 range of the colormap.
    colors = list(colormap(np.linspace(0.05, .5, len(sizes))))
    # A dedicated figure (instead of the global pyplot one) so that nothing
    # drawn by a previous treemap or a previous script rerun leaks into it.
    fig, ax = plt.subplots()
    squarify.plot(sizes=sizes, label=labels, color=colors, ax=ax,
                  edgecolor="#EBF5FB", text_kwargs={'fontsize': fontsize})
    ax.axis('off')
    fig.patch.set_facecolor('#EBF5FB')
    st.pyplot(fig)
    # Release the figure; st.pyplot does not clear figures passed explicitly.
    plt.close(fig)


if query:
    # Cosmetic progress bar shown while the model is being prepared.
    bar = st.progress(0)
    time.sleep(.2)
    st.caption(":LightSkyBlue[searching 40123 PubMed abstracts]")
    for i in range(10):
        bar.progress((i+1)*10)
        time.sleep(.1)

    model = Word2Vec.load("pubmed_model_clotting")  # you can continue training with the loaded model!

    if query not in model.wv.key_to_index:
        # Out-of-vocabulary terms would otherwise raise KeyError below.
        st.warning(f"'{query}' was not found in the Clotting database vocabulary.")
    else:
        # Rank the vocabulary by multiplicative cosine similarity to the query.
        table = pd.DataFrame(model.wv.most_similar_cosmul(query, topn=10000))
        table.index.name = 'Rank'
        table.columns = ['Word', 'SIMILARITY']

        # Console log of the top hits (server side, not shown in the page).
        print()
        print("Similarity to " + str(query))
        pd.set_option('display.max_rows', None)
        print(table.head(50))

        st.subheader(f"Similar Words to {query}")

        # Treemap of the ten most similar words; tile area shrinks with rank.
        top_words = table.head(10)
        _render_treemap(_rank_sizes(top_words), top_words['Word'].tolist(),
                        plt.cm.Greens, 10)

        st.download_button(
            label="download top 100 words (csv)",
            data=_csv_bytes(table.head(100)),
            file_name='clotting_words.csv',
            mime='text/csv')

        print()
        print("Human genes similar to " + str(query))
        # Keep only the similar words that are listed as gene symbols.
        gene_symbols = pd.read_csv('Human_Genes.csv')
        # .copy() so the edits below act on an independent frame, not on a
        # filtered view of `table`.
        genes = table[table.Word.isin(gene_symbols.symbol)].copy()
        genes = genes.rename(columns={'Word': 'Human Gene'})
        genes["Human Gene"] = genes["Human Gene"].str.upper()
        print(genes.head(50))
        print()

        st.subheader(f"Similar Genes to {query}")

        if genes.empty:
            # Nothing to plot or download; squarify cannot lay out zero tiles.
            st.info(f"No human genes were found among the words similar to '{query}'.")
        else:
            # Tile sizes use each gene's rank in the full similarity table.
            top_genes = genes.head(10)
            _render_treemap(_rank_sizes(top_genes),
                            top_genes['Human Gene'].tolist(),
                            plt.cm.Blues, 12)

            st.download_button(
                label="download top 100 genes (csv)",
                data=_csv_bytes(genes.head(100), index=False),
                file_name='clotting_genes.csv',
                mime='text/csv')

    



# findRelationships(query, df)







# model = gensim.models.KeyedVectors.load_word2vec_format('pubmed_model_clotting', binary=True)
# similar_words = model.most_similar(word)
# output = json.dumps({"word": word, "similar_words": similar_words})
# st.write(output)