import streamlit as st
import time
import json
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import squarify
import numpy as np
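
# NOTE (assumption): the pre-trained gensim Word2Vec models
# ("pubmed_model_clotting", "pubmed_model_neuroblastoma") and the gene-symbol
# table "Human_Genes.csv" (with a 'symbol' column) are expected to be
# available in the working directory, since they are loaded by relative path below.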

st.set_page_config(
    page_title="FATA4 Science",
    page_icon=":microscope:",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'About': "FATA4 Science is a Natural Language Processing (NLP) tool that ...."
    },
)

# Define the HTML and CSS styles
st.markdown("""
    <style>
    body {
        background-color: #EBF5FB;
        /* color: #ffffff; */
    }
    .stApp {
        background-color: #EBF5FB;
        /* color: #ffffff; */
    }
    </style>
    """, unsafe_allow_html=True)

# Corpus selection: each corpus maps to a pre-trained Word2Vec model and its abstract count
opt = st.sidebar.radio("Select a PubMed Corpus", options=('Clotting corpus', 'Neuroblastoma corpus'))
if opt == "Clotting corpus":
    model_used = "pubmed_model_clotting"
    num_abstracts = 45493
    database_name = "Clotting"
elif opt == "Neuroblastoma corpus":
    model_used = "pubmed_model_neuroblastoma"
    num_abstracts = 29032
    database_name = "Neuroblastoma"

st.title(":red[Fast Acting Text Analysis (FATA) 4 Science]")
st.markdown("---")
st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")

st.header(f"{database_name} PubMed corpus")
text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus", max_chars=50)
query = text_input_value.strip().lower()  # normalize the search term
# query = input ("Enter your keyword(s):")
if query:
    # Cosmetic progress bar shown while the corpus search runs
    bar = st.progress(0)
    time.sleep(.2)
    st.caption(f":blue[searching {num_abstracts} {database_name} PubMed abstracts] covering 1990-2022")
    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)

    try:
        # Load the pre-trained Word2Vec model and confirm the query term is in its vocabulary
        model = Word2Vec.load(model_used)
        _ = model.wv[query]  # raises KeyError if the term was too rare to be learned
    except Exception:
        st.error("Term occurrence is too low - please try another term")
        st.stop()

    # Rank the vocabulary by cosine-multiplicative similarity to the query term
    table = model.wv.most_similar_cosmul(query, topn=10000)
    table = pd.DataFrame(table, columns=['Word', 'Similarity'])
    table.index.name = 'Rank'

    # Console log for debugging
    print()
    print("Similarity to " + str(query))
    pd.set_option('display.max_rows', None)
    print(table.head(50))

    st.subheader(f"Top 10 words closely related to {query}")

    # Square sizes: weight each of the top 10 words by the reciprocal of its rank (1/1 ... 1/10)
    short_table = table.head(10).copy()  # work on a copy so the full ranking table is left untouched
    short_table.index += 1
    short_table.index = 1 / short_table.index
    sizes = short_table.index.tolist()

    # Light-to-dark green ramp across the 10 squares
    cmap = plt.cm.Greens(np.linspace(0.05, .5, len(sizes)))
    color = list(cmap)

    # Plot the treemap with matplotlib/squarify and display it in Streamlit
    short_table.set_index('Word', inplace=True)
    squarify.plot(sizes=sizes, label=short_table.index.tolist(), color=color, edgecolor="#EBF5FB",
                  text_kwargs={'fontsize': 10})
    plt.axis('off')
    fig = plt.gcf()
    fig.patch.set_facecolor('#EBF5FB')
    st.pyplot(fig)
    plt.clf()  # clear the global pyplot figure before the next plot

    csv = table.head(100).to_csv().encode('utf-8')
    st.download_button(label="download top 100 words (csv)", data=csv, file_name=f'{database_name}_words.csv', mime='text/csv')
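
    # --- Gene view: cross-reference the similarity ranking against human gene symbols ---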

    # Console log for debugging
    print()
    print("Human genes similar to " + str(query))

    df2 = pd.read_csv('Human_Genes.csv')  # reference list of human gene symbols
    df1 = table[table.Word.isin(df2.symbol)].copy()  # copy so the filtered frame can be edited safely
    df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
    df1["Human Gene"] = df1["Human Gene"].str.upper()
    print(df1.head(50))
    print()
    st.subheader(f"Top 10 genes closely related to {query}")

    # Re-rank the gene hits 1..10 and weight each square by the reciprocal of that rank;
    # the original 'Rank' index is discarded because it is 0-based and non-contiguous after filtering
    df10 = df1.head(10).reset_index(drop=True)
    df10.index += 1
    df10.index = 1 / df10.index
    sizes = df10.index.tolist()

    # Light-to-dark blue ramp for the gene treemap
    cmap2 = plt.cm.Blues(np.linspace(0.05, .5, len(sizes)))
    color2 = list(cmap2)

    # Plot the gene treemap and display it in Streamlit
    df10.set_index('Human Gene', inplace=True)
    squarify.plot(sizes=sizes, label=df10.index.tolist(), color=color2, edgecolor="#EBF5FB",
                  text_kwargs={'fontsize': 12})
    plt.axis('off')
    fig2 = plt.gcf()
    fig2.patch.set_facecolor('#EBF5FB')
    st.pyplot(fig2)
    plt.clf()  # clear the global pyplot state so later reruns start from an empty figure

    csv = df1.head(100).to_csv().encode('utf-8')
    st.download_button(label="download top 100 genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
                       mime='text/csv')

# Embedded NCI YouTube content, centered with a sidebar-controlled width
DEFAULT_WIDTH = 80
VIDEO_DATA = "https://www.youtube.com/@NCIgov/search?query=cancer"

width = st.sidebar.slider(
    label="Width", min_value=0, max_value=100, value=DEFAULT_WIDTH, format="%d%%"
)

width = max(width, 0.01)
side = max((100 - width) / 2, 0.01)

_, container, _ = st.columns([side, width, side])
container.video(data=VIDEO_DATA)




# model = gensim.models.KeyedVectors.load_word2vec_format('pubmed_model_clotting', binary=True)
# similar_words = model.most_similar(word)
# output = json.dumps({"word": word, "similar_words": similar_words})
# st.write(output)