import streamlit as st
import time
import concurrent.futures
import json
from gensim.models import Word2Vec
import pandas as pd
import threading
import matplotlib.pyplot as plt
import squarify
import numpy as np
import re
import urllib.request
import random
import plotly.express as px

# Global Streamlit page configuration: wide layout, custom title/icon, and an
# "About" entry in the hamburger menu.
st.set_page_config(
    page_title="Abstractalytics",
    page_icon=":microscope:",
    layout="wide",  # alternative: "centered"
    initial_sidebar_state="auto",
    menu_items={
        'About': "Abstractalytics is a Natural Language Processing (NLP) that harnesses Word2Vec to mine"
                 " insight from pubmed abstracts. Created by Jimmie E. Fata, PhD"
    }
)

# Define the HTML and CSS styles.
# NOTE(review): the embedded HTML/CSS payloads of these two st.markdown calls
# were lost during extraction (only the empty triple-quoted shells survive).
# Restore the original markup before shipping.
st.markdown("""
""", unsafe_allow_html=True)

st.markdown("""
""", unsafe_allow_html=True)

st.header(":red[*Abstractalytics*]")

st.subheader("*A web app designed to explore :red[*PubMed abstracts*] for deeper understanding and fresh insights, driven "
             "by Natural Language Processing (NLP) techniques.*")


def custom_subheader(text, identifier, font_size):
    """Render *text* as an HTML subheader with an anchor *identifier* and a
    pixel *font_size*.

    NOTE(review): the original HTML template string was lost in extraction;
    this reconstruction renders an anchored, sized heading — confirm the exact
    markup against the app's styling before relying on it.
    """
    st.markdown(
        f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>",
        unsafe_allow_html=True,
    )
", # unsafe_allow_html=True) # Set the max number of words to display value_word = min(100, len(table2)) st.markdown( f"Top {value_word} " f"words contextually and semantically similar to " f"{query} within the {database_name} corpus. " f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information
", unsafe_allow_html=True) short_table = table2.head(value_word).round(2) short_table.index += 1 short_table.index = (1 / short_table.index) * 10 sizes = short_table.index.tolist() short_table.set_index('Word', inplace=True) table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str) rank_num = list(short_table.index.tolist()) df = short_table try: df['text'] = short_table.index df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index] df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index] df.loc[:, 'database'] = database_name fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'], hover_name=(table2.head(value_word)['SIMILARITY'])) fig.update(layout_coloraxis_showscale=False) fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) fig.update_annotations(visible=False) fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="Treemap visualization of " # # f"genes contextually " # # f"and semantically similar to {query} " # # f"within the {database_name} corpus.
", # # unsafe_allow_html=True) # # st.markdown( # f"Top {value_gene} " # f"genes contextually and semantically similar to " # f"{query} within the {database_name} database. " # f"Click on the squares to expand and also the Pubmed and GeneCard links for more gene information
", # unsafe_allow_html=True) # # df10 = df1.head(value_gene).copy() # df10.index = (1 / df10.index) * 100000 # sizes = df10.index.tolist() # df10.set_index('Human Gene', inplace=True) # # df3 = df1.copy() # df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str) # df3.reset_index(inplace=True) # df3 = df3.rename(columns={'Human Gene': 'symbol2'}) # # Use df.query to get a subset of df1 based on ids in df2 # subset = df3.head(value_gene).query('symbol2 in @df2.symbol2') # # Use merge to join the two DataFrames on id # result = pd.merge(subset, df2, on='symbol2') # # Show the result # # print(result) # # label = df10.index.tolist() # # df2 = df10 # # print(df2) # try: # # Define the `text` column for labels and `href` column for links # df10['text'] = df10.index # df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ # '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']] # df10['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df10['text']] # # df10['name'] = [c for c in result['Approved name']] # assert isinstance(df10, object) # df10.loc[:, 'database'] = database_name # # # print(df['name']) # # # Create the treemap using `px.treemap` # fig = px.treemap(df10, path=[df10['text']], values=sizes, # custom_data=['href', 'name', 'database', 'href2', 'text'], # hover_name=(df3.head(value_gene)['SIMILARITY'])) # # fig.update(layout_coloraxis_showscale=False) # fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) # fig.update_annotations(visible=False) # fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, # hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", # texttemplate="Populate a treemap to visualize " # f"proteins contextually " # f"and semantically similar to {query} " # f"within the {database_name} corpus.
", # unsafe_allow_html=True) # Set the number of proteins to display value_gene = min(df_len, 100) st.markdown( f"Top {value_gene} " f"human genes contextually and semantically similar to " f"{query} within the {database_name} corpus. Click on the squares to expand and also the Pubmed and GeneCard links for more gene information
", unsafe_allow_html=True) df11 = df1.head(value_gene).copy() df11.index = (1 / df11.index) * 10000 sizes = df11.index.tolist() df11.set_index('Genes', inplace=True) df4 = df1.copy() # print(df4.head(10)) df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str) df4.reset_index(inplace=True) # df4 = df4.rename(columns={'Protein': 'symbol2'}) # print(df4) # # Use df.query to get a subset of df1 based on ids in df2 # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2') # # Use merge to join the two DataFrames on id # result = pd.merge(subset, df2b, on='symbol2') # print(result) if value_gene <= df_len: # Define the `text` column for labels and `href` column for links df11['text'] = df11.index df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']] df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']] assert isinstance(df11, object) df11['database'] = database_name # df11['name'] = [c for c in result['Approved name']] # Create the treemap using `px.treemap` fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'], hover_name=(df4.head(value_gene)['SIMILARITY'])) fig.update(layout_coloraxis_showscale=False) fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) fig.update_annotations(visible=False) fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="%{customdata[3]}Visualize " # f"KEGG compounds contextually " # f"and semantically similar to {query} " # f"within the {database_name} corpus.
", # unsafe_allow_html=True) st.markdown( f"Top {value_drug} " f"Drugs contextually and semantically similar to " f"{query} within the {database_name} corpus. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information
", unsafe_allow_html=True) df13 = df1.head(value_drug).copy() df13.index = (1 / df13.index) * 10000 sizes = df13.index.tolist() df13.set_index('Drugs', inplace=True) df6 = df1.copy() # print(df4.head(10)) df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str) df6.reset_index(inplace=True) # df4 = df4.rename(columns={'Protein': 'symbol2'}) # print(df4) # # Use df.query to get a subset of df1 based on ids in df2 # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2') # # Use merge to join the two DataFrames on id # result = pd.merge(subset, df2b, on='symbol2') # print(result) if value_drug <= df_len: # Define the `text` column for labels and `href` column for links # Reset the index df13.reset_index(inplace=True) # Replace hyphens with spaces in the 'text' column df13['Drugs'] = df13['Drugs'].str.replace('-', ' ') # Set the 'text' column back as the index df13.set_index('Drugs', inplace=True) df13['text'] = df13.index df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']] df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']] assert isinstance(df13, object) df13['database'] = database_name # df11['name'] = [c for c in result['Approved name']] # Create the treemap using `px.treemap` fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'], hover_name=(df6.head(value_drug)['SIMILARITY'])) fig.update(layout_coloraxis_showscale=False) fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) fig.update_annotations(visible=False) fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="%{customdata[3]}Visualize " # # f"KEGG compounds contextually " # # f"and 
semantically similar to {query} " # # f"within the {database_name} corpus.
", # # unsafe_allow_html=True) # # st.markdown( # f"Top {value_disease} " # f"Diseases contextually and semantically similar to " # f"{query}: within the {database_name} database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information
", # unsafe_allow_html=True) # # df14 = df1.head(value_disease).copy() # # df14.index = (1 / df14.index) * 10000 # sizes = df14.index.tolist() # # df14.set_index('Disease', inplace=True) # # df7 = df1.copy() # # print(df4.head(10)) # df7["SIMILARITY"] = 'Similarity Score ' + df7.head(value_disease)["SIMILARITY"].round(2).astype(str) # df7.reset_index(inplace=True) # # df4 = df4.rename(columns={'Protein': 'symbol2'}) # # print(df4) # # # Use df.query to get a subset of df1 based on ids in df2 # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2') # # # Use merge to join the two DataFrames on id # # result = pd.merge(subset, df2b, on='symbol2') # # print(result) # if value_disease <= df_len: # # Define the `text` column for labels and `href` column for links # # Reset the index # df14.reset_index(inplace=True) # # # Replace hyphens with spaces in the 'text' column # df14['Disease'] = df14['Disease'].str.replace('-', ' ') # # # Set the 'text' column back as the index # df14.set_index('Disease', inplace=True) # df14['text'] = df14.index # df14['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ # '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df14['text']] # df14['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df14['text']] # assert isinstance(df14, object) # df14['database'] = database_name # # # df11['name'] = [c for c in result['Approved name']] # # # Create the treemap using `px.treemap` # fig = px.treemap(df14, path=[df14['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'], # hover_name=(df7.head(value_disease)['SIMILARITY'])) # # fig.update(layout_coloraxis_showscale=False) # fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) # fig.update_annotations(visible=False) # fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, # hoverlabel_bgcolor="lightblue", 
hoverlabel_bordercolor="#000000", # texttemplate="%{customdata[3]}Visualize " # # f"KEGG compounds contextually " # # f"and semantically similar to {query} " # # f"within the {database_name} corpus.
", # # unsafe_allow_html=True) # # st.markdown( # f"Top {value_pathway} " # f"Pathways contextually and semantically similar to " # f"{query}: within the {database_name} database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information
", # unsafe_allow_html=True) # # df16 = df1.head(value_pathway).copy() # # df16.index = (1 / df16.index) * 10000 # sizes = df16.index.tolist() # # df16.set_index('Pathway', inplace=True) # # df9 = df1.copy() # # print(df4.head(10)) # df9["SIMILARITY"] = 'Similarity Score ' + df9.head(value_pathway)["SIMILARITY"].round(2).astype(str) # df9.reset_index(inplace=True) # # df4 = df4.rename(columns={'Protein': 'symbol2'}) # # print(df4) # # # Use df.query to get a subset of df1 based on ids in df2 # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2') # # # Use merge to join the two DataFrames on id # # result = pd.merge(subset, df2b, on='symbol2') # # print(result) # if value_pathway <= df_len: # # Define the `text` column for labels and `href` column for links # # Reset the index # df16.reset_index(inplace=True) # # # Replace hyphens with spaces in the 'text' column # df16['Pathway'] = df16['Pathway'].str.replace('-', ' ') # # # Set the 'text' column back as the index # df16.set_index('Pathway', inplace=True) # df16['text'] = df16.index # df16['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ # '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df16['text']] # df16['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df16['text']] # assert isinstance(df16, object) # df16['database'] = database_name # # # df11['name'] = [c for c in result['Approved name']] # # # Create the treemap using `px.treemap` # fig = px.treemap(df16, path=[df16['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'], # hover_name=(df9.head(value_pathway)['SIMILARITY'])) # # fig.update(layout_coloraxis_showscale=False) # fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) # fig.update_annotations(visible=False) # fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, # hoverlabel_bgcolor="lightblue", 
hoverlabel_bordercolor="#000000", # texttemplate="%{customdata[3]}Visualize " # f"KEGG compounds contextually " # f"and semantically similar to {query} " # f"within the {database_name} corpus.
", # unsafe_allow_html=True) st.markdown( f"Top {value_phyto} " f"Phytochemicals contextually and semantically similar to " f"{query} within the {database_name} corpus. " f"Click on the squares to expand and also the Pubmed and Wikipedia links for more compound information
", unsafe_allow_html=True) df15 = df1.head(value_phyto).copy() df15.index = (1 / df15.index) * 10000 sizes = df15.index.tolist() df15.set_index('Phytochemical', inplace=True) df8 = df1.copy() # print(df4.head(10)) df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str) df8.reset_index(inplace=True) # df4 = df4.rename(columns={'Protein': 'symbol2'}) # print(df4) # # Use df.query to get a subset of df1 based on ids in df2 # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2') # # Use merge to join the two DataFrames on id # result = pd.merge(subset, df2b, on='symbol2') # print(result) if value_phyto <= df_len: # Define the `text` column for labels and `href` column for links # Reset the index df15.reset_index(inplace=True) # Replace hyphens with spaces in the 'text' column df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ') # Set the 'text' column back as the index df15.set_index('Phytochemical', inplace=True) df15['text'] = df15.index df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']] df15['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df15['text']] assert isinstance(df15, object) df15['database'] = database_name # df11['name'] = [c for c in result['Approved name']] # Create the treemap using `px.treemap` fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'], hover_name=(df8.head(value_phyto)['SIMILARITY'])) fig.update(layout_coloraxis_showscale=False) fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) fig.update_annotations(visible=False) fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="%{customdata[3]}Visualize " # f"KEGG compounds 
contextually " # f"and semantically similar to {query} " # f"within the {database_name} corpus.
", # unsafe_allow_html=True) st.markdown( f"Top {value_compound} " f"Compounds contextually and semantically similar to " f"{query} within the {database_name} corpus. " f"Click on the squares to expand and the Pubmed, Wikipedia, and KEGG links for more compound information (may take time to load)
", unsafe_allow_html=True) df12 = df1.head(value_compound).copy() df12.index = (1 / df12.index) * 10000 sizes = df12.index.tolist() df12.set_index('Compounds', inplace=True) df5 = df1.copy() # print(df4.head(10)) df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str) df5.reset_index(inplace=True) # df4 = df4.rename(columns={'Protein': 'symbol2'}) # print(df4) # # Use df.query to get a subset of df1 based on ids in df2 # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2') # # Use merge to join the two DataFrames on id # result = pd.merge(subset, df2b, on='symbol2') # print(result) if value_compound <= df_len: # Define the `text` column for labels and `href` column for links # Reset the index df12.reset_index(inplace=True) # Replace hyphens with spaces in the 'text' column df12['Compounds'] = df12['Compounds'].str.replace('-', ' ') # Set the 'text' column back as the index df12.set_index('Compounds', inplace=True) df12['text'] = df12.index df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']] df12['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df12['text']] df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in get_compound_ids(df12['text'])] assert isinstance(df12, object) df12['database'] = database_name # df11['name'] = [c for c in result['Approved name']] # Create the treemap using `px.treemap` fig = px.treemap(df12, path=[df12['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text', 'href3'], hover_name=(df5.head(value_compound)['SIMILARITY'])) fig.update(layout_coloraxis_showscale=False) fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0)) fig.update_annotations(visible=False) fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None, 
hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="%{customdata[3]}