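"""PubMed Abstract Retriever.

Given a link to an online article, this app:
1. ranks the article's sentences with SBERT-based TextRank plus indicator-phrase scoring,
2. extracts candidate keyphrases from the top sentences with scispaCy,
3. clusters the keyphrases with SapBERT embeddings and K-means,
4. turns the clusters into a Boolean PubMed query, and
5. retrieves and displays the top 20 matching titles and abstracts via the NCBI E-utilities.
"""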
import nltk
import re
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('brown')
from newspaper import Article
from newspaper import fulltext
import requests
import itertools
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import *
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import scipy.spatial
import networkx as nx
from nltk.tokenize import sent_tokenize
import scispacy
import spacy
import en_core_sci_lg
import string
from nltk.stem.wordnet import WordNetLemmatizer
import gradio as gr
import inflect
from Bio import Entrez
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score
import json
from xml.etree import ElementTree as ET
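# --- Global resources: an inflect engine (used to filter plural entity mentions), the scispaCy
# en_core_sci_lg pipeline (biomedical NER), and a SapBERT model wrapped as a SentenceTransformer,
# which is used later to embed keyphrases for clustering. ---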
p = inflect.engine()
nlp = en_core_sci_lg.load()   # scispaCy large biomedical model used for entity extraction
all_stopwords = nlp.Defaults.stop_words
word_embedding_model = models.Transformer('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
pooling_mode_mean_tokens=True,
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False)
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
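# Helper: remove English stop words from a tokenized sentence.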
def remove_stopwords(sen):
sen_new = " ".join([i for i in sen if i not in stop_words])
return sen_new
def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
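"""Retrieve PubMed abstracts related to the article at `article_link`.

model_1 -- SBERT model name used for TextRank sentence ranking.
model_2 -- SBERT model name used for keyphrase selection.
max_num_keywords -- maximum number of keyphrases to extract.
Returns a pandas DataFrame with 'Title' and 'Abstract' columns.
"""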
element=[]
cluster_list_final=[]
comb_list=[]
comb=[]
title_list=[]
titles_list=[]
abstracts_list=[]
silhouette_score_list=[]
final_textrank_list=[]
document=[]
text_doc=[]
final_list=[]
score_list=[]
sum_list=[]
model_1 = SentenceTransformer(model_1)
model_2 = SentenceTransformer(model_2)
url = article_link
html = requests.get(url).text
article = fulltext(html)
corpus=sent_tokenize(article)
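# Mark sentences that contain 'conclusion/finding' indicator phrases; they receive a score bonus later.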
indicator_list=['concluded','concludes','in a study', 'concluding','conclude','in sum','in a recent study','therefore','thus','so','hence',
'as a result','accordingly','consequently','in short','proves that','shows that','suggests that','demonstrates that','found that','observed that',
'indicated that','suggested that','demonstrated that']
count_dict={}
for l in corpus:
c=0
for l2 in indicator_list:
if l.find(l2)!=-1:   # the sentence contains this indicator phrase
c=1
break
if c:
count_dict[l]=1
else:
count_dict[l]=0
for sent, score in count_dict.items():
score_list.append(score)
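# TextRank: embed the cleaned sentences, build a cosine-similarity graph and run PageRank over it.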
clean_sentences_new = pd.Series(corpus).str.replace("[^a-zA-Z]", " ", regex=True).tolist()   # keep letters only; regex=True is required for pandas >= 2.0
corpus_embeddings = model_1.encode(clean_sentences_new)
sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
for i in range(len(clean_sentences_new)):
for j in range(len(clean_sentences_new)):
if i != j:
# cosine similarity between the two sentence embeddings (works for any embedding dimension)
sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1,-1), corpus_embeddings[j].reshape(1,-1))[0,0]
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)
sentences=((scores[i],s) for i,s in enumerate(corpus))
for elem in sentences:
element.append(elem[0])
for indicator_score, pagerank_score in zip(score_list, element):   # combine indicator-phrase and PageRank scores
sum_list.append(indicator_score + pagerank_score)
x=sorted(((sum_list[i],s) for i,s in enumerate(corpus)), reverse=True)
for elem in x:
final_textrank_list.append(elem[1])
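# Keep the top ~10% of ranked sentences (at least 5, at most all of them) as the working document.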
a=int((10*len(final_textrank_list))/100.0)
total=min(max(a,5),len(final_textrank_list))
for i in range(total):
document.append(final_textrank_list[i])
doc=" ".join(document)
for i in document:
doc_1=nlp(i)
text_doc.append([X.text for X in doc_1.ents])
entity_list = [item for sublist in text_doc for item in sublist]
entity_list = [word for word in entity_list if not word in all_stopwords]
entity_list = [word_entity for word_entity in entity_list if(p.singular_noun(word_entity) == False)]
entity_list=list(dict.fromkeys(entity_list))
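# Embed the pooled top sentences and every candidate entity with model_2, then keep the top_n
# candidates closest (by cosine similarity) to the document embedding as keyphrases.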
doc_embedding = model_2.encode([doc])
candidates=entity_list
candidate_embeddings = model_2.encode(candidates)
distances = cosine_similarity(doc_embedding, candidate_embeddings)
top_n = max_num_keywords
keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
keywords = '\n'.join(keyword_list)
c_len=(len(keyword_list))
keyword_embeddings = embedder.encode(keyword_list)   # SapBERT embeddings of the keyphrases, used for clustering
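# Try K-means for every k in [1, top_n) and keep the clustering with the highest silhouette score
# (degenerate cases where the silhouette score is undefined are assigned fixed values below).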
for num_clusters in range(1, top_n):
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(keyword_embeddings)
cluster_assignment = clustering_model.labels_
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
clustered_sentences[cluster_id].append(keyword_list[sentence_id])
cl_sent_len=(len(clustered_sentences))
list_cluster=list(clustered_sentences)
cluster_list_final.append(list_cluster)
if (c_len==cl_sent_len and c_len>=3) or cl_sent_len==1:
silhouette_avg = 0
silhouette_score_list.append(silhouette_avg)
elif c_len==cl_sent_len==2:
silhouette_avg = 1
silhouette_score_list.append(silhouette_avg)
else:
silhouette_avg = silhouette_score(keyword_embeddings, cluster_assignment)
silhouette_score_list.append(silhouette_avg)
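# Select the clustering with the best silhouette score and build a Boolean PubMed query:
# terms within a cluster are OR-ed, cluster pairs are joined with AND, and the pairs are OR-ed together.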
res_dict = dict(zip(silhouette_score_list, cluster_list_final))
cluster_items=res_dict[max(res_dict)]
for i in cluster_items:
z=' OR '.join(i)
comb.append("("+z+")")
comb_list.append(comb)
combinations = []
for subset in itertools.combinations(comb, 2):
combinations.append(subset)
f1_list=[]
for s in combinations:
final = ' AND '.join(s)
f1_list.append("("+final+")")
f_1=' OR '.join(f1_list)
final_list.append(f_1)
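# ESearch: query PubMed through the NCBI E-utilities with the Boolean query, sorted by relevance,
# and collect the matching PubMed IDs from the XML response.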
ncbi_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
last_url='esearch.fcgi?db=pubmed'+'&term='+f_1
overall_url=ncbi_url+last_url+'&sort=relevance'   # ESearch returns XML by default
pubmed_search_request = requests.get(overall_url)
root = ET.fromstring(pubmed_search_request.text)
levels = root.findall('.//Id')
search_id_list=[]
for level in levels:
name = level.text
search_id_list.append(name)
all_search_ids = ','.join(search_id_list)
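# EFetch: retrieve the top 20 matching records as XML and pull out their titles and abstracts.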
fetch_url='efetch.fcgi?db=pubmed'
search_id='&id='+all_search_ids
ret_type='&rettype=text'
ret_mode='&retmode=xml'
ret_max='&retmax=20'
ret_sort='&sort=relevance'
return_url=ncbi_url+fetch_url+search_id+ret_type+ret_mode+ret_max+ret_sort
pubmed_abstract_request = requests.get(return_url)
root_1 = ET.fromstring(pubmed_abstract_request.text)
article_title = root_1.findall('.//ArticleTitle')
for a in article_title:
article_title_name = a.text
titles_list.append(article_title_name)
article_abstract = root_1.findall('.//AbstractText')
for b in article_abstract:
article_abstract_name = b.text
abstracts_list.append(article_abstract_name)
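# Pair titles with abstracts (pd.Series pads the shorter list with NaN) and return them as a DataFrame.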
mydict = {'Title': titles_list, 'Abstract':abstracts_list}
df_new = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in mydict.items() ]))
return df_new
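# Gradio interface: the user supplies an article link, picks two SBERT models (one for TextRank,
# one for keyphrase extraction) and the maximum number of keywords; the output is a paginated
# DataFrame of PubMed titles and abstracts.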
igen_pubmed = gr.Interface(keyphrase_generator,
inputs=[gr.inputs.Textbox(lines=1, placeholder="Provide article web link here",default="", label="Article web link"),
gr.inputs.Dropdown(choices=['sentence-transformers/all-mpnet-base-v2',
'sentence-transformers/all-mpnet-base-v1',
'sentence-transformers/all-distilroberta-v1',
'sentence-transformers/gtr-t5-large',
'pritamdeka/S-Bluebert-snli-multinli-stsb',
'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
'sentence-transformers/stsb-mpnet-base-v2',
'sentence-transformers/stsb-roberta-base-v2',
'sentence-transformers/stsb-distilroberta-base-v2',
'sentence-transformers/sentence-t5-large',
'sentence-transformers/sentence-t5-base'],
type="value",
default='sentence-transformers/all-mpnet-base-v1',
label="Select any SBERT model for TextRank from the list below"),
gr.inputs.Dropdown(choices=['sentence-transformers/paraphrase-mpnet-base-v2',
'sentence-transformers/all-mpnet-base-v1',
'sentence-transformers/paraphrase-distilroberta-base-v1',
'sentence-transformers/paraphrase-xlm-r-multilingual-v1',
'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
'sentence-transformers/paraphrase-albert-small-v2',
'sentence-transformers/paraphrase-albert-base-v2',
'sentence-transformers/paraphrase-MiniLM-L12-v2',
'sentence-transformers/paraphrase-MiniLM-L6-v2',
'sentence-transformers/all-MiniLM-L12-v2',
'sentence-transformers/all-distilroberta-v1',
'sentence-transformers/paraphrase-TinyBERT-L6-v2',
'sentence-transformers/paraphrase-MiniLM-L3-v2',
'sentence-transformers/all-MiniLM-L6-v2'],
type="value",
default='sentence-transformers/all-mpnet-base-v1',
label="Select any SBERT model for keyphrases from the list below"),
gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")],
outputs=gr.outputs.Dataframe(type="auto", label="dataframe",max_cols=None, max_rows=10, overflow_row_behaviour="paginate"),
theme="dark-peach",
title="PubMed Abstract Retriever", description="Retrieves relevant PubMed abstracts for an online article which can be used as further references.",
article= "This work is based on the paper <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>provided here</a>."
"\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
"\t The application then uses a UMLS based BERT model, <a href=https://arxiv.org/abs/2010.11784>SapBERT</a> to cluster the keyphrases using K-means clustering method and finally create a boolean query. After that the top 20 titles and abstracts are retrieved from PubMed database and displayed according to relevancy. "
"\t The list of SBERT models required in the textboxes can be found in <a href=www.sbert.net/docs/pretrained_models.html>SBERT Pre-trained models hub</a>."
"\t The default model names are provided which can be changed from the list of pretrained models. "
"\t The value of keyphrases can be changed. The default value is 10, minimum is 5 and a maximum value of 30.")
igen_pubmed.launch(share=True,server_name='0.0.0.0',show_error=True)