pritamdeka committed on
Commit
0f67148
β€’
1 Parent(s): f798e92

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import re
3
+ import nltkmodules
4
+
5
+ from nltk.tokenize import word_tokenize
6
+ from sentence_transformers import SentenceTransformer
7
+ import pandas as pd
8
+ import numpy as np
9
+ from pandas import ExcelWriter
10
+ from torch.utils.data import DataLoader
11
+ import math
12
+ from sentence_transformers import models, losses
13
+ from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
14
+ from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
15
+ from sentence_transformers.readers import *
16
+ import logging
17
+ import glob
18
+ from datetime import datetime
19
+ import sys
20
+ from nltk.corpus import stopwords
21
+ stop_words = stopwords.words('english')
22
+ from sklearn.metrics.pairwise import cosine_similarity
23
+ import scipy.spatial
24
+ import networkx as nx
25
+ from nltk.tokenize import sent_tokenize
26
+ import scispacy
27
+ import spacy
28
+ import en_core_sci_lg
29
+ from spacy import displacy
30
+ from scispacy.abbreviation import AbbreviationDetector
31
+ from scispacy.umls_linking import UmlsEntityLinker
32
+ from transformers import AutoTokenizer, AutoModel
33
+ import statistics
34
+ import string
35
+ from nltk.stem.wordnet import WordNetLemmatizer
36
+ import gradio as gr
37
+
38
+ nlp = en_core_sci_lg.load()
39
+ sp = en_core_sci_lg.load()
40
+ all_stopwords = sp.Defaults.stop_words
41
+
42
+
43
+ def remove_stopwords(sen):
44
+ sen_new = " ".join([i for i in sen if i not in stop_words])
45
+ return sen_new
46
+
47
+ def keyphrase_generator(article, model_1, model_2, max_num_keywords):
48
+ element=[]
49
+ document=[]
50
+ text=[]
51
+ model_1 = SentenceTransformer(model_1)
52
+ model_2 = SentenceTransformer(model_2)
53
+ corpus=sent_tokenize(article)
54
+ clean_sentences_new = pd.Series(corpus).str.replace("[^a-zA-Z]", " ").tolist()
55
+ corpus_embeddings = model_1.encode(clean_sentences_new)
56
+ sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
57
+ for i in range(len(clean_sentences_new)):
58
+ for j in range(len(clean_sentences_new)):
59
+ if i != j:
60
+ sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1,768), corpus_embeddings[j].reshape(1,768))[0,0]
61
+ nx_graph = nx.from_numpy_array(sim_mat)
62
+ scores = nx.pagerank(nx_graph)
63
+ ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(corpus)), reverse=True)
64
+ for elem in ranked_sentences:
65
+ element.append(elem[1])
66
+ a=int((10*len(element))/100.0)
67
+ if(a<5):
68
+ total=5
69
+ else:
70
+ total=int(a)
71
+ for i in range(total):
72
+ document.append(element[i])
73
+ doc=" ".join(document)
74
+ for i in document:
75
+ doc_1=nlp(i)
76
+ text.append([X.text for X in doc_1.ents])
77
+ entity_list = [item for sublist in text for item in sublist]
78
+ entity_list = [word for word in entity_list if not word in all_stopwords]
79
+ entity_list=list(dict.fromkeys(entity_list))
80
+ doc_embedding = model_2.encode([doc])
81
+ candidates=entity_list
82
+ candidate_embeddings = model_2.encode(candidates)
83
+ distances = cosine_similarity(doc_embedding, candidate_embeddings)
84
+ top_n = max_num_keywords
85
+ keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
86
+ keywords = '\n'.join(keyword_list)
87
+ return keywords
88
+
89
+
90
+ gr.Interface(keyphrase_generator,
91
+ inputs=[gr.inputs.Textbox(lines=10, placeholder="Copy article text here",default="", label="article text"),gr.inputs.Textbox(lines=1, placeholder="SBERT model",default="all-mpnet-base-v2", label="Model for TextRank (e.g. all-mpnet-base-v2)"),gr.inputs.Textbox(lines=1, placeholder="SBERT model",default="all-distilroberta-v1",label="Model for keyphrases (e.g. all-distilroberta-v1)"),gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")],
92
+ outputs="text", theme=None, title="Scientifc Article Keyphrase Generator", article="Generates the keyphrases from an article which best describes the article."
93
+ "\t The work is part of the paper ""."
94
+ "\t It uses the TextRank algorithm to first find the top sentences and then extracts the keyphrases from those sentences."
95
+ "\t The list of SBERT models required in the textboxes can be found in https://www.sbert.net/docs/pretrained_models.html."
96
+ "\t The default model names are provided which can be changed from the list of pretrained models. "
97
+ "\t The value of output keyphrases can be changed. The default value is 10, minimum is 5 and a maximum value of 30.").launch(share=True)