Faizan15 committed on
Commit 1f73310
1 Parent(s): 03f159d

Create app.py

Files changed (1)
  1. app.py +313 -0
app.py ADDED
@@ -0,0 +1,313 @@
# -*- coding: utf-8 -*-
"""Project_KeyExtraction-NLP.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1adCS5In25XQnEQ53D2H9CjaX7jL9yz6Z
"""

import pandas
import nltk
nltk.download('wordnet')

# Load the dataset
dataset = pandas.read_csv('covid_abstracts.csv')
dataset.head()
# Fetch the word count for each title
dataset['word_count'] = dataset['title'].apply(lambda x: len(str(x).split(" ")))
dataset[['title', 'word_count']].head()

## Descriptive statistics of word counts
dataset.word_count.describe()

# Identify the most common words
freq = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[:20]
# freq = pandas.Series(' '.join(dataset['title']).split()).value_counts()[:20]
freq

# Identify the least common words
freq1 = pandas.Series(' '.join(dataset['title'].astype(str)).split()).value_counts()[-20:]
# freq1 = pandas.Series(' '.join(dataset['title']).split()).value_counts()[-20:]
freq1
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Compare stemming and lemmatization on a sample word
lem = WordNetLemmatizer()
stem = PorterStemmer()
word = "cryptogenic"
print("stemming:", stem.stem(word))
print("lemmatization:", lem.lemmatize(word, "v"))
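# Illustrative sketch (not part of the original notebook): the difference between the two is
# clearer on an inflected word, where the stemmer truncates to a root-like token while the
# lemmatizer returns a dictionary form (e.g. "studi" vs. "study").
print("stemming:", stem.stem("studies"))
print("lemmatization:", lem.lemmatize("studies", "v"))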
# Libraries for text preprocessing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

## Create a set of stop words and add custom stopwords
stop_words = set(stopwords.words("english"))
## Custom stopwords
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = stop_words.union(new_words)

print(stop_words)

print(new_words)
corpus = []
for i in range(len(dataset)):
    # Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', str(dataset['title'][i]))

    # Convert to lowercase
    text = text.lower()

    # Remove tags
    text = re.sub("</?.*?>", " <> ", text)

    # Remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)

    # Convert the string to a list of tokens
    text = text.split()

    # Stemming (the stemmer is instantiated but only lemmatization is applied below)
    ps = PorterStemmer()
    # Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)

# View corpus items
corpus[222]

corpus[300]
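# Illustrative sketch (not in the original notebook): the same cleaning steps wrapped in a
# reusable helper, so a single unseen title can be preprocessed identically before being fed
# to the vectorizers below. The name `clean_text` is hypothetical.
def clean_text(raw):
    text = re.sub('[^a-zA-Z]', ' ', str(raw)).lower()
    text = re.sub("</?.*?>", " <> ", text)
    text = re.sub(r"(\d|\W)+", " ", text)
    tokens = [lem.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)

# Example: clean_text("Incidence of cryptogenic stroke during the COVID-19 pandemic")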
# Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
    random_state=42
).generate(" ".join(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)
from sklearn.feature_extraction.text import CountVectorizer
import re

# Assuming 'corpus' and 'stop_words' are defined as above

# Create a CountVectorizer with the built-in English stop-word list
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))
X = cv.fit_transform(corpus)

# Alternatively, pass a custom stop-word list (complete the placeholder before enabling):
# custom_stop_words = ['same', 'hers', 'they', 'with', 'if', 'y', 'iv', 'new', ...]  # Add your custom stop words
# cv = CountVectorizer(max_df=0.8, stop_words=custom_stop_words, max_features=10000, ngram_range=(1, 3))
# X = cv.fit_transform(corpus)

# Or reuse the combined NLTK + custom stop words built earlier:
# cv = CountVectorizer(max_df=0.8, stop_words=stop_words, max_features=10000, ngram_range=(1, 3))
# X = cv.fit_transform(corpus)

list(cv.vocabulary_.keys())[:10]
# Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Convert the most frequent words to a dataframe for plotting
top_words = get_top_n_words(corpus, n=20)
top_df = pandas.DataFrame(top_words)
top_df.columns = ["Word", "Freq"]

# Bar plot of the most frequent words
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
plt.show()
# Most frequently occurring bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2, 2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top2_words = get_top_n2_words(corpus, n=20)
top2_df = pandas.DataFrame(top2_words)
top2_df.columns = ["Bi-gram", "Freq"]
print(top2_df)

# Bar plot of the most frequent bi-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
h = sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45)
plt.show()
# Most frequently occurring tri-grams
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3, 3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top3_words = get_top_n3_words(corpus, n=20)
top3_df = pandas.DataFrame(top3_words)
top3_df.columns = ["Tri-gram", "Freq"]
print(top3_df)

# Bar plot of the most frequent tri-grams
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)
plt.show()
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Assuming you already have the 'corpus' defined

# Create a CountVectorizer
cv = CountVectorizer(max_df=0.8, stop_words='english', max_features=10000, ngram_range=(1, 3))

# Fit and transform the corpus
X = cv.fit_transform(corpus)

# Create a TfidfTransformer and fit it to the CountVectorizer output
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# Get feature names from the CountVectorizer
feature_names = cv.get_feature_names_out()

# Fetch the document for which keywords need to be extracted
doc = corpus[82]

# Generate tf-idf scores for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
# Function for sorting tf-idf scores in descending order
from scipy.sparse import coo_matrix

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items."""

    # Use only the top n items from the vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    # Word index and corresponding tf-idf score
    for idx, score in sorted_items:
        # Keep track of the feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    # Build a dict of feature -> score
    # results = zip(feature_vals, score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]

    return results

# Sort the tf-idf vector by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())
# Extract only the top n; n here is 10
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

# Print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k, keywords[k])
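# Illustrative sketch (not in the original notebook): the fitted `cv` and `tfidf_transformer`
# can be reused to extract keywords from arbitrary raw text, assuming the hypothetical
# `clean_text` helper sketched earlier (or equivalent preprocessing) is applied first.
def extract_keywords(raw_text, topn=10):
    cleaned = clean_text(raw_text)
    vector = tfidf_transformer.transform(cv.transform([cleaned]))
    return extract_topn_from_vector(feature_names, sort_coo(vector.tocoo()), topn)

# Example: extract_keywords("Incidence of cryptogenic stroke during the COVID-19 pandemic")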
from gensim.models import word2vec
tokenized_sentences = [sentence.split() for sentence in corpus]
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)

model.wv.most_similar(positive=["incidence"])
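# Illustrative sketch (not in the original notebook, assumes gensim 4.x): guard the similarity
# query, since a word that does not survive preprocessing would raise a KeyError when it is
# looked up in the Word2Vec vocabulary.
query = "incidence"
if query in model.wv:
    print(model.wv.most_similar(positive=[query]))
else:
    print(f"'{query}' not in the Word2Vec vocabulary; sample keys:", model.wv.index_to_key[:10])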
import nltk
# nltk.download('omw-1.4')  # may be needed for WordNet on newer NLTK releases
from nltk.corpus import wordnet as wn

wn.synsets('car')

wn.synset('car.n.01').definition()

import gradio as gr

# Function to get the definition of the first synset for a given word
def get_synset_definition(word):
    synsets = wn.synsets(word)
    if synsets:
        first_synset = synsets[0]
        return first_synset.definition()
    else:
        return "No synsets found for the given word."
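# Illustrative sketch (not part of the original app): a variant that returns every sense of the
# word rather than only the first synset. It is not wired into the interface below.
def get_all_synset_definitions(word):
    synsets = wn.synsets(word)
    if not synsets:
        return "No synsets found for the given word."
    return "\n".join(f"{s.name()}: {s.definition()}" for s in synsets)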
# Gradio interface
iface = gr.Interface(
    fn=get_synset_definition,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    live=True,
    title="WordNet Synset Definition",
    description="Enter a word to get the definition of its first WordNet synset.",
)

# Launch the Gradio interface
iface.launch()