mariamisoieva committed
Commit ef89d5e
1 Parent(s): dcc1589

Create app.py

Files changed (1): app.py (+496, -0)
app.py ADDED
@@ -0,0 +1,496 @@
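# app.py: GPT-2 (TensorFlow) text generation with a coherence-based reranker.
# The script indexes the first 300 ROCStories by lemma using stanza dependency parses,
# defines the FRel relatedness measure (root-predicate statistics plus WordNet/Lesk
# argument similarity), and serves several decoding strategies through a Gradio interface.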
import collections
import itertools
import json
import warnings
from collections import defaultdict

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import stanza
import tensorflow as tf
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.wsd import lesk
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

warnings.filterwarnings('ignore')

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# add the EOS token as PAD token to avoid warnings
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

stanza.download('en')
nltk.download('punkt')
nltk.download('wordnet')

savejson = True
indexing = True

nlp = stanza.Pipeline()
lemmatizer = WordNetLemmatizer()
stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')
def defdict():
    return defaultdict(list)

class Sentence:
    def __init__(self, textid, sentencenum, sentence, vectors=None, tfidfs=None, sentenceVector=None, lemmatized=None, preds=None, args=None):
        self.textid = textid
        self.sentence = sentence
        self.sentencenum = sentencenum
        self.vectors = vectors
        self.tfidfs = tfidfs
        self.sentenceVector = sentenceVector
        self.preds = preds
        self.args = args
        if lemmatized:
            self.lemmatized = lemmatized
        else:
            self.lemmatize()

    def lemmatize(self):
        # Parse the sentence with stanza and collect:
        #   lemmatized - all non-punctuation lemmas,
        #   preds      - the lemma of the dependency root (the main predicate),
        #   args       - lemmas of the words directly governed by the root.
        doc = nlp(self.sentence)
        self.lemmatized = []
        self.preds = []
        ind = 0
        self.args = []
        for i, dep_edge in enumerate(doc.sentences[0].dependencies):
            if dep_edge[1] != 'punct':
                self.lemmatized.append(dep_edge[2].lemma)
            if dep_edge[1] == "root":
                self.preds.append(dep_edge[2].lemma)
                ind = i + 1
        for dep_edge in doc.sentences[0].dependencies:
            if int(dep_edge[2].head) == ind and dep_edge[1] != 'punct':  # dep_edge[1] not in x
                self.args.append(lemmatizer.lemmatize(dep_edge[2].lemma.lower()))
        self.doc = doc

    def calculateVector(self):
        if self.vectors and self.tfidfs:
            self.sentenceVector = np.dot(self.tfidfs, self.vectors)
        return self.sentenceVector

    def getVector(self):
        if self.sentenceVector is None:
            self.calculateVector()
        return self.sentenceVector

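# Illustrative example (values are approximate and depend on the stanza parse):
#   s = Sentence(0, 0, "He decided to buy a pair of khakis.")
#   s.preds -> ['decide']        (lemma of the dependency root)
#   s.args  -> ['he', 'buy']     (lemmas whose head is the root, punctuation excluded)
#   s.lemmatized -> all non-punctuation lemmas of the sentence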
class Story:
    def __init__(self, sentences, number):
        self.sentences = sentences
        self.number = number

    def lemmatizedSents(self):
        lemSents = []
        for s in self.sentences:
            lemSents.append(s.lemmatized)
        return lemSents

storiesSentences = []
sentencesjsons = []

def indexSents(sents):
    ind = defaultdict(defdict)
    for sc in sents:
        for i, w in enumerate(sc.lemmatized):
            ind[w][sc.textid].append((i, sc.sentencenum))
    return ind

def indexCorpus():
    sentences = []
    # textid, sentencenum, sentence
    for i, story in stories[:300].iterrows():
        storiesSentences.append([])
        # document = ""
        print(i)
        for sind, sent in enumerate(story[2:], start=1):
            sentence = Sentence(i, sind - 1, sent)
            # print(sent)
            # print(i)
            # print(sentence.sentencenum)
            # document.join(sent)
            storiesSentences[i].append(sentence)
            sentences.append(sentence)
            sentencesjsons.append(sentence.__dict__)
        # storiesClasses.append(Story(storiesSentences[i],i))
        # documents.append(document)
    return indexSents(sentences)

if savejson:
    index = indexCorpus()
    # json.dump(sentencesjsons, open('filename.json', 'a'))
else:
    sentencesjsons = json.load(open('filename.json'))
# if indexing:
#     json.dump(index, open('index.json', 'w'))
# else:
#     index = json.load(open('index.json'))

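# The index maps lemma -> story id -> list of (token position, sentence number) pairs,
# e.g. index['play'][12] -> [(2, 0), (4, 3)] (illustrative values).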
def searchByRequest(words):
    sents = set()
    dicts = []
    keys = []  # story numbers
    synonyms = []
    for i, w in enumerate(words):
        synonyms.append(set())
        synonyms[i].update([w])
        for synset in wordnet.synsets(w):
            synonyms[i].update(synset.lemma_names())
    # print(synonyms)
    stories = []
    dictsForWords = []
    storiesForWords = []
    for i, w in enumerate(synonyms):
        dictsForWords.append([])
        storiesForWords.append(set())
        for synonym in w:
            currentDict = index[synonym]
            if currentDict:
                dictsForWords[i].append(currentDict)
                storiesForWords[i].update(set(currentDict.keys()))
    paragraphs = set.intersection(*storiesForWords)
    # print(paragraphs)
    # print(dictsForWords)
    # print(dicts)
    sentencesClasses = set()
    temporarySentencesByParagraphs = [[set()] * len(words)] * len(paragraphs)
    for pi, p in enumerate(paragraphs):
        temporarySentences = []
        for wi, wordDictsList in enumerate(dictsForWords):
            temporarySentences.append(set())
            # print(wordDictsList)
            for dictionary in wordDictsList:
                if p in dictionary:
                    for entry in dictionary[p]:
                        # print(entry)
                        temporarySentences[wi].update([entry[1]])
            # print(temporarySentences[wi])
            # print(temporarySentences)
            if wi > 0 and len(words) > 1:
                for i in range(wi):
                    for s in temporarySentences[wi]:
                        if s in temporarySentences[i]:
                            sentencesClasses.update([storiesSentences[p][s]])
    # for sentence in sentencesClasses:
    #     print(sentence.lemmatized)
    #     print(sentence.sentence, sentence.textid, sentence.sentencenum)
    return sentencesClasses

# m = searchByRequest(['play', 'fun', 'game'])
# m = searchByRequest(['present', 'Christmas', 'wake'])
def predIndex():
    # stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')
    ind = defaultdict(defdict)
    for i, story in enumerate(storiesSentences):
        for j, sent in enumerate(story):
            for s in sent.preds:
                ind[s][i].append(j)
    return ind

preds = predIndex()

def powC(subj):
    c = 0
    for k, v in preds[subj].items():
        c += len(v)
    return c

def powCons(s1, s2):
    count = 0
    for i in (preds[s1].keys() & preds[s2].keys()):
        i1 = 0
        i2 = 0
        while i1 != len(preds[s1][i]) and i2 != len(preds[s2][i]):  # for d in preds[s1][i]:
            if preds[s1][i][i1] + 1 == preds[s2][i][i2]:
                count += 1
                i1 += 1
                i2 += 1
            elif preds[s1][i][i1] + 1 < preds[s2][i][i2]:
                i1 += 1
            else:
                i2 += 1
    return count

# print(powCons('decide', 'make'))
# print(powCons('know', 'buy'))
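# powC(p) counts how many indexed sentences have predicate p as their root;
# powCons(p1, p2) counts how often a p1-rooted sentence is immediately followed
# by a p2-rooted sentence within the same story.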
def synset_lesk(sent, word):
    sent_tok = nltk.tokenize.word_tokenize(sent)
    return lesk(sent_tok, word)  # ,pos

# comparison of wsd
# def wpsim():
# def wpsim_by_max():
def wpsim_lesk(word1, sent1, word2, sent2):
    synset1 = lesk(sent1, word1)
    # print(synset1.definition())
    synset2 = lesk(sent2, word2)
    # print(synset2.definition())
    return synset1.wup_similarity(synset2)

x = ['punct', 'conj']

def args_of_pred(s):
    return s.args

import math

alpha = 0.5

def FRelPred(sent1, sent2):
    try:
        p1 = sent1.preds[0]
        p2 = sent2.preds[0]
        if powCons(p1, p2) == 0:
            return 0.0
        return math.log2(powCons(p1, p2) / (powC(p1) * powC(p2)))
    except:
        # fall back to a small default relatedness when lookup or parsing fails
        return 0.2
def FRelArgs(s1, s2):
    try:
        args1 = args_of_pred(s1)
        args2 = args_of_pred(s2)
        # print(args1, args2)
        sent_tok1 = s1.lemmatized  # nltk.tokenize.word_tokenize(s1)
        sent_tok2 = s2.lemmatized  # nltk.tokenize.word_tokenize(s2)
        # print(sent_tok1, sent_tok2)
        sum1 = 0
        sum2 = 0
        max1 = 0
        max2 = 0
        wpsim = 0
        for ni in args1:
            synsetni = lesk(sent_tok1, ni)  # pos
            # print(synsetni, ni)
            synsetnj = lesk(sent_tok2, args2[0])
            # print(synsetnj, args2[0])
            # if synsetni != None and synsetnj != None:
            if not (synsetnj is None or synsetni is None):
                max1 = synsetni.wup_similarity(synsetnj)  # wp_sim(ni, args2[0])
                # print(type(max1))
            if max1 is None:
                max1 = 0
            for nj in args2[1:]:
                synsetnj = lesk(sent_tok2, nj)
                # print(synsetni, ni)
                # print(synsetnj, nj)
                # if synsetni != None and synsetnj != None:
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj)  # (ni, nj)
                if wpsim is None:
                    wpsim = 0
                if (None not in [wpsim, max1]) and wpsim > max1:
                    max1 = wpsim
                # print(wpsim, max1)
            sum1 += max1
            # print(sum1)

        for ni in args2:
            synsetni = lesk(sent_tok2, ni)
            synsetnj = lesk(sent_tok1, args1[0])
            if not (synsetnj is None or synsetni is None):
                max2 = synsetni.wup_similarity(synsetnj)  # wp_sim(ni, args2[0])
            if max2 is None:
                max2 = 0
            for nj in args1[1:]:
                synsetnj = lesk(sent_tok1, nj)
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj)  # (ni, nj)
                if wpsim is None:
                    wpsim = 0
                if (None not in [wpsim, max2]) and wpsim > max2:
                    # if (wpsim is not None) and wpsim > max2:
                    max2 = wpsim
            sum2 += max2
        # print(len(args1))
        # print(len(args2))
        # print(sum1, sum2)
        return 0.5 * ((1 / len(args1)) * sum1 + (1 / len(args2)) * sum2)
    except:
        return 0.2

def FRel(s1, s2):
    return alpha * FRelPred(s1, s2) + (1 - alpha) * FRelArgs(s1, s2)
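# FRel combines predicate- and argument-level relatedness:
#   FRel(s1, s2) = alpha * FRelPred(s1, s2) + (1 - alpha) * FRelArgs(s1, s2), with alpha = 0.5.
# Both s1 and s2 are expected to be Sentence objects (the terms use .preds, .args and .lemmatized).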
def hac(foundSentences, length=2):
    R = 0.1
    twoSentenceClusters = []
    numfound = len(foundSentences)

    sentencePairs = []
    frelijs = []
    ind = 0
    maxind = 0
    maxval = 0
    for i in itertools.permutations(foundSentences, 2):
        if i[0].textid != i[1].textid:
            frelij = FRel(i[0], i[1])
            if frelij > R:
                sentencePairs.append(list(i))
                frelijs.append(frelij)
                if ind != 0:
                    if frelij > maxval:
                        maxval = frelij
                        maxind = ind
                    ind += 1
                else:
                    ind = 1
                    maxval = frelij
                    maxind = 0

    # print(sentencePairs)
    maxvalThree = 0
    maxSentsThree = []
    threeSentsCluster = set()
    for pairind, pair in enumerate(sentencePairs):
        for sent in foundSentences:
            if sent.textid != pair[0].textid and sent.textid != pair[1].textid:
                frelij = FRel(sent, pair[0])
                if frelij > R:
                    threeSentsCluster.add(tuple([sent] + pair))
                    current = (frelijs[maxind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = [sent] + pair
                frelij = FRel(pair[1], sent)
                if frelij > R:
                    threeSentsCluster.add(tuple(pair + [sent]))
                    current = (frelijs[maxind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = pair + [sent]
    # print(sentencePairs)
    # print(threeSentsCluster)
    # for pair in sentencePairs:
    #     print(pair[0].sentence, pair[1].sentence)
    # for cluster in threeSentsCluster:
    #     print(cluster[0].sentence, cluster[1].sentence, cluster[2].sentence)
    # print([sentencePairs[maxind], maxSentsThree])
    if len(sentencePairs) >= 1:
        return [sentencePairs[maxind], maxSentsThree]  # sentencePairs + list(threeSentsCluster)
    else:
        return []

# print(FRelPred('David noticed he had put on a lot of weight recently.',
#                'He examined his habits to try and figure out the reason.'))
# #              'After a few weeks, he started to feel much better.'))
# print(FRel('David noticed he had put on a lot of weight recently.',
#            'He examined his habits to try and figure out the reason.'))
# print(FRel('He decided to buy a pair of khakis.', 'The pair he bought fit him perfectly.'))
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
import gensim.downloader

def tfidfTokenizer(x):
    return [w for words in [s.lemmatized for s in x] for w in words]

def preprocess(x):
    return x

# tfidfvectorizer = TfidfVectorizer(tokenizer=tfidfTokenizer, preprocessor=preprocess, use_idf=True)
# tfidfvectorizer_vectors = tfidfvectorizer.fit_transform(storiesSentences)  # (map(lambda x: [s.lemmatized for s in np.array(x).flatten()]))
# wvModel = gensim.downloader.load("word2vec-google-news-300")
# feature_names = tfidfvectorizer.get_feature_names()

def setVectors(stories):
    for doc in stories:
        for sentence in doc:
            vectors = []
            for lemma in sentence.lemmatized:
                try:
                    vectors.append(wvModel[lemma])
                except:
                    vectors.append([0] * 300)
            sentence.vectors = vectors
# setVectors(storiesSentences)

def setTfIdfs(documents):
    for i, doc in enumerate(documents):
        feature = tfidfvectorizer_vectors[i, :].nonzero()[1]
        tfidfs = zip(feature, [tfidfvectorizer_vectors[i, x] for x in feature])
        tfidfsbyword = dict()
        for w, s in [(feature_names[j], s) for (j, s) in tfidfs]:
            tfidfsbyword[w] = s
        for sent in doc:
            tfidfs = []
            for lemma in sent.lemmatized:
                tfidfs.append(tfidfsbyword[lemma])
            sent.tfidfs = tfidfs
            sent.calculateVector()
# setTfIdfs(storiesSentences)
# m = searchByRequest(['wake', 'present', 'Christmas'])
# for sent in m:
#     print(sent.lemmatized)
def generate(words):
    m = searchByRequest(words)
    return hac(m)

def generate_and_choose(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    beam_outputs = model.generate(input_ids, max_length=100, num_return_sequences=3, num_beams=3, no_repeat_ngram_size=2, early_stopping=True)
    return_list = []
    for i, beam_output in enumerate(beam_outputs):
        # print(beam_output)
        return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True))
    outputs_coherences = []
    for i, text in enumerate(return_list):
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        for s in range(length - 1):
            # FRel expects Sentence objects, so wrap the raw sentence strings
            coherence_cur += FRel(Sentence(-1, s, sentencesTokenized[s]),
                                  Sentence(-1, s + 1, sentencesTokenized[s + 1]))
        if length == 1:
            length += 1
        outputs_coherences.append(coherence_cur / (length - 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]

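# Coherence reranking (used by generate_and_choose above and with_nucleus_sampling below):
# each candidate continuation is split into sentences with nltk.sent_tokenize and scored by
# the mean FRel of adjacent sentence pairs; the highest-scoring candidate is returned.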
def greedy_generate(inp):
    input_ids = tokenizer.encode(inp, return_tensors='tf')
    greedy_output = model.generate(input_ids, pad_token_id=tokenizer.encode('.')[0], eos_token_id=tokenizer.encode('.')[0])
    return tokenizer.decode(greedy_output[0], skip_special_tokens=True)

def with_sampling(input_ids):
    tf.random.set_seed(0)
    # activate sampling and deactivate top_k by setting top_k sampling to 0
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=0,
        temperature=0.7)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)

def with_top_k_sampling(input_ids):
    tf.random.set_seed(0)
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)

def with_nucleus_sampling(input_ids):
    tf.random.set_seed(0)
    # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
    sample_outputs = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50,
        top_p=0.95,
        num_return_sequences=3)
    return_list = []
    for i, beam_output in enumerate(sample_outputs):
        # print(beam_output)
        return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True))
    outputs_coherences = []
    for i, text in enumerate(return_list):
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        for s in range(length - 1):
            # FRel expects Sentence objects, so wrap the raw sentence strings
            coherence_cur += FRel(Sentence(-1, s, sentencesTokenized[s]),
                                  Sentence(-1, s + 1, sentencesTokenized[s + 1]))
        if length == 1:
            length += 1
        outputs_coherences.append(coherence_cur / (length - 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]

def generation_method(decoding_algorithm, input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    if decoding_algorithm == "Beam search":
        return generate_and_choose(input_text)
    elif decoding_algorithm == "Greedy search":
        return greedy_generate(input_text)
    elif decoding_algorithm == "With sampling":
        return with_sampling(input_ids)
    elif decoding_algorithm == "With top k sampling":
        return with_top_k_sampling(input_ids)
    elif decoding_algorithm == "With nucleus sampling":
        return with_nucleus_sampling(input_ids)

in1 = gr.inputs.Dropdown(choices=["Beam search", "Greedy search", "With sampling", "With top k sampling", "With nucleus sampling"])
in2 = gr.inputs.Textbox()
iface = gr.Interface(fn=generation_method,
                     inputs=[in1, in2],
                     outputs="text")

iface.launch(debug=True)
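# Example use (illustrative): choose "Greedy search" and enter a prompt such as
# "David noticed he had put on a lot of weight recently." to have GPT-2 continue it;
# "Beam search" and "With nucleus sampling" additionally rerank candidates by FRel coherence.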