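# Gradio demo: GPT-2 text generation with several decoding strategies.
# Beam-search and nucleus-sampling outputs are re-ranked by a sentence-coherence
# score (FRel) whose statistics are built from the ROCStories corpus loaded below.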
import gradio as gr
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# add the EOS token as PAD token to avoid warnings
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
import stanza
stanza.download('en')
import nltk
nltk.download('punkt')
nltk.download('wordnet')
savejson = True
indexing = True
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.wsd import lesk
import itertools
import json
from collections import defaultdict
nlp = stanza.Pipeline()
lemmatizer = WordNetLemmatizer()
stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')
import warnings
warnings.filterwarnings('ignore')
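# Module-level factory for nested defaultdicts: index[word] is itself a
# defaultdict(list), so new (word, story) entries can be appended without key checks.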
def defdict():
return defaultdict(list)
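# One sentence of a story: keeps its story id, position in the story, the Stanza
# dependency parse (lemmas minus punctuation), the root predicate(s) and the
# lemmatized direct dependents of the root ("args"), plus optional word vectors
# and tf-idf weights used to build a weighted sentence vector.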
class Sentence:
def __init__(self, textid, sentencenum, sentence, vectors=None, tfidfs=None, sentenceVector=None, lemmatized=None, preds=None, args=None):
self.textid = textid
self.sentence = sentence
self.sentencenum = sentencenum
self.vectors = vectors
self.tfidfs = tfidfs
self.sentenceVector = sentenceVector
self.preds = preds
self.args = args
if lemmatized:
self.lemmatized = lemmatized
else:
self.lemmatize()
def lemmatize(self):
doc = nlp(self.sentence)
self.lemmatized = []
self.preds = []
ind = 0
self.args=[]
for i, dep_edge in enumerate(doc.sentences[0].dependencies):
if dep_edge[1] != 'punct':
self.lemmatized.append(dep_edge[2].lemma)
if dep_edge[1] == "root":
self.preds.append(dep_edge[2].lemma)
ind = i+1
for dep_edge in doc.sentences[0].dependencies:
if int(dep_edge[2].head) == ind and dep_edge[1] != 'punct': #dep_edge[1] not in x
self.args.append(lemmatizer.lemmatize(dep_edge[2].lemma.lower()))
self.doc = doc
def calculateVector(self):
if self.vectors and self.tfidfs:
self.sentenceVector = np.dot(self.tfidfs, self.vectors)
return self.sentenceVector
def getVector(self):
if self.sentenceVector is None:
self.calculateVector()
return self.sentenceVector
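# Thin container pairing a story's Sentence objects with the story number.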
class Story:
def __init__(self, sentences, number):
self.sentences = sentences
self.number = number
def lemmatizedSents(self):
lemSents = []
for s in self.sentences:
lemSents.append(s.lemmatized)
return lemSents
storiesSentences = []
sentencesjsons =[]
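# Inverted index over the first 300 ROCStories: word -> story id -> list of
# (token position, sentence number) pairs. indexCorpus() also wraps each of the
# five story sentences (the CSV columns after id and title) in a Sentence object
# and collects them in storiesSentences / sentencesjsons.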
def indexSents(sents):
ind = defaultdict(defdict)
for sc in sents:
for i, w in enumerate(sc.lemmatized):
ind[w][sc.textid].append((i, sc.sentencenum))
return ind
def indexCorpus():
sentences = []
# textid, sentencenum, sentence
for i, story in stories[:300].iterrows():
storiesSentences.append([])
# document = ""
print(i)
for sind, sent in enumerate(story[2:], start = 1):
sentence = Sentence(i, sind-1, sent)
# print(sent)
# print(i)
# print(sentence.sentencenum)
# document.join(sent)
storiesSentences[i].append(sentence)
sentences.append(sentence)
sentencesjsons.append(sentence.__dict__)
# storiesClasses.append(Story(storiesSentences[i],i))
# documents.append(document)
return indexSents(sentences)
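# With savejson=True the index is rebuilt in-process; the JSON caching below is
# only partially wired up (the else branch restores the sentences but not the index).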
if savejson:
index = indexCorpus()
#json.dump(sentencesjsons, open('filename.json', 'a'))
else:
sentencesjsons = json.load(open('filename.json'))
# if indexing:
# json.dump(index, open('index.json', 'w'))
# else:
# index = json.load(open('index.json'))
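# Query the index: each query word is expanded with its WordNet synonyms, the
# stories containing all query words (via any synonym) are intersected, and the
# function returns the Sentence objects in which at least two different query
# words co-occur.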
def searchByRequest(words):
sents = set()
dicts = []
keys = [] #story numbers
synonyms = []
for i, w in enumerate(words):
synonyms.append(set())
synonyms[i].update([w])
for synset in wordnet.synsets(w):
synonyms[i].update(synset.lemma_names())
# print(synonyms)
stories = []
dictsForWords = []
storiesForWords = []
for i, w in enumerate(synonyms):
dictsForWords.append([])
storiesForWords.append(set())
for synonym in w:
currentDict = index[synonym]
if currentDict:
dictsForWords[i].append(currentDict)
storiesForWords[i].update(set(currentDict.keys()))
paragraphs = set.intersection(*storiesForWords)
# print(paragraphs)
# print(dictsForWords)
# print(dicts)
sentencesClasses = set()
temporarySentencesByParagraphs = [[set()]*len(words)]*len(paragraphs)
for pi, p in enumerate(paragraphs):
temporarySentences = []
for wi, wordDictsList in enumerate(dictsForWords):
temporarySentences.append(set())
# print(wordDictsList)
for dictionary in wordDictsList:
if p in dictionary:
for sents in dictionary[p]:
# print(sents)
temporarySentences[wi].update([sents[1]])
# print(temporarySentences[wi])
# print(temporarySentences)
if wi>0 and len(words) > 1:
for i in range(wi):
for s in temporarySentences[wi]:
if s in temporarySentences[i]:
sentencesClasses.update([storiesSentences[p][s]])
# for sentence in sentencesClasses:
# print(sentence.lemmatized)
# print(sentence.sentence, sentence.textid, sentence.sentencenum)
return sentencesClasses
# m = searchByRequest(['play', 'fun', 'game'])
# m = searchByRequest(['present', 'Christmas', 'wake'])
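# Predicate index over the parsed stories: root lemma -> story id -> sentence positions.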
def predIndex():
# stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')
ind = defaultdict(defdict)
for i, story in enumerate(storiesSentences):
for j, sent in enumerate(story):
for s in sent.preds:
ind[s][i].append(j)
return ind
preds = predIndex()
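# powC(p): total number of sentences whose root predicate is p.
# powCons(p1, p2): number of times a sentence with root p1 is immediately
# followed, within the same story, by a sentence with root p2.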
def powC(subj):
c = 0
for k, v in preds[subj].items():
c += len(v)
return c
def powCons(s1, s2):
count = 0
for i in (preds[s1].keys() & preds[s2].keys()):
i1=0
i2=0
while i1 != len(preds[s1][i]) and i2 != len(preds[s2][i]): #for d in preds[s1][i]:
if preds[s1][i][i1] + 1 == preds[s2][i][i2]:
count += 1
i1 += 1
i2 += 1
elif preds[s1][i][i1] + 1 < preds[s2][i][i2]:
i1 += 1
else:
i2 += 1
return count
# print(powCons('decide', 'make'))
# print(powCons('know', 'buy'))
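# Word-sense disambiguation helpers: Lesk picks a WordNet synset from the
# sentence context, Wu-Palmer similarity then compares the chosen synsets.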
def synset_lesk(sent, word):
sent_tok = nltk.tokenize.word_tokenize(sent)
return lesk(sent_tok, word) #,pos
# comparison of wsd
# def wpsim():
# def wpsim_by_max():
def wpsim_lesk(word1, sent1, word2, sent2):
synset1 = lesk(sent1, word1)
# print(synset1.definition())
synset2 = lesk(sent2, word2)
# print(synset2.definition())
return synset1.wup_similarity(synset2)
x = ['punct', 'conj']
def args_of_pred(s):
return s.args
import math
alpha = 0.5
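# FRelPred: PMI-style relatedness of the two root predicates,
# log2(powCons(p1, p2) / (powC(p1) * powC(p2))); 0.0 if the predicates never
# appear in consecutive sentences, and 0.2 as a fallback on errors (e.g. a
# sentence with no root predicate). alpha weights FRelPred against FRelArgs in FRel below.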
def FRelPred(sent1, sent2):
try:
p1 = sent1.preds[0]
p2 = sent2.preds[0]
if powCons(p1, p2) == 0:
return 0.0
return math.log2(powCons(p1, p2) / (powC(p1)*powC(p2)))
except Exception:
return 0.2
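# FRelArgs: argument overlap of the two sentences. Each argument of one sentence
# is matched to its best Wu-Palmer similarity against the arguments of the other
# (synsets chosen with Lesk); the per-sentence averages of these maxima are then
# averaged in both directions. Returns 0.2 on errors.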
def FRelArgs(s1, s2):
try:
args1 = args_of_pred(s1)
args2 = args_of_pred(s2)
# print(args1, args2)
sent_tok1 = s1.lemmatized#nltk.tokenize.word_tokenize(s1)
sent_tok2 = s2.lemmatized#nltk.tokenize.word_tokenize(s2)
# print(sent_tok1, sent_tok2)
sum1 = 0
sum2 = 0
max1 = 0
max2 = 0
wpsim = 0
for ni in args1:
synsetni = lesk(sent_tok1, ni) #pos
# print(synsetni, ni)
synsetnj = lesk(sent_tok2, args2[0])
# print(synsetnj, args2[0])
# if synsetni != None and synsetnj != None:
if not (synsetnj is None or synsetni is None):
max1 = synsetni.wup_similarity(synsetnj) #wp_sim(ni, args2[0])
# print(type(max1))
if max1 is None:
max1 = 0
for nj in args2[1:]:
synsetnj = lesk(sent_tok2, nj)
# print(synsetni, ni)
# print(synsetnj, nj)
# if synsetni != None and synsetnj != None:
if not (synsetnj is None or synsetni is None):
wpsim = synsetni.wup_similarity(synsetnj) #(ni, nj)
if wpsim is None:
wpsim = 0
if (not None in [wpsim, max1]) and wpsim > max1:
max1 = wpsim
# print(wpsim, max1)
sum1 += max1
# print(sum1)
for ni in args2:
synsetni = lesk(sent_tok2, ni)
synsetnj = lesk(sent_tok1, args1[0])
if not (synsetnj is None or synsetni is None):
max2 = synsetni.wup_similarity(synsetnj) #wp_sim(ni, args2[0])
if max2 is None:
max2 = 0
for nj in args1[1:]:
synsetnj = lesk(sent_tok1, nj)
if not (synsetnj is None or synsetni is None):
wpsim = synsetni.wup_similarity(synsetnj) #(ni, nj)
if wpsim is None:
wpsim = 0
if (not None in [wpsim, max2]) and wpsim > max2:
# if (wpsim is not None) and wpsim > max2:
max2 = wpsim
sum2 += max2
# print(len(args1))
# print(len(args2))
# print(sum1, sum2)
return 0.5*( (1/len(args1))*sum1 + (1/len(args2))*sum2 )
except Exception:
return 0.2
def FRel(s1, s2):
return alpha*FRelPred(s1, s2) + (1-alpha)*FRelArgs(s1, s2)
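# Greedy clustering of the retrieved sentences: keep cross-story pairs with FRel
# above the threshold R, remember the highest-scoring pair, then try to extend
# pairs with a third sentence from yet another story. Returns the best pair and
# the best triple found, or [] when no pair clears the threshold.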
def hac(foundSentences, length=2):
R=0.1
twoSentenceClusters = []
numfound = len(foundSentences)
sentencePairs = []
frelijs = []
ind = 0
maxind = 0
maxval = 0
for i in itertools.permutations(foundSentences, 2):
if i[0].textid != i[1].textid:
frelij = FRel(i[0], i[1])
if frelij > R:
sentencePairs.append(list(i))
frelijs.append(frelij)
if ind != 0:
if frelij > maxval:
maxval = frelij
maxind = ind
ind += 1
else:
ind=1
maxval = frelij
maxind = 0
# print(sentencePairs)
maxvalThree = 0
maxSentsThree = []
threeSentsCluster = set()
for pairind, pair in enumerate(sentencePairs):
for sent in foundSentences:
if sent.textid != pair[0].textid and sent.textid != pair[1].textid:
frelij = FRel(sent, pair[0])
if frelij > R:
threeSentsCluster.add(tuple([sent]+pair))
current = (frelijs[maxind]+frelij)/2
if current > maxvalThree:
maxvalThree = current
maxSentsThree = [sent]+pair
frelij = FRel(pair[1], sent)
if frelij > R:
threeSentsCluster.add(tuple(pair+[sent]))
current = (frelijs[maxind]+frelij)/2
if current > maxvalThree:
maxvalThree = current
maxSentsThree = pair+[sent]
# print(sentencePairs)
# print(threeSentsCluster)
# for pair in sentencePairs:
# print(pair[0].sentence, pair[1].sentence)
# for cluster in threeSentsCluster:
# print(cluster[0].sentence, cluster[1].sentence, cluster[2].sentence)
# print([sentencePairs[maxind],maxSentsThree])
if len(sentencePairs) >=1:
return [sentencePairs[maxind],maxSentsThree]#sentencePairs + list(threeSentsCluster)
else:
return []
# print(FRelPred('David noticed he had put on a lot of weight recently.',
# 'He examined his habits to try and figure out the reason.'))
# # 'After a few weeks, he started to feel much better.'))
# print(FRel('David noticed he had put on a lot of weight recently.',
# 'He examined his habits to try and figure out the reason.'))
# print(FRel('He decided to buy a pair of khakis.', 'The pair he bought fit him perfectly.'))
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
import gensim.downloader
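# Optional tf-idf / word2vec sentence-vector pipeline. It is left disabled below
# because it requires downloading the large word2vec-google-news-300 model;
# uncomment the vectorizer, model and set* calls to enable it.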
def tfidfTokenizer(x):
return [w for words in [s.lemmatized for s in x] for w in words]
def preprocess(x):
return x
# tfidfvectorizer = TfidfVectorizer(tokenizer=tfidfTokenizer, preprocessor=preprocess, use_idf=True)
# tfidfvectorizer_vectors = tfidfvectorizer.fit_transform(storiesSentences) #(map(lambda x: [ s.lemmatized for s in np.array(x).flatten() ] ))
# wvModel = gensim.downloader.load("word2vec-google-news-300")
# feature_names = tfidfvectorizer.get_feature_names()
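# Attach a 300-d word2vec vector to every lemma of every sentence; lemmas missing
# from the model get a zero vector.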
def setVectors(stories):
for doc in stories:
for sentence in doc:
vectors = []
for lemma in sentence.lemmatized:
try:
vectors.append(wvModel[lemma])
except KeyError:
# out-of-vocabulary lemma: fall back to a zero vector
vectors.append([0]*300)
sentence.vectors = vectors
# setVectors(storiesSentences)
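# Attach the per-document tf-idf weight of each lemma to its sentence and compute
# the weighted sentence vector (dot product of weights and word vectors).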
def setTfIdfs(documents):
for i, doc in enumerate(documents):
feature = tfidfvectorizer_vectors[i,:].nonzero()[1]
tfidfs = zip(feature, [tfidfvectorizer_vectors[i, x] for x in feature])
tfidfsbyword = dict()
for w,s in [(feature_names[j], s) for (j, s) in tfidfs]:
tfidfsbyword[w] = s
for sent in doc:
tfidfs = []
for lemma in sent.lemmatized:
tfidfs.append(tfidfsbyword[lemma])
sent.tfidfs = tfidfs
sent.calculateVector()
# setTfIdfs(storiesSentences)
# m = searchByRequest(['wake', 'present', 'Christmas'])
# for sent in m:
# print(sent.lemmatized)
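# Retrieval + clustering pipeline (search the index, then cluster with hac);
# not exposed through the Gradio interface below.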
def generate(words):
m = searchByRequest(words)
return hac(m)
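# Beam search decoding: generate three beam candidates (num_beams=3, no repeated
# 2-grams), score each by the average FRel coherence of adjacent sentences, and
# return the most coherent candidate.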
def generate_and_choose(input_text):
input_ids=tokenizer.encode(input_text,return_tensors='tf')
beam_outputs=model.generate(input_ids,max_length=100,num_return_sequences=3,num_beams=3,no_repeat_ngram_size=2,early_stopping=True)
return_list = []
for i, beam_output in enumerate(beam_outputs):
# print(beam_output)
return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True))
outputs_coherences = []
for i, text in enumerate(return_list):
sentencesTokenized = nltk.sent_tokenize(text)
# FRel expects Sentence objects (it reads .preds/.args); raw strings would only
# hit the 0.2 fallback, so wrap them first (textid -1 is a dummy id)
sentenceObjects = [Sentence(-1, k, s) for k, s in enumerate(sentencesTokenized)]
coherence_cur = 0
length = len(sentencesTokenized)
for s in range(length-1):
coherence_cur += FRel(sentenceObjects[s], sentenceObjects[s+1])
if length == 1:
length += 1
outputs_coherences.append(coherence_cur / (length-1))
index_of_max = outputs_coherences.index(max(outputs_coherences))
return return_list[index_of_max]
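# Greedy decoding that uses the period token as both pad and EOS, so generation
# stops at the first generated '.'.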
def greedy_generate(inp):
input_ids = tokenizer.encode(inp, return_tensors='tf')
greedy_output = model.generate(input_ids, pad_token_id=tokenizer.encode('.')[0], eos_token_id=tokenizer.encode('.')[0])
return tokenizer.decode(greedy_output[0], skip_special_tokens=True)
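# Sampling-based decoding variants (fixed seed for reproducibility):
# - with_sampling: pure sampling with temperature 0.7,
# - with_top_k_sampling: top-k sampling (k = 50),
# - with_nucleus_sampling: top-k plus nucleus sampling (p = 0.95), three candidates
#   re-ranked by the same FRel coherence score as the beam-search path.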
def with_sampling(input_ids):
tf.random.set_seed(0)
# activate sampling and deactivate top_k by setting top_k sampling to 0
sample_output = model.generate(
input_ids,
do_sample=True,
max_length=50,
top_k=0,
temperature=0.7)
return tokenizer.decode(sample_output[0], skip_special_tokens=True)
def with_top_k_sampling(input_ids):
tf.random.set_seed(0)
sample_output = model.generate(
input_ids,
do_sample=True,
max_length=50,
top_k=50)
return tokenizer.decode(sample_output[0], skip_special_tokens=True)
def with_nucleus_sampling(input_ids):
tf.random.set_seed(0)
# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
input_ids,
do_sample=True,
max_length=50,
top_k=50,
top_p=0.95,
num_return_sequences=3)
return_list = []
for i, beam_output in enumerate(sample_outputs):
# print(beam_output)
return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True))
outputs_coherences = []
for i, text in enumerate(return_list):
sentencesTokenized = nltk.sent_tokenize(text)
# FRel expects Sentence objects (it reads .preds/.args); raw strings would only
# hit the 0.2 fallback, so wrap them first (textid -1 is a dummy id)
sentenceObjects = [Sentence(-1, k, s) for k, s in enumerate(sentencesTokenized)]
coherence_cur = 0
length = len(sentencesTokenized)
for s in range(length-1):
coherence_cur += FRel(sentenceObjects[s], sentenceObjects[s+1])
if length == 1:
length += 1
outputs_coherences.append(coherence_cur / (length-1))
index_of_max = outputs_coherences.index(max(outputs_coherences))
return return_list[index_of_max]
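# Dispatch the dropdown choice to the matching decoding function; the Gradio
# interface below passes (decoding_algorithm, input_text).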
def generation_method(decoding_algorithm,input_text):
input_ids=tokenizer.encode(input_text,return_tensors='tf')
if decoding_algorithm=="Beam search":
return generate_and_choose(input_text)
elif decoding_algorithm=="Greedy search":
return greedy_generate(input_text)
elif decoding_algorithm=="With sampling":
return with_sampling(input_ids)
elif decoding_algorithm=="With top k sampling":
return with_top_k_sampling(input_ids)
elif decoding_algorithm=="With nucleus sampling":
return with_nucleus_sampling(input_ids)
in1 = gr.inputs.Dropdown(choices=["Beam search", "Greedy search", "With sampling", "With top k sampling", "With nucleus sampling"])
in2 = gr.inputs.Textbox()
iface = gr.Interface(fn=generation_method,
inputs=[in1, in2],
outputs="text")
iface.launch(debug=True)