mariamisoieva committed
Commit ef89d5e
1 Parent(s): dcc1589

Create app.py

Files changed (1): app.py (+496, -0)
app.py ADDED
@@ -0,0 +1,496 @@
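# app.py: GPT-2 (TensorFlow) text generation with a coherence-based reranker.
# The script indexes the first 300 ROCStories by lemma using stanza dependency parses,
# defines the FRel relatedness measure (root-predicate statistics plus WordNet/Lesk
# argument similarity), and serves several decoding strategies through a Gradio interface.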
import collections
import itertools
import json
import warnings
from collections import defaultdict

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import stanza
import tensorflow as tf
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.wsd import lesk
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

warnings.filterwarnings('ignore')

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# add the EOS token as PAD token to avoid warnings
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

stanza.download('en')
nltk.download('punkt')
nltk.download('wordnet')

savejson = True
indexing = True

nlp = stanza.Pipeline()
lemmatizer = WordNetLemmatizer()
stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')
def defdict():
    return defaultdict(list)

class Sentence:
    def __init__(self, textid, sentencenum, sentence, vectors=None, tfidfs=None, sentenceVector=None, lemmatized=None, preds=None, args=None):
        self.textid = textid
        self.sentence = sentence
        self.sentencenum = sentencenum
        self.vectors = vectors
        self.tfidfs = tfidfs
        self.sentenceVector = sentenceVector
        self.preds = preds
        self.args = args
        if lemmatized:
            self.lemmatized = lemmatized
        else:
            self.lemmatize()

    def lemmatize(self):
        # Parse the sentence with stanza and collect:
        #   lemmatized - all non-punctuation lemmas,
        #   preds      - the lemma of the dependency root (the main predicate),
        #   args       - lemmas of the words directly governed by the root.
        doc = nlp(self.sentence)
        self.lemmatized = []
        self.preds = []
        ind = 0
        self.args = []
        for i, dep_edge in enumerate(doc.sentences[0].dependencies):
            if dep_edge[1] != 'punct':
                self.lemmatized.append(dep_edge[2].lemma)
            if dep_edge[1] == "root":
                self.preds.append(dep_edge[2].lemma)
                ind = i + 1
        for dep_edge in doc.sentences[0].dependencies:
            if int(dep_edge[2].head) == ind and dep_edge[1] != 'punct':  # dep_edge[1] not in x
                self.args.append(lemmatizer.lemmatize(dep_edge[2].lemma.lower()))
        self.doc = doc

    def calculateVector(self):
        if self.vectors and self.tfidfs:
            self.sentenceVector = np.dot(self.tfidfs, self.vectors)
        return self.sentenceVector

    def getVector(self):
        if self.sentenceVector is None:
            self.calculateVector()
        return self.sentenceVector

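# Illustrative example (values are approximate and depend on the stanza parse):
#   s = Sentence(0, 0, "He decided to buy a pair of khakis.")
#   s.preds -> ['decide']        (lemma of the dependency root)
#   s.args  -> ['he', 'buy']     (lemmas whose head is the root, punctuation excluded)
#   s.lemmatized -> all non-punctuation lemmas of the sentence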
class Story:
    def __init__(self, sentences, number):
        self.sentences = sentences
        self.number = number

    def lemmatizedSents(self):
        lemSents = []
        for s in self.sentences:
            lemSents.append(s.lemmatized)
        return lemSents

storiesSentences = []
sentencesjsons = []

def indexSents(sents):
    ind = defaultdict(defdict)
    for sc in sents:
        for i, w in enumerate(sc.lemmatized):
            ind[w][sc.textid].append((i, sc.sentencenum))
    return ind

def indexCorpus():
    sentences = []
    # textid, sentencenum, sentence
    for i, story in stories[:300].iterrows():
        storiesSentences.append([])
        # document = ""
        print(i)
        for sind, sent in enumerate(story[2:], start=1):
            sentence = Sentence(i, sind - 1, sent)
            # print(sent)
            # print(i)
            # print(sentence.sentencenum)
            # document.join(sent)
            storiesSentences[i].append(sentence)
            sentences.append(sentence)
            sentencesjsons.append(sentence.__dict__)
        # storiesClasses.append(Story(storiesSentences[i],i))
        # documents.append(document)
    return indexSents(sentences)

if savejson:
    index = indexCorpus()
    # json.dump(sentencesjsons, open('filename.json', 'a'))
else:
    sentencesjsons = json.load(open('filename.json'))
# if indexing:
#     json.dump(index, open('index.json', 'w'))
# else:
#     index = json.load(open('index.json'))

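# The index maps lemma -> story id -> list of (token position, sentence number) pairs,
# e.g. index['play'][12] -> [(2, 0), (4, 3)] (illustrative values).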
def searchByRequest(words):
    sents = set()
    dicts = []
    keys = []  # story numbers
    synonyms = []
    for i, w in enumerate(words):
        synonyms.append(set())
        synonyms[i].update([w])
        for synset in wordnet.synsets(w):
            synonyms[i].update(synset.lemma_names())
    # print(synonyms)
    stories = []
    dictsForWords = []
    storiesForWords = []
    for i, w in enumerate(synonyms):
        dictsForWords.append([])
        storiesForWords.append(set())
        for synonym in w:
            currentDict = index[synonym]
            if currentDict:
                dictsForWords[i].append(currentDict)
                storiesForWords[i].update(set(currentDict.keys()))
    paragraphs = set.intersection(*storiesForWords)
    # print(paragraphs)
    # print(dictsForWords)
    # print(dicts)
    sentencesClasses = set()
    temporarySentencesByParagraphs = [[set()] * len(words)] * len(paragraphs)
    for pi, p in enumerate(paragraphs):
        temporarySentences = []
        for wi, wordDictsList in enumerate(dictsForWords):
            temporarySentences.append(set())
            # print(wordDictsList)
            for dictionary in wordDictsList:
                if p in dictionary:
                    for entry in dictionary[p]:
                        # print(entry)
                        temporarySentences[wi].update([entry[1]])
            # print(temporarySentences[wi])
            # print(temporarySentences)
            if wi > 0 and len(words) > 1:
                for i in range(wi):
                    for s in temporarySentences[wi]:
                        if s in temporarySentences[i]:
                            sentencesClasses.update([storiesSentences[p][s]])
    # for sentence in sentencesClasses:
    #     print(sentence.lemmatized)
    #     print(sentence.sentence, sentence.textid, sentence.sentencenum)
    return sentencesClasses

# m = searchByRequest(['play', 'fun', 'game'])
# m = searchByRequest(['present', 'Christmas', 'wake'])
def predIndex():
    # stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')
    ind = defaultdict(defdict)
    for i, story in enumerate(storiesSentences):
        for j, sent in enumerate(story):
            for s in sent.preds:
                ind[s][i].append(j)
    return ind

preds = predIndex()

def powC(subj):
    c = 0
    for k, v in preds[subj].items():
        c += len(v)
    return c

def powCons(s1, s2):
    count = 0
    for i in (preds[s1].keys() & preds[s2].keys()):
        i1 = 0
        i2 = 0
        while i1 != len(preds[s1][i]) and i2 != len(preds[s2][i]):  # for d in preds[s1][i]:
            if preds[s1][i][i1] + 1 == preds[s2][i][i2]:
                count += 1
                i1 += 1
                i2 += 1
            elif preds[s1][i][i1] + 1 < preds[s2][i][i2]:
                i1 += 1
            else:
                i2 += 1
    return count

# print(powCons('decide', 'make'))
# print(powCons('know', 'buy'))
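# powC(p) counts how many indexed sentences have predicate p as their root;
# powCons(p1, p2) counts how often a p1-rooted sentence is immediately followed
# by a p2-rooted sentence within the same story.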
def synset_lesk(sent, word):
    sent_tok = nltk.tokenize.word_tokenize(sent)
    return lesk(sent_tok, word)  # ,pos

# comparison of wsd
# def wpsim():
# def wpsim_by_max():
def wpsim_lesk(word1, sent1, word2, sent2):
    synset1 = lesk(sent1, word1)
    # print(synset1.definition())
    synset2 = lesk(sent2, word2)
    # print(synset2.definition())
    return synset1.wup_similarity(synset2)

x = ['punct', 'conj']

def args_of_pred(s):
    return s.args

import math

alpha = 0.5

def FRelPred(sent1, sent2):
    try:
        p1 = sent1.preds[0]
        p2 = sent2.preds[0]
        if powCons(p1, p2) == 0:
            return 0.0
        return math.log2(powCons(p1, p2) / (powC(p1) * powC(p2)))
    except:
        # fall back to a small default relatedness when lookup or parsing fails
        return 0.2
def FRelArgs(s1, s2):
    try:
        args1 = args_of_pred(s1)
        args2 = args_of_pred(s2)
        # print(args1, args2)
        sent_tok1 = s1.lemmatized  # nltk.tokenize.word_tokenize(s1)
        sent_tok2 = s2.lemmatized  # nltk.tokenize.word_tokenize(s2)
        # print(sent_tok1, sent_tok2)
        sum1 = 0
        sum2 = 0
        max1 = 0
        max2 = 0
        wpsim = 0
        for ni in args1:
            synsetni = lesk(sent_tok1, ni)  # pos
            # print(synsetni, ni)
            synsetnj = lesk(sent_tok2, args2[0])
            # print(synsetnj, args2[0])
            # if synsetni != None and synsetnj != None:
            if not (synsetnj is None or synsetni is None):
                max1 = synsetni.wup_similarity(synsetnj)  # wp_sim(ni, args2[0])
                # print(type(max1))
            if max1 is None:
                max1 = 0
            for nj in args2[1:]:
                synsetnj = lesk(sent_tok2, nj)
                # print(synsetni, ni)
                # print(synsetnj, nj)
                # if synsetni != None and synsetnj != None:
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj)  # (ni, nj)
                if wpsim is None:
                    wpsim = 0
                if (None not in [wpsim, max1]) and wpsim > max1:
                    max1 = wpsim
                # print(wpsim, max1)
            sum1 += max1
            # print(sum1)

        for ni in args2:
            synsetni = lesk(sent_tok2, ni)
            synsetnj = lesk(sent_tok1, args1[0])
            if not (synsetnj is None or synsetni is None):
                max2 = synsetni.wup_similarity(synsetnj)  # wp_sim(ni, args2[0])
            if max2 is None:
                max2 = 0
            for nj in args1[1:]:
                synsetnj = lesk(sent_tok1, nj)
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj)  # (ni, nj)
                if wpsim is None:
                    wpsim = 0
                if (None not in [wpsim, max2]) and wpsim > max2:
                    # if (wpsim is not None) and wpsim > max2:
                    max2 = wpsim
            sum2 += max2
        # print(len(args1))
        # print(len(args2))
        # print(sum1, sum2)
        return 0.5 * ((1 / len(args1)) * sum1 + (1 / len(args2)) * sum2)
    except:
        return 0.2

def FRel(s1, s2):
    return alpha * FRelPred(s1, s2) + (1 - alpha) * FRelArgs(s1, s2)
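# FRel combines predicate- and argument-level relatedness:
#   FRel(s1, s2) = alpha * FRelPred(s1, s2) + (1 - alpha) * FRelArgs(s1, s2), with alpha = 0.5.
# Both s1 and s2 are expected to be Sentence objects (the terms use .preds, .args and .lemmatized).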
def hac(foundSentences, length=2):
    R = 0.1
    twoSentenceClusters = []
    numfound = len(foundSentences)

    sentencePairs = []
    frelijs = []
    ind = 0
    maxind = 0
    maxval = 0
    for i in itertools.permutations(foundSentences, 2):
        if i[0].textid != i[1].textid:
            frelij = FRel(i[0], i[1])
            if frelij > R:
                sentencePairs.append(list(i))
                frelijs.append(frelij)
                if ind != 0:
                    if frelij > maxval:
                        maxval = frelij
                        maxind = ind
                    ind += 1
                else:
                    ind = 1
                    maxval = frelij
                    maxind = 0

    # print(sentencePairs)
    maxvalThree = 0
    maxSentsThree = []
    threeSentsCluster = set()
    for pairind, pair in enumerate(sentencePairs):
        for sent in foundSentences:
            if sent.textid != pair[0].textid and sent.textid != pair[1].textid:
                frelij = FRel(sent, pair[0])
                if frelij > R:
                    threeSentsCluster.add(tuple([sent] + pair))
                    current = (frelijs[maxind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = [sent] + pair
                frelij = FRel(pair[1], sent)
                if frelij > R:
                    threeSentsCluster.add(tuple(pair + [sent]))
                    current = (frelijs[maxind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = pair + [sent]
    # print(sentencePairs)
    # print(threeSentsCluster)
    # for pair in sentencePairs:
    #     print(pair[0].sentence, pair[1].sentence)
    # for cluster in threeSentsCluster:
    #     print(cluster[0].sentence, cluster[1].sentence, cluster[2].sentence)
    # print([sentencePairs[maxind], maxSentsThree])
    if len(sentencePairs) >= 1:
        return [sentencePairs[maxind], maxSentsThree]  # sentencePairs + list(threeSentsCluster)
    else:
        return []

# print(FRelPred('David noticed he had put on a lot of weight recently.',
#                'He examined his habits to try and figure out the reason.'))
# #              'After a few weeks, he started to feel much better.'))
# print(FRel('David noticed he had put on a lot of weight recently.',
#            'He examined his habits to try and figure out the reason.'))
# print(FRel('He decided to buy a pair of khakis.', 'The pair he bought fit him perfectly.'))
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
import gensim.downloader

def tfidfTokenizer(x):
    return [w for words in [s.lemmatized for s in x] for w in words]

def preprocess(x):
    return x

# tfidfvectorizer = TfidfVectorizer(tokenizer=tfidfTokenizer, preprocessor=preprocess, use_idf=True)
# tfidfvectorizer_vectors = tfidfvectorizer.fit_transform(storiesSentences)  # (map(lambda x: [s.lemmatized for s in np.array(x).flatten()]))
# wvModel = gensim.downloader.load("word2vec-google-news-300")
# feature_names = tfidfvectorizer.get_feature_names()

def setVectors(stories):
    for doc in stories:
        for sentence in doc:
            vectors = []
            for lemma in sentence.lemmatized:
                try:
                    vectors.append(wvModel[lemma])
                except:
                    vectors.append([0] * 300)
            sentence.vectors = vectors
# setVectors(storiesSentences)

def setTfIdfs(documents):
    for i, doc in enumerate(documents):
        feature = tfidfvectorizer_vectors[i, :].nonzero()[1]
        tfidfs = zip(feature, [tfidfvectorizer_vectors[i, x] for x in feature])
        tfidfsbyword = dict()
        for w, s in [(feature_names[j], s) for (j, s) in tfidfs]:
            tfidfsbyword[w] = s
        for sent in doc:
            tfidfs = []
            for lemma in sent.lemmatized:
                tfidfs.append(tfidfsbyword[lemma])
            sent.tfidfs = tfidfs
            sent.calculateVector()
# setTfIdfs(storiesSentences)
# m = searchByRequest(['wake', 'present', 'Christmas'])
# for sent in m:
#     print(sent.lemmatized)
def generate(words):
    m = searchByRequest(words)
    return hac(m)

def generate_and_choose(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    beam_outputs = model.generate(input_ids, max_length=100, num_return_sequences=3, num_beams=3, no_repeat_ngram_size=2, early_stopping=True)
    return_list = []
    for i, beam_output in enumerate(beam_outputs):
        # print(beam_output)
        return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True))
    outputs_coherences = []
    for i, text in enumerate(return_list):
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        for s in range(length - 1):
            # FRel expects Sentence objects, so wrap the raw sentence strings
            coherence_cur += FRel(Sentence(-1, s, sentencesTokenized[s]),
                                  Sentence(-1, s + 1, sentencesTokenized[s + 1]))
        if length == 1:
            length += 1
        outputs_coherences.append(coherence_cur / (length - 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]

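# Coherence reranking (used by generate_and_choose above and with_nucleus_sampling below):
# each candidate continuation is split into sentences with nltk.sent_tokenize and scored by
# the mean FRel of adjacent sentence pairs; the highest-scoring candidate is returned.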
def greedy_generate(inp):
    input_ids = tokenizer.encode(inp, return_tensors='tf')
    greedy_output = model.generate(input_ids, pad_token_id=tokenizer.encode('.')[0], eos_token_id=tokenizer.encode('.')[0])
    return tokenizer.decode(greedy_output[0], skip_special_tokens=True)

def with_sampling(input_ids):
    tf.random.set_seed(0)
    # activate sampling and deactivate top_k by setting top_k sampling to 0
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=0,
        temperature=0.7)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)

def with_top_k_sampling(input_ids):
    tf.random.set_seed(0)
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)

def with_nucleus_sampling(input_ids):
    tf.random.set_seed(0)
    # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
    sample_outputs = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50,
        top_p=0.95,
        num_return_sequences=3)
    return_list = []
    for i, beam_output in enumerate(sample_outputs):
        # print(beam_output)
        return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True))
    outputs_coherences = []
    for i, text in enumerate(return_list):
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        for s in range(length - 1):
            # FRel expects Sentence objects, so wrap the raw sentence strings
            coherence_cur += FRel(Sentence(-1, s, sentencesTokenized[s]),
                                  Sentence(-1, s + 1, sentencesTokenized[s + 1]))
        if length == 1:
            length += 1
        outputs_coherences.append(coherence_cur / (length - 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]

def generation_method(decoding_algorithm, input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    if decoding_algorithm == "Beam search":
        return generate_and_choose(input_text)
    elif decoding_algorithm == "Greedy search":
        return greedy_generate(input_text)
    elif decoding_algorithm == "With sampling":
        return with_sampling(input_ids)
    elif decoding_algorithm == "With top k sampling":
        return with_top_k_sampling(input_ids)
    elif decoding_algorithm == "With nucleus sampling":
        return with_nucleus_sampling(input_ids)

in1 = gr.inputs.Dropdown(choices=["Beam search", "Greedy search", "With sampling", "With top k sampling", "With nucleus sampling"])
in2 = gr.inputs.Textbox()
iface = gr.Interface(fn=generation_method,
                     inputs=[in1, in2],
                     outputs="text")

iface.launch(debug=True)
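# Example use (illustrative): choose "Greedy search" and enter a prompt such as
# "David noticed he had put on a lot of weight recently." to have GPT-2 continue it;
# "Beam search" and "With nucleus sampling" additionally rerank candidates by FRel coherence.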