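"""Coherence-aware story generation demo.

Builds a word-level inverted index over (a slice of) the ROCStories corpus,
scores sentence-to-sentence relatedness (FRel) from predicate co-occurrence
statistics and WordNet-based argument similarity, generates text with GPT-2
(TensorFlow) under several decoding strategies, and serves the result
through a Gradio interface.
"""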
import itertools
import json
import math
import warnings
from collections import defaultdict

import gradio as gr
import nltk
import numpy as np
import pandas as pd
import stanza
import tensorflow as tf
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.wsd import lesk
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

warnings.filterwarnings('ignore')

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# use the EOS token as the PAD token to avoid warnings
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

stanza.download('en')
nltk.download('punkt')
nltk.download('wordnet')

savejson = True
indexing = True

nlp = stanza.Pipeline()
lemmatizer = WordNetLemmatizer()
stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')
# Factory for the inner level of nested defaultdicts (defaultdict requires a callable).
def defdict():
    return defaultdict(list)
class Sentence:
    def __init__(self, textid, sentencenum, sentence, vectors=None, tfidfs=None, sentenceVector=None, lemmatized=None, preds=None, args=None):
        self.textid = textid
        self.sentence = sentence
        self.sentencenum = sentencenum
        self.vectors = vectors
        self.tfidfs = tfidfs
        self.sentenceVector = sentenceVector
        self.preds = preds
        self.args = args
        if lemmatized:
            self.lemmatized = lemmatized
        else:
            self.lemmatize()
    
    def lemmatize(self):
        doc = nlp(self.sentence)
        self.lemmatized = []
        self.preds = []
        ind = 0
        self.args = []
        # Collect lemmas of all non-punctuation tokens; treat the lemma of
        # the dependency root as the sentence's predicate.
        for i, dep_edge in enumerate(doc.sentences[0].dependencies):
            if dep_edge[1] != 'punct':
                self.lemmatized.append(dep_edge[2].lemma)
            if dep_edge[1] == "root":
                self.preds.append(dep_edge[2].lemma)
                ind = i + 1  # stanza word ids are 1-based
        # The predicate's arguments are the non-punctuation words headed by the root.
        for dep_edge in doc.sentences[0].dependencies:
            if int(dep_edge[2].head) == ind and dep_edge[1] != 'punct':
                self.args.append(lemmatizer.lemmatize(dep_edge[2].lemma.lower()))
        self.doc = doc

    def calculateVector(self):
        if self.vectors and self.tfidfs:
            self.sentenceVector = np.dot(self.tfidfs, self.vectors)
        return self.sentenceVector
    
    def getVector(self):
        if self.sentenceVector is None:
            self.calculateVector()
        return self.sentenceVector
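
# A minimal usage sketch; the exact lemmas depend on the loaded stanza model,
# so the values shown are illustrative, not guaranteed:
#   s = Sentence(0, 0, "David kicked the ball.")
#   s.lemmatized  # e.g. ['David', 'kick', 'the', 'ball']
#   s.preds       # e.g. ['kick']  (lemma of the dependency root)
#   s.args        # e.g. ['david', 'ball']  (lemmatized dependents of the root)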

class Story:
    def __init__(self, sentences, number):
        self.sentences = sentences
        self.number = number
    def lemmatizedSents(self):
        lemSents = []
        for s in self.sentences:
            lemSents.append(s.lemmatized)
        return lemSents
storiesSentences = []
sentencesjsons = []
def indexSents(sents):
    ind = defaultdict(defdict)
    for sc in sents:
        for i, w in enumerate(sc.lemmatized):
            ind[w][sc.textid].append((i, sc.sentencenum))
    return ind
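# The inverted index maps
#   lemma -> {textid -> [(position_in_sentence, sentence_number), ...]};
# e.g., for a hypothetical corpus, index['play'][12] == [(2, 0)] would mean
# 'play' is the third token of the first sentence of story 12.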
def indexCorpus():
    sentences = []
    # Columns 0-1 of each ROCStories row are the story id and title; the
    # remaining five columns are the story's sentences.
    for i, story in stories[:300].iterrows():
        storiesSentences.append([])
        print(i)  # progress indicator
        for sind, sent in enumerate(story[2:]):
            sentence = Sentence(i, sind, sent)
            storiesSentences[i].append(sentence)
            sentences.append(sentence)
            sentencesjsons.append(sentence.__dict__)
    return indexSents(sentences)
if savejson:
  index = indexCorpus()
  #json.dump(sentencesjsons, open('filename.json', 'a'))
else:
  sentencesjsons = json.load(open('filename.json'))
# if indexing:
#   json.dump(index, open('index.json', 'w'))
# else:
#   index = json.load(open('index.json'))

def searchByRequest(words):
    # Expand each query word with its WordNet synonyms.
    synonyms = []
    for i, w in enumerate(words):
        synonyms.append(set())
        synonyms[i].update([w])
        for synset in wordnet.synsets(w):
            synonyms[i].update(synset.lemma_names())
    # For each query word, gather the index entries of all its synonyms and
    # the set of stories in which any of them occurs.
    dictsForWords = []
    storiesForWords = []
    for i, w in enumerate(synonyms):
        dictsForWords.append([])
        storiesForWords.append(set())
        for synonym in w:
            currentDict = index[synonym]
            if currentDict:
                dictsForWords[i].append(currentDict)
                storiesForWords[i].update(set(currentDict.keys()))
    # Keep only the stories that contain every query word (or a synonym of it).
    paragraphs = set.intersection(*storiesForWords)
    sentencesClasses = set()
    for pi, p in enumerate(paragraphs):
        temporarySentences = []
        for wi, wordDictsList in enumerate(dictsForWords):
            # Sentence numbers within story p that contain query word wi.
            temporarySentences.append(set())
            for dictionary in wordDictsList:
                if p in dictionary:
                    for sents in dictionary[p]:
                        temporarySentences[wi].update([sents[1]])
            # Collect sentences that contain at least two of the query words.
            if wi > 0 and len(words) > 1:
                for i in range(wi):
                    for s in temporarySentences[wi]:
                        if s in temporarySentences[i]:
                            sentencesClasses.update([storiesSentences[p][s]])
    return sentencesClasses
# m = searchByRequest(['play', 'fun', 'game'])
# m = searchByRequest(['present', 'Christmas', 'wake'])
def predIndex():
    # Map predicate lemma -> {story index -> [sentence numbers]}.
    ind = defaultdict(defdict)
    for i, story in enumerate(storiesSentences):
        for j, sent in enumerate(story):
            for s in sent.preds:
                ind[s][i].append(j)
    return ind
preds = predIndex()
def powC(subj):
    # Total number of occurrences of predicate `subj` across the corpus.
    c = 0
    for k, v in preds[subj].items():
        c += len(v)
    return c
def powCons(s1, s2):
    # Count, over all stories containing both predicates, how often a
    # sentence with predicate s1 is immediately followed by a sentence with
    # predicate s2 (two-pointer walk over the sorted sentence-number lists).
    count = 0
    for i in (preds[s1].keys() & preds[s2].keys()):
        i1 = 0
        i2 = 0
        while i1 != len(preds[s1][i]) and i2 != len(preds[s2][i]):
            if preds[s1][i][i1] + 1 == preds[s2][i][i2]:
                count += 1
                i1 += 1
                i2 += 1
            elif preds[s1][i][i1] + 1 < preds[s2][i][i2]:
                i1 += 1
            else:
                i2 += 1
    return count
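# A worked example with assumed index contents: if preds['decide'][7] == [0, 3]
# and preds['make'][7] == [1, 2], the walk matches 0 -> 1 (consecutive
# sentences) but not 3 -> 2, so story 7 contributes 1 to
# powCons('decide', 'make').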

# print(powCons('decide', 'make'))
# print(powCons('know', 'buy'))
def synset_lesk(sent, word):
    sent_tok = nltk.tokenize.word_tokenize(sent)
    return lesk(sent_tok, word)
# comparison of word sense disambiguation approaches
def wpsim_lesk(word1, sent1, word2, sent2):
    # Disambiguate both words with Lesk, then compare their synsets with
    # Wu-Palmer similarity; returns None if either word cannot be disambiguated.
    synset1 = lesk(sent1, word1)
    synset2 = lesk(sent2, word2)
    if synset1 is None or synset2 is None:
        return None
    return synset1.wup_similarity(synset2)
def args_of_pred(s):
    return s.args
alpha = 0.5  # weight of predicate relatedness vs. argument relatedness in FRel
def FRelPred(sent1, sent2):
  try:
    p1 = sent1.preds[0]
    p2 = sent2.preds[0]
    if powCons(p1, p2) == 0:
        return 0.0
    # PMI-style relatedness of the two predicates.
    return math.log2(powCons(p1, p2) / (powC(p1) * powC(p2)))
  except (IndexError, AttributeError):
    # Fall back to a small default when a sentence has no detected predicate.
    return 0.2
def FRelArgs(s1, s2):
  try:
    args1 = args_of_pred(s1)
    args2 = args_of_pred(s2)
    sent_tok1 = s1.lemmatized
    sent_tok2 = s2.lemmatized
    sum1 = 0
    sum2 = 0
    # For every argument of s1's predicate, find its best Wu-Palmer similarity
    # to any argument of s2's predicate (max1 is reset per argument so a
    # previous argument's score cannot carry over), and vice versa.
    for ni in args1:
        synsetni = lesk(sent_tok1, ni)
        max1 = 0
        for nj in args2:
            synsetnj = lesk(sent_tok2, nj)
            if synsetni is not None and synsetnj is not None:
                wpsim = synsetni.wup_similarity(synsetnj)
                if wpsim is not None and wpsim > max1:
                    max1 = wpsim
        sum1 += max1
    for ni in args2:
        synsetni = lesk(sent_tok2, ni)
        max2 = 0
        for nj in args1:
            synsetnj = lesk(sent_tok1, nj)
            if synsetni is not None and synsetnj is not None:
                wpsim = synsetni.wup_similarity(synsetnj)
                if wpsim is not None and wpsim > max2:
                    max2 = wpsim
        sum2 += max2
    return 0.5 * ((1 / len(args1)) * sum1 + (1 / len(args2)) * sum2)
  except (AttributeError, ZeroDivisionError):
    # Fall back to a small default when either sentence has no arguments.
    return 0.2
def FRel(s1, s2):
    return alpha*FRelPred(s1, s2) + (1-alpha)*FRelArgs(s1, s2)
def hac(foundSentences, length=2):
    R = 0.1
    sentencePairs = []
    frelijs = []
    maxind = 0
    maxval = 0
    # Score every ordered pair of sentences from different stories and keep
    # the pairs whose relatedness exceeds the threshold R, tracking the
    # index of the best-scoring pair.
    for pair in itertools.permutations(foundSentences, 2):
        if pair[0].textid != pair[1].textid:
            frelij = FRel(pair[0], pair[1])
            if frelij > R:
                if frelij > maxval:
                    maxval = frelij
                    maxind = len(sentencePairs)
                sentencePairs.append(list(pair))
                frelijs.append(frelij)

    # Try to extend each retained pair with a third sentence from yet another
    # story, keeping the extension with the best average score.
    maxvalThree = 0
    maxSentsThree = []
    threeSentsCluster = set()
    for pairind, pair in enumerate(sentencePairs):
        for sent in foundSentences:
            if sent.textid != pair[0].textid and sent.textid != pair[1].textid:
                frelij = FRel(sent, pair[0])
                if frelij > R:
                    threeSentsCluster.add(tuple([sent] + pair))
                    current = (frelijs[pairind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = [sent] + pair
                frelij = FRel(pair[1], sent)
                if frelij > R:
                    threeSentsCluster.add(tuple(pair + [sent]))
                    current = (frelijs[pairind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = pair + [sent]
    if len(sentencePairs) >= 1:
        return [sentencePairs[maxind], maxSentsThree]
    return []
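# Sketch of the intended call chain (assumes the index built above; results
# depend on the loaded corpus slice):
#   found = searchByRequest(['wake', 'present', 'Christmas'])
#   clusters = hac(found)  # [best pair, best three-sentence chain], or [] if none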
# Example (FRelPred/FRel operate on Sentence objects, not raw strings):
# s1 = Sentence(-1, 0, 'David noticed he had put on a lot of weight recently.')
# s2 = Sentence(-1, 1, 'He examined his habits to try and figure out the reason.')
# print(FRelPred(s1, s2))
# print(FRel(s1, s2))
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
import gensim.downloader
def tfidfTokenizer(x):
    # Each "document" is a list of Sentence objects; flatten it to its lemmas.
    return [w for words in [s.lemmatized for s in x] for w in words]
def preprocess(x):
    # Identity preprocessor: the documents are already structured objects.
    return x
# tfidfvectorizer = TfidfVectorizer(tokenizer=tfidfTokenizer, preprocessor=preprocess, use_idf=True)
# tfidfvectorizer_vectors = tfidfvectorizer.fit_transform(storiesSentences) #(map(lambda x: [ s.lemmatized for s in np.array(x).flatten() ] ))
# wvModel = gensim.downloader.load("word2vec-google-news-300")
# feature_names = tfidfvectorizer.get_feature_names()
def setVectors(stories):
    # Attach a word2vec vector to every lemma; OOV lemmas get a zero vector.
    for doc in stories:
        for sentence in doc:
            vectors = []
            for lemma in sentence.lemmatized:
                try:
                    vectors.append(wvModel[lemma])
                except KeyError:
                    vectors.append([0] * 300)
            sentence.vectors = vectors
# setVectors(storiesSentences)
def setTfIdfs(documents):
    for i, doc in enumerate(documents):
        # Recover the tf-idf weight of every feature present in document i.
        feature = tfidfvectorizer_vectors[i, :].nonzero()[1]
        tfidfs = zip(feature, [tfidfvectorizer_vectors[i, x] for x in feature])
        tfidfsbyword = dict()
        for w, s in [(feature_names[j], s) for (j, s) in tfidfs]:
            tfidfsbyword[w] = s
        for sent in doc:
            # Lemmas the vectorizer filtered out get weight 0.
            sent.tfidfs = [tfidfsbyword.get(lemma, 0.0) for lemma in sent.lemmatized]
            sent.calculateVector()
# setTfIdfs(storiesSentences)
# m = searchByRequest(['wake', 'present', 'Christmas'])
# for sent in m:
#     print(sent.lemmatized)
def generate(words):
    m = searchByRequest(words)
    return hac(m)
    
def choose_most_coherent(texts):
  # Rerank candidate generations by the average FRel coherence of adjacent
  # sentences. FRel expects Sentence objects, so each tokenized sentence is
  # wrapped in one (textid -1 marks a transient sentence outside the corpus).
  outputs_coherences = []
  for text in texts:
    sentencesTokenized = nltk.sent_tokenize(text)
    length = len(sentencesTokenized)
    if length < 2:
      outputs_coherences.append(0.0)
      continue
    sentenceObjects = [Sentence(-1, n, s) for n, s in enumerate(sentencesTokenized)]
    coherence_cur = 0
    for s in range(length - 1):
      coherence_cur += FRel(sentenceObjects[s], sentenceObjects[s + 1])
    outputs_coherences.append(coherence_cur / (length - 1))
  return texts[outputs_coherences.index(max(outputs_coherences))]

def generate_and_choose(input_text):
  input_ids = tokenizer.encode(input_text, return_tensors='tf')
  beam_outputs = model.generate(input_ids, max_length=100, num_return_sequences=3,
                                num_beams=3, no_repeat_ngram_size=2, early_stopping=True)
  return_list = [tokenizer.decode(o, skip_special_tokens=True,
                                  clean_up_tokenization_spaces=True) for o in beam_outputs]
  return choose_most_coherent(return_list)

def greedy_generate(inp):
  input_ids = tokenizer.encode(inp, return_tensors='tf')
  # Greedy decoding; use the '.' token as both pad and end-of-sequence token
  # so generation stops at the end of a sentence.
  greedy_output = model.generate(input_ids, pad_token_id=tokenizer.encode('.')[0], eos_token_id=tokenizer.encode('.')[0])
  return tokenizer.decode(greedy_output[0], skip_special_tokens=True)

def with_sampling(input_ids):
  tf.random.set_seed(0)
  # activate sampling and deactivate top-k filtering by setting top_k to 0
  sample_output = model.generate(
    input_ids,
    do_sample=True,
    max_length=50,
    top_k=0,
    temperature=0.7)
  return tokenizer.decode(sample_output[0], skip_special_tokens=True)

def with_top_k_sampling(input_ids):
  tf.random.set_seed(0)
  # sample only from the 50 most likely next tokens
  sample_output = model.generate(
    input_ids,
    do_sample=True,
    max_length=50,
    top_k=50)
  return tokenizer.decode(sample_output[0], skip_special_tokens=True)

def with_nucleus_sampling(input_ids):
  tf.random.set_seed(0)
  # sample with top_k = 50, top_p = 0.95 and num_return_sequences = 3
  sample_outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=50,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3)
  return_list = [tokenizer.decode(o, skip_special_tokens=True,
                                  clean_up_tokenization_spaces=True) for o in sample_outputs]
  return choose_most_coherent(return_list)
  
def generation_method(decoding_algorithm, input_text):
  input_ids = tokenizer.encode(input_text, return_tensors='tf')
  if decoding_algorithm == "Beam search":
    return generate_and_choose(input_text)
  elif decoding_algorithm == "Greedy search":
    return greedy_generate(input_text)
  elif decoding_algorithm == "With sampling":
    return with_sampling(input_ids)
  elif decoding_algorithm == "With top k sampling":
    return with_top_k_sampling(input_ids)
  elif decoding_algorithm == "With nucleus sampling":
    return with_nucleus_sampling(input_ids)
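# Example call, mirroring what the Gradio interface below does (the prompt is
# illustrative):
#   generation_method("Greedy search", "David noticed he had put on a lot of weight recently.")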
 
in1 = gr.inputs.Dropdown(choices=["Beam search", "Greedy search", "With sampling", "With top k sampling", "With nucleus sampling"])
in2 = gr.inputs.Textbox()
iface = gr.Interface(fn=generation_method,
                     inputs=[in1, in2],
                     outputs="text")
iface.launch(debug=True)