import gradio as gr
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Use the EOS token as the PAD token to avoid warnings during generation.
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

import stanza
stanza.download('en')

import nltk
nltk.download('punkt')
nltk.download('wordnet')

savejson = True
indexing = True

import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.wsd import lesk
import collections
import itertools
import json
from collections import defaultdict

nlp = stanza.Pipeline()
lemmatizer = WordNetLemmatizer()

stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')

import warnings
warnings.filterwarnings('ignore')


def defdict():
    return defaultdict(list)


class Sentence:
    def __init__(self, textid, sentencenum, sentence, vectors=None, tfidfs=None,
                 sentenceVector=None, lemmatized=None, preds=None, args=None):
        self.textid = textid
        self.sentence = sentence
        self.sentencenum = sentencenum
        self.vectors = vectors
        self.tfidfs = tfidfs
        self.sentenceVector = sentenceVector
        self.preds = preds
        self.args = args
        if lemmatized:
            self.lemmatized = lemmatized
        else:
            self.lemmatize()

    def lemmatize(self):
        # Dependency-parse the sentence, collecting all non-punctuation lemmas,
        # the root predicate, and the direct dependents of the root (its arguments).
        doc = nlp(self.sentence)
        self.lemmatized = []
        self.preds = []
        self.args = []
        ind = 0
        for i, dep_edge in enumerate(doc.sentences[0].dependencies):
            if dep_edge[1] != 'punct':
                self.lemmatized.append(dep_edge[2].lemma)
            if dep_edge[1] == "root":
                self.preds.append(dep_edge[2].lemma)
                ind = i + 1
        for dep_edge in doc.sentences[0].dependencies:
            if int(dep_edge[2].head) == ind and dep_edge[1] != 'punct':  # dep_edge[1] not in x
                self.args.append(lemmatizer.lemmatize(dep_edge[2].lemma.lower()))
        self.doc = doc

    def calculateVector(self):
        if self.vectors and self.tfidfs:
            self.sentenceVector = np.dot(self.tfidfs, self.vectors)
            return self.sentenceVector

    def getVector(self):
        if self.sentenceVector is None:
            self.calculateVector()
        return self.sentenceVector


class Story:
    def __init__(self, sentences, number):
        self.sentences = sentences
        self.number = number

    def lemmatizedSents(self):
        lemSents = []
        for s in self.sentences:
            lemSents.append(s.lemmatized)
        return lemSents


storiesSentences = []
sentencesjsons = []


def indexSents(sents):
    # Inverted index: lemma -> story id -> list of (token position, sentence number).
    ind = defaultdict(defdict)
    for sc in sents:
        for i, w in enumerate(sc.lemmatized):
            ind[w][sc.textid].append((i, sc.sentencenum))
    return ind


def indexCorpus():
    sentences = []
    for i, story in stories[:300].iterrows():
        storiesSentences.append([])
        print(i)
        for sind, sent in enumerate(story[2:], start=1):
            sentence = Sentence(i, sind - 1, sent)
            storiesSentences[i].append(sentence)
            sentences.append(sentence)
            sentencesjsons.append(sentence.__dict__)
        # storiesClasses.append(Story(storiesSentences[i], i))
    return indexSents(sentences)


if savejson:
    index = indexCorpus()
    # json.dump(sentencesjsons, open('filename.json', 'a'))
else:
    sentencesjsons = json.load(open('filename.json'))

# if indexing:
#     json.dump(index, open('index.json', 'w'))
# else:
#     index = json.load(open('index.json'))


def searchByRequest(words):
    # Expand each query word with its WordNet synonyms, then look the expanded
    # sets up in the inverted index.
    synonims = []
    for i, w in enumerate(words):
        synonims.append(set())
        synonims[i].update([w])
        for synset in wordnet.synsets(w):
            synonims[i].update(synset.lemma_names())
    # For each query word, gather the index entries of all its synonyms and
    # the set of stories that contain at least one of them.
    dictsForWords = []
    storiesForWords = []
    for i, w in enumerate(synonims):
        dictsForWords.append([])
        storiesForWords.append(set())
        for synonim in w:
            currentDict = index[synonim]
            if currentDict:
                dictsForWords[i].append(currentDict)
                storiesForWords[i].update(set(currentDict.keys()))

    # Stories that contain every query word (or one of its synonyms).
    paragraphs = set.intersection(*storiesForWords)

    sentencesClasses = set()
    for pi, p in enumerate(paragraphs):
        temporarySentences = []
        for wi, wordDictsList in enumerate(dictsForWords):
            temporarySentences.append(set())
            for dictionary in wordDictsList:
                if p in dictionary:
                    for sents in dictionary[p]:
                        temporarySentences[wi].update([sents[1]])
            # Collect sentences that contain at least two of the query words.
            if wi > 0 and len(words) > 1:
                for i in range(wi):
                    for s in temporarySentences[wi]:
                        if s in temporarySentences[i]:
                            sentencesClasses.update([storiesSentences[p][s]])
    return sentencesClasses

# m = searchByRequest(['play', 'fun', 'game'])
# m = searchByRequest(['present', 'Christmas', 'wake'])


def predIndex():
    # Index of root predicates: predicate lemma -> story id -> sentence numbers.
    ind = defaultdict(defdict)
    for i, story in enumerate(storiesSentences):
        for j, sent in enumerate(story):
            for s in sent.preds:
                ind[s][i].append(j)
    return ind


preds = predIndex()


def powC(subj):
    # Number of occurrences of the predicate `subj` in the corpus.
    c = 0
    for k, v in preds[subj].items():
        c += len(v)
    return c


def powCons(s1, s2):
    # Number of times predicate s2 appears in the sentence that immediately
    # follows a sentence whose predicate is s1, across all stories.
    count = 0
    for i in (preds[s1].keys() & preds[s2].keys()):
        i1 = 0
        i2 = 0
        while i1 != len(preds[s1][i]) and i2 != len(preds[s2][i]):
            if preds[s1][i][i1] + 1 == preds[s2][i][i2]:
                count += 1
                i1 += 1
                i2 += 1
            elif preds[s1][i][i1] + 1 < preds[s2][i][i2]:
                i1 += 1
            else:
                i2 += 1
    return count

# print(powCons('decide', 'make'))
# print(powCons('know', 'buy'))


def synset_lesk(sent, word):
    sent_tok = nltk.tokenize.word_tokenize(sent)
    return lesk(sent_tok, word)


def wpsim_lesk(word1, sent1, word2, sent2):
    # Wu-Palmer similarity between the Lesk-disambiguated senses of two words.
    synset1 = lesk(sent1, word1)
    synset2 = lesk(sent2, word2)
    return synset1.wup_similarity(synset2)


x = ['punct', 'conj']


def args_of_pred(s):
    return s.args


import math

alpha = 0.5


def FRelPred(sent1, sent2):
    # Predicate-level relatedness: how often the two root predicates occur in
    # consecutive sentences, normalised by their individual frequencies.
    try:
        p1 = sent1.preds[0]
        p2 = sent2.preds[0]
        if powCons(p1, p2) == 0:
            return 0.0
        return math.log2(powCons(p1, p2) / (powC(p1) * powC(p2)))
    except:
        return 0.2  # fallback when predicates are missing from the index


def FRelArgs(s1, s2):
    # Argument-level relatedness: for every argument of one sentence, take its
    # best Wu-Palmer similarity to any argument of the other, then average the
    # two directions.
    try:
        args1 = args_of_pred(s1)
        args2 = args_of_pred(s2)
        sent_tok1 = s1.lemmatized
        sent_tok2 = s2.lemmatized
        sum1 = 0
        sum2 = 0
        for ni in args1:
            max1 = 0
            synsetni = lesk(sent_tok1, ni)
            synsetnj = lesk(sent_tok2, args2[0])
            if not (synsetnj is None or synsetni is None):
                max1 = synsetni.wup_similarity(synsetnj)
                if max1 is None:
                    max1 = 0
            for nj in args2[1:]:
                synsetnj = lesk(sent_tok2, nj)
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj)
                    if wpsim is None:
                        wpsim = 0
                    if wpsim > max1:
                        max1 = wpsim
            sum1 += max1
        for ni in args2:
            max2 = 0
            synsetni = lesk(sent_tok2, ni)
            synsetnj = lesk(sent_tok1, args1[0])
            if not (synsetnj is None or synsetni is None):
                max2 = synsetni.wup_similarity(synsetnj)
                if max2 is None:
                    max2 = 0
            for nj in args1[1:]:
                synsetnj = lesk(sent_tok1, nj)
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj)
                    if wpsim is None:
                        wpsim = 0
                    if wpsim > max2:
                        max2 = wpsim
            sum2 += max2
        return 0.5 * ((1 / len(args1)) * sum1 + (1 / len(args2)) * sum2)
    except:
        return 0.2  # fallback when arguments or synsets are unavailable


def FRel(s1, s2):
    return alpha * FRelPred(s1, s2) + (1 - alpha) * FRelArgs(s1, s2)


def hac(foundSentences, length=2):
    # Greedy clustering of the retrieved sentences: find the most related
    # sentence pair (by FRel), then try to extend it with a third sentence.
    R = 0.1
    sentencePairs = []
    frelijs = []
    maxind = 0
    maxval = 0
    for i in itertools.permutations(foundSentences, 2):
        if i[0].textid != i[1].textid:
            frelij = FRel(i[0], i[1])
            if frelij > R:
                sentencePairs.append(list(i))
                frelijs.append(frelij)
                if frelij > maxval or len(sentencePairs) == 1:
                    maxval = frelij
                    maxind = len(sentencePairs) - 1

    maxvalThree = 0
    maxSentsThree = []
    threeSentsCluster = set()
    for pair in sentencePairs:
        for sent in foundSentences:
            if sent.textid != pair[0].textid and sent.textid != pair[1].textid:
                # Try placing the third sentence before the pair...
                frelij = FRel(sent, pair[0])
                if frelij > R:
                    threeSentsCluster.add(tuple([sent] + pair))
                    current = (frelijs[maxind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = [sent] + pair
                # ...and after it.
                frelij = FRel(pair[1], sent)
                if frelij > R:
                    threeSentsCluster.add(tuple(pair + [sent]))
                    current = (frelijs[maxind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = pair + [sent]

    if len(sentencePairs) >= 1:
        return [sentencePairs[maxind], maxSentsThree]  # alternatively: sentencePairs + list(threeSentsCluster)
    else:
        return []

# print(FRelPred('David noticed he had put on a lot of weight recently.',
#                'He examined his habits to try and figure out the reason.'))
# print(FRel('David noticed he had put on a lot of weight recently.',
#            'He examined his habits to try and figure out the reason.'))
# print(FRel('He decided to buy a pair of khakis.', 'The pair he bought fit him perfectly.'))

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
import gensim.downloader


def tfidfTokenizer(x):
    return [w for words in [s.lemmatized for s in x] for w in words]


def preprocess(x):
    return x

# tfidfvectorizer = TfidfVectorizer(tokenizer=tfidfTokenizer, preprocessor=preprocess, use_idf=True)
# tfidfvectorizer_vectors = tfidfvectorizer.fit_transform(storiesSentences)
# wvModel = gensim.downloader.load("word2vec-google-news-300")
# feature_names = tfidfvectorizer.get_feature_names()


def setVectors(stories):
    # Attach a word2vec vector to every lemma; out-of-vocabulary words get a zero vector.
    for doc in stories:
        for sentence in doc:
            vectors = []
            for lemma in sentence.lemmatized:
                try:
                    vectors.append(wvModel[lemma])
                except KeyError:
                    vectors.append([0] * 300)
            sentence.vectors = vectors

# setVectors(storiesSentences)


def setTfIdfs(documents):
    # Attach per-lemma tf-idf weights to every sentence and pre-compute the
    # weighted sentence vectors.
    for i, doc in enumerate(documents):
        feature = tfidfvectorizer_vectors[i, :].nonzero()[1]
        tfidfs = zip(feature, [tfidfvectorizer_vectors[i, x] for x in feature])
        tfidfsbyword = dict()
        for w, s in [(feature_names[j], s) for (j, s) in tfidfs]:
            tfidfsbyword[w] = s
        for sent in doc:
            tfidfs = []
            for lemma in sent.lemmatized:
                tfidfs.append(tfidfsbyword[lemma])
            sent.tfidfs = tfidfs
            sent.calculateVector()

# setTfIdfs(storiesSentences)

# m = searchByRequest(['wake', 'present', 'Christmas'])
# for sent in m:
#     print(sent.lemmatized)


def generate(words):
    m = searchByRequest(words)
    return hac(m)


def generate_and_choose(input_text):
    # Generate three beam-search continuations and return the one whose
    # consecutive sentences are most related according to FRel.
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    beam_outputs = model.generate(input_ids, max_length=100, num_return_sequences=3,
                                  num_beams=3, no_repeat_ngram_size=2, early_stopping=True)
    return_list = []
    for i, beam_output in enumerate(beam_outputs):
        return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True,
                                            clean_up_tokenization_spaces=True))
    outputs_coherences = []
    for i, text in enumerate(return_list):
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        for s in range(length - 1):
            # FRel expects Sentence objects, so wrap the raw sentence strings.
            coherence_cur += FRel(Sentence(-1, s, sentencesTokenized[s]),
                                  Sentence(-1, s + 1, sentencesTokenized[s + 1]))
        if length == 1:
            length += 1
        outputs_coherences.append(coherence_cur / (length - 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]


def greedy_generate(inp):
    # Greedy decoding, stopping at the first period.
    input_ids = tokenizer.encode(inp, return_tensors='tf')
    greedy_output = model.generate(input_ids,
                                   pad_token_id=tokenizer.encode('.')[0],
                                   eos_token_id=tokenizer.encode('.')[0])
    return tokenizer.decode(greedy_output[0], skip_special_tokens=True)


def with_sampling(input_ids):
    tf.random.set_seed(0)
    # activate sampling and deactivate top_k by setting top_k to 0
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=0,
        temperature=0.7)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)


def with_top_k_sampling(input_ids):
    tf.random.set_seed(0)
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)


def with_nucleus_sampling(input_ids):
    tf.random.set_seed(0)
    # set top_k = 50, top_p = 0.95 and num_return_sequences = 3
    sample_outputs = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50,
        top_p=0.95,
        num_return_sequences=3)
    return_list = []
    for i, beam_output in enumerate(sample_outputs):
        return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True,
                                            clean_up_tokenization_spaces=True))
    # As in generate_and_choose, pick the sample with the highest FRel coherence.
    outputs_coherences = []
    for i, text in enumerate(return_list):
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        for s in range(length - 1):
            coherence_cur += FRel(Sentence(-1, s, sentencesTokenized[s]),
                                  Sentence(-1, s + 1, sentencesTokenized[s + 1]))
        if length == 1:
            length += 1
        outputs_coherences.append(coherence_cur / (length - 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]


def generation_method(decoding_algorithm, input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    if decoding_algorithm == "Beam search":
        return generate_and_choose(input_text)
    elif decoding_algorithm == "Greedy search":
        return greedy_generate(input_text)
    elif decoding_algorithm == "With sampling":
        return with_sampling(input_ids)
    elif decoding_algorithm == "With top k sampling":
        return with_top_k_sampling(input_ids)
    elif decoding_algorithm == "With nucleus sampling":
        return with_nucleus_sampling(input_ids)


in1 = gr.inputs.Dropdown(choices=["Beam search", "Greedy search", "With sampling",
                                  "With top k sampling", "With nucleus sampling"])
in2 = gr.inputs.Textbox()
iface = gr.Interface(fn=generation_method, inputs=[in1, in2], outputs="text")
iface.launch(debug=True)