import gradio as gr
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# add the EOS token as PAD token to avoid warnings
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

import stanza
stanza.download('en')
import nltk
nltk.download('punkt')
nltk.download('wordnet')

savejson = True
indexing = True

import itertools
import json
import math
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.wsd import lesk

warnings.filterwarnings('ignore')

nlp = stanza.Pipeline('en')
lemmatizer = WordNetLemmatizer()
stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')

def defdict():
    # module-level factory for the nested defaultdicts used by the indexes
    return defaultdict(list)
class Sentence:
    def __init__(self, textid, sentencenum, sentence, vectors=None, tfidfs=None,
                 sentenceVector=None, lemmatized=None, preds=None, args=None):
        self.textid = textid
        self.sentence = sentence
        self.sentencenum = sentencenum
        self.vectors = vectors
        self.tfidfs = tfidfs
        self.sentenceVector = sentenceVector
        self.preds = preds
        self.args = args
        if lemmatized:
            self.lemmatized = lemmatized
        else:
            self.lemmatize()

    def lemmatize(self):
        doc = nlp(self.sentence)
        self.lemmatized = []
        self.preds = []
        self.args = []
        ind = 0
        # collect the lemmas of all non-punctuation tokens and remember the root predicate
        for i, dep_edge in enumerate(doc.sentences[0].dependencies):
            if dep_edge[1] != 'punct':
                self.lemmatized.append(dep_edge[2].lemma)
                if dep_edge[1] == "root":
                    self.preds.append(dep_edge[2].lemma)
                    ind = i + 1
        # the arguments are the direct non-punctuation dependents of the root
        for dep_edge in doc.sentences[0].dependencies:
            if int(dep_edge[2].head) == ind and dep_edge[1] != 'punct':  # dep_edge[1] not in x
                self.args.append(lemmatizer.lemmatize(dep_edge[2].lemma.lower()))
        self.doc = doc

    def calculateVector(self):
        # tf-idf weighted sum of the per-word embedding vectors
        if self.vectors and self.tfidfs:
            self.sentenceVector = np.dot(self.tfidfs, self.vectors)
        return self.sentenceVector

    def getVector(self):
        if self.sentenceVector is None:
            self.calculateVector()
        return self.sentenceVector
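# Minimal sketch of the vector API (assumes the word2vec/tf-idf setup further below
# has been run; it is commented out by default, so getVector() returns None until then):
# s = storiesSentences[0][0]
# print(s.getVector())  # tf-idf weighted sum of the word vectors, a length-300 array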
class Story:
    def __init__(self, sentences, number):
        self.sentences = sentences
        self.number = number

    def lemmatizedSents(self):
        return [s.lemmatized for s in self.sentences]

storiesSentences = []
sentencesjsons = []

def indexSents(sents):
    # inverted index: lemma -> story id -> [(token position, sentence number), ...]
    ind = defaultdict(defdict)
    for sc in sents:
        for i, w in enumerate(sc.lemmatized):
            ind[w][sc.textid].append((i, sc.sentencenum))
    return ind
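# Hypothetical example of the index structure: ind['dog'][42] == [(3, 1)] would mean
# 'dog' is the 4th non-punctuation token of the 2nd sentence of story 42.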
def indexCorpus():
    sentences = []
    # only the first 300 stories are indexed to keep startup time manageable
    for i, story in stories[:300].iterrows():
        storiesSentences.append([])
        print(i)
        # columns 0-1 are the story id and title; the five sentences start at column 2
        for sind, sent in enumerate(story[2:]):
            sentence = Sentence(i, sind, sent)
            storiesSentences[i].append(sentence)
            sentences.append(sentence)
            sentencesjsons.append(sentence.__dict__)
    return indexSents(sentences)

if savejson:
    index = indexCorpus()
    # json.dump(sentencesjsons, open('filename.json', 'a'))
else:
    sentencesjsons = json.load(open('filename.json'))

# if indexing:
#     json.dump(index, open('index.json', 'w'))
# else:
#     index = json.load(open('index.json'))
def searchByRequest(words):
    # expand each query word with its WordNet synonyms
    synonyms = []
    for i, w in enumerate(words):
        synonyms.append(set([w]))
        for synset in wordnet.synsets(w):
            synonyms[i].update(synset.lemma_names())
    # for each query word, collect the posting lists of all its synonyms
    # and the set of stories in which any synonym occurs
    dictsForWords = []
    storiesForWords = []
    for i, w in enumerate(synonyms):
        dictsForWords.append([])
        storiesForWords.append(set())
        for synonym in w:
            currentDict = index[synonym]
            if currentDict:
                dictsForWords[i].append(currentDict)
                storiesForWords[i].update(set(currentDict.keys()))
    # keep only the stories that contain all query words (or their synonyms)
    paragraphs = set.intersection(*storiesForWords)
    sentencesClasses = set()
    for p in paragraphs:
        temporarySentences = []
        for wi, wordDictsList in enumerate(dictsForWords):
            # sentence numbers in story p that contain query word wi (or a synonym)
            temporarySentences.append(set())
            for dictionary in wordDictsList:
                if p in dictionary:
                    for sents in dictionary[p]:
                        temporarySentences[wi].update([sents[1]])
            # a sentence qualifies if it contains at least two of the query words
            if wi > 0 and len(words) > 1:
                for i in range(wi):
                    for s in temporarySentences[wi]:
                        if s in temporarySentences[i]:
                            sentencesClasses.update([storiesSentences[p][s]])
    return sentencesClasses

# m = searchByRequest(['play', 'fun', 'game'])
# m = searchByRequest(['present', 'Christmas', 'wake'])
def predIndex():
    # index of root predicates: lemma -> story id -> [sentence numbers]
    ind = defaultdict(defdict)
    for i, story in enumerate(storiesSentences):
        for j, sent in enumerate(story):
            for s in sent.preds:
                ind[s][i].append(j)
    return ind

preds = predIndex()
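# Hypothetical example of the predicate index: preds['decide'][10] == [0, 3] would
# mean 'decide' is the root of the 1st and 4th sentences of story 10.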
def powC(subj):
    # total number of sentences in which `subj` occurs as the root predicate
    c = 0
    for k, v in preds[subj].items():
        c += len(v)
    return c

def powCons(s1, s2):
    # number of times predicate s2 immediately follows predicate s1 within a story
    count = 0
    for i in (preds[s1].keys() & preds[s2].keys()):
        i1 = 0
        i2 = 0
        # merge-style scan over the two sorted sentence-number lists
        while i1 != len(preds[s1][i]) and i2 != len(preds[s2][i]):
            if preds[s1][i][i1] + 1 == preds[s2][i][i2]:
                count += 1
                i1 += 1
                i2 += 1
            elif preds[s1][i][i1] + 1 < preds[s2][i][i2]:
                i1 += 1
            else:
                i2 += 1
    return count

# print(powCons('decide', 'make'))
# print(powCons('know', 'buy'))
def synset_lesk(sent, word):
    sent_tok = nltk.tokenize.word_tokenize(sent)
    return lesk(sent_tok, word)

# comparison of wsd
# def wpsim():
# def wpsim_by_max():

def wpsim_lesk(word1, sent1, word2, sent2):
    # note: lesk expects tokenized sentences, so sent1/sent2 should be lists of tokens
    synset1 = lesk(sent1, word1)
    synset2 = lesk(sent2, word2)
    return synset1.wup_similarity(synset2)

x = ['punct', 'conj']

def args_of_pred(s):
    return s.args

alpha = 0.5
def FRelPred(sent1, sent2):
    # relatedness of the two root predicates, based on how often they occur
    # in consecutive sentences of the indexed corpus
    try:
        p1 = sent1.preds[0]
        p2 = sent2.preds[0]
        if powCons(p1, p2) == 0:
            return 0.0
        return math.log2(powCons(p1, p2) / (powC(p1) * powC(p2)))
    except Exception:
        # fall back to a small constant when a sentence has no detected predicate
        return 0.2

def FRelArgs(s1, s2):
    # average best Wu-Palmer similarity between the arguments of the two predicates,
    # with word senses disambiguated by the Lesk algorithm
    try:
        args1 = args_of_pred(s1)
        args2 = args_of_pred(s2)
        sent_tok1 = s1.lemmatized
        sent_tok2 = s2.lemmatized
        sum1 = 0
        sum2 = 0
        # for each argument of s1, find its best match among the arguments of s2
        for ni in args1:
            max1 = 0  # reset the best-match score for each argument
            synsetni = lesk(sent_tok1, ni)
            synsetnj = lesk(sent_tok2, args2[0])
            if not (synsetnj is None or synsetni is None):
                max1 = synsetni.wup_similarity(synsetnj) or 0
            for nj in args2[1:]:
                synsetnj = lesk(sent_tok2, nj)
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj) or 0
                    if wpsim > max1:
                        max1 = wpsim
            sum1 += max1
        # and symmetrically for each argument of s2
        for ni in args2:
            max2 = 0
            synsetni = lesk(sent_tok2, ni)
            synsetnj = lesk(sent_tok1, args1[0])
            if not (synsetnj is None or synsetni is None):
                max2 = synsetni.wup_similarity(synsetnj) or 0
            for nj in args1[1:]:
                synsetnj = lesk(sent_tok1, nj)
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj) or 0
                    if wpsim > max2:
                        max2 = wpsim
            sum2 += max2
        return 0.5 * ((1 / len(args1)) * sum1 + (1 / len(args2)) * sum2)
    except Exception:
        # fall back when a sentence has no arguments
        return 0.2

def FRel(s1, s2):
    # overall sentence relatedness: weighted mix of predicate and argument relatedness
    return alpha * FRelPred(s1, s2) + (1 - alpha) * FRelArgs(s1, s2)
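# Sanity check on two indexed sentences (actual scores depend on the corpus loaded above):
# s1, s2 = storiesSentences[0][0], storiesSentences[0][1]
# print(FRel(s1, s2))  # alpha-weighted mix of FRelPred and FRelArgs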
def hac(foundSentences, length=2):
    # greedy agglomerative clustering of the retrieved sentences by FRel score
    R = 0.1
    sentencePairs = []
    frelijs = []
    ind = 0
    maxind = 0
    maxval = 0
    # first pass: collect all cross-story sentence pairs that clear the threshold R,
    # remembering the index of the best-scoring pair
    for i in itertools.permutations(foundSentences, 2):
        if i[0].textid != i[1].textid:
            frelij = FRel(i[0], i[1])
            if frelij > R:
                sentencePairs.append(list(i))
                frelijs.append(frelij)
                if ind != 0:
                    if frelij > maxval:
                        maxval = frelij
                        maxind = ind
                    ind += 1
                else:
                    ind = 1
                    maxval = frelij
                    maxind = 0
    # second pass: try to extend each pair with a third sentence from yet another story,
    # scoring a triple as the mean of its two link scores
    maxvalThree = 0
    maxSentsThree = []
    threeSentsCluster = set()
    for pairind, pair in enumerate(sentencePairs):
        for sent in foundSentences:
            if sent.textid != pair[0].textid and sent.textid != pair[1].textid:
                frelij = FRel(sent, pair[0])
                if frelij > R:
                    threeSentsCluster.add(tuple([sent] + pair))
                    current = (frelijs[pairind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = [sent] + pair
                frelij = FRel(pair[1], sent)
                if frelij > R:
                    threeSentsCluster.add(tuple(pair + [sent]))
                    current = (frelijs[pairind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = pair + [sent]
    if len(sentencePairs) >= 1:
        return [sentencePairs[maxind], maxSentsThree]
    else:
        return []
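# Example (hypothetical query; hac returns [] when no pair clears the threshold R):
# for cluster in hac(searchByRequest(['wake', 'present', 'Christmas'])):
#     print(' '.join(s.sentence for s in cluster))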
# Note: FRelPred/FRel operate on Sentence objects, not raw strings, e.g.:
# print(FRel(Sentence(0, 0, 'David noticed he had put on a lot of weight recently.'),
#            Sentence(0, 1, 'He examined his habits to try and figure out the reason.')))
# print(FRel(Sentence(0, 0, 'He decided to buy a pair of khakis.'),
#            Sentence(0, 1, 'The pair he bought fit him perfectly.')))
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
import gensim.downloader

def tfidfTokenizer(x):
    return [w for words in [s.lemmatized for s in x] for w in words]

def preprocess(x):
    return x

# tfidfvectorizer = TfidfVectorizer(tokenizer=tfidfTokenizer, preprocessor=preprocess, use_idf=True)
# tfidfvectorizer_vectors = tfidfvectorizer.fit_transform(storiesSentences)
# wvModel = gensim.downloader.load("word2vec-google-news-300")
# feature_names = tfidfvectorizer.get_feature_names()

def setVectors(stories):
    # attach a word2vec embedding to every lemma (zero vector for out-of-vocabulary words)
    for doc in stories:
        for sentence in doc:
            vectors = []
            for lemma in sentence.lemmatized:
                try:
                    vectors.append(wvModel[lemma])
                except KeyError:
                    vectors.append([0] * 300)
            sentence.vectors = vectors

# setVectors(storiesSentences)

def setTfIdfs(documents):
    # attach per-lemma tf-idf scores and precompute the weighted sentence vectors
    for i, doc in enumerate(documents):
        feature = tfidfvectorizer_vectors[i, :].nonzero()[1]
        tfidfs = zip(feature, [tfidfvectorizer_vectors[i, x] for x in feature])
        tfidfsbyword = dict()
        for w, s in [(feature_names[j], s) for (j, s) in tfidfs]:
            tfidfsbyword[w] = s
        for sent in doc:
            sent.tfidfs = [tfidfsbyword[lemma] for lemma in sent.lemmatized]
            sent.calculateVector()

# setTfIdfs(storiesSentences)
# m = searchByRequest(['wake', 'present', 'Christmas'])
# for sent in m:
#     print(sent.lemmatized)
def generate(words):
    m = searchByRequest(words)
    return hac(m)
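# generate() is just the retrieval + clustering pipeline in one call, e.g. (hypothetical):
# for cluster in generate(['play', 'fun', 'game']):
#     print([s.sentence for s in cluster])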
def generate_and_choose(input_text):
    # generate three beam-search continuations and keep the most coherent one
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    beam_outputs = model.generate(input_ids, max_length=100, num_return_sequences=3,
                                  num_beams=3, no_repeat_ngram_size=2, early_stopping=True)
    return_list = []
    for beam_output in beam_outputs:
        return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True,
                                            clean_up_tokenization_spaces=True))
    outputs_coherences = []
    for text in return_list:
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        # score coherence as the average FRel between adjacent sentences;
        # FRel expects Sentence objects, so wrap the raw strings
        for s in range(length - 1):
            coherence_cur += FRel(Sentence(0, s, sentencesTokenized[s]),
                                  Sentence(0, s + 1, sentencesTokenized[s + 1]))
        outputs_coherences.append(coherence_cur / max(length - 1, 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]
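# Example: generate three beam candidates and keep the most coherent one
# print(generate_and_choose("Tom wanted to learn to swim."))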
def greedy_generate(inp):
    input_ids = tokenizer.encode(inp, return_tensors='tf')
    # use '.' as both pad and eos token so generation stops at the end of a sentence
    greedy_output = model.generate(input_ids, pad_token_id=tokenizer.encode('.')[0],
                                   eos_token_id=tokenizer.encode('.')[0])
    return tokenizer.decode(greedy_output[0], skip_special_tokens=True)
def with_sampling(input_ids):
    tf.random.set_seed(0)
    # activate sampling and deactivate top_k by setting top_k sampling to 0
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=0,
        temperature=0.7)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)

def with_top_k_sampling(input_ids):
    tf.random.set_seed(0)
    # sample only from the 50 most likely next tokens
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)
def with_nucleus_sampling(input_ids):
    tf.random.set_seed(0)
    # set top_k = 50, top_p = 0.95 and num_return_sequences = 3
    sample_outputs = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50,
        top_p=0.95,
        num_return_sequences=3)
    return_list = []
    for sample_output in sample_outputs:
        return_list.append(tokenizer.decode(sample_output, skip_special_tokens=True,
                                            clean_up_tokenization_spaces=True))
    # pick the candidate whose adjacent sentences are most related (as in generate_and_choose)
    outputs_coherences = []
    for text in return_list:
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        for s in range(length - 1):
            coherence_cur += FRel(Sentence(0, s, sentencesTokenized[s]),
                                  Sentence(0, s + 1, sentencesTokenized[s + 1]))
        outputs_coherences.append(coherence_cur / max(length - 1, 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]
def generation_method(decoding_algorithm, input_text):
    # dispatch to one of the five decoding strategies selected in the UI
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    if decoding_algorithm == "Beam search":
        return generate_and_choose(input_text)
    elif decoding_algorithm == "Greedy search":
        return greedy_generate(input_text)
    elif decoding_algorithm == "With sampling":
        return with_sampling(input_ids)
    elif decoding_algorithm == "With top k sampling":
        return with_top_k_sampling(input_ids)
    elif decoding_algorithm == "With nucleus sampling":
        return with_nucleus_sampling(input_ids)
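# Example (any of the dropdown choices below works as the first argument):
# print(generation_method("Greedy search", "David noticed he had put on a lot of weight recently."))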
# gradio was already imported at the top of the file
in1 = gr.inputs.Dropdown(choices=["Beam search", "Greedy search", "With sampling",
                                  "With top k sampling", "With nucleus sampling"])
in2 = gr.inputs.Textbox()
iface = gr.Interface(fn=generation_method,
                     inputs=[in1, in2],
                     outputs="text")
iface.launch(debug=True)