import gradio as gr
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# add the EOS token as PAD token to avoid warnings
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

import stanza
stanza.download('en')
import nltk
nltk.download('punkt')
nltk.download('wordnet')

savejson = True
indexing = True

import itertools
import json
import math
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.wsd import lesk

warnings.filterwarnings('ignore')

nlp = stanza.Pipeline('en')
lemmatizer = WordNetLemmatizer()
stories = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017 (1).csv')

def defdict():
    # module-level factory for the nested defaultdicts used by the indexes
    return defaultdict(list)
class Sentence:
    def __init__(self, textid, sentencenum, sentence, vectors=None, tfidfs=None,
                 sentenceVector=None, lemmatized=None, preds=None, args=None):
        self.textid = textid
        self.sentence = sentence
        self.sentencenum = sentencenum
        self.vectors = vectors
        self.tfidfs = tfidfs
        self.sentenceVector = sentenceVector
        self.preds = preds
        self.args = args
        if lemmatized:
            self.lemmatized = lemmatized
        else:
            self.lemmatize()

    def lemmatize(self):
        doc = nlp(self.sentence)
        self.lemmatized = []
        self.preds = []
        self.args = []
        ind = 0
        # collect the lemmas of all non-punctuation tokens and remember the root predicate
        for i, dep_edge in enumerate(doc.sentences[0].dependencies):
            if dep_edge[1] != 'punct':
                self.lemmatized.append(dep_edge[2].lemma)
                if dep_edge[1] == "root":
                    self.preds.append(dep_edge[2].lemma)
                    ind = i + 1
        # the arguments are the direct non-punctuation dependents of the root
        for dep_edge in doc.sentences[0].dependencies:
            if int(dep_edge[2].head) == ind and dep_edge[1] != 'punct':  # dep_edge[1] not in x
                self.args.append(lemmatizer.lemmatize(dep_edge[2].lemma.lower()))
        self.doc = doc

    def calculateVector(self):
        # tf-idf weighted sum of the per-word embedding vectors
        if self.vectors and self.tfidfs:
            self.sentenceVector = np.dot(self.tfidfs, self.vectors)
        return self.sentenceVector

    def getVector(self):
        if self.sentenceVector is None:
            self.calculateVector()
        return self.sentenceVector
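# Minimal sketch of the vector API (assumes the word2vec/tf-idf setup further below
# has been run; it is commented out by default, so getVector() returns None until then):
# s = storiesSentences[0][0]
# print(s.getVector())  # tf-idf weighted sum of the word vectors, a length-300 array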
class Story:
    def __init__(self, sentences, number):
        self.sentences = sentences
        self.number = number

    def lemmatizedSents(self):
        return [s.lemmatized for s in self.sentences]

storiesSentences = []
sentencesjsons = []

def indexSents(sents):
    # inverted index: lemma -> story id -> [(token position, sentence number), ...]
    ind = defaultdict(defdict)
    for sc in sents:
        for i, w in enumerate(sc.lemmatized):
            ind[w][sc.textid].append((i, sc.sentencenum))
    return ind
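# Hypothetical example of the index structure: ind['dog'][42] == [(3, 1)] would mean
# 'dog' is the 4th non-punctuation token of the 2nd sentence of story 42.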
def indexCorpus():
    sentences = []
    # only the first 300 stories are indexed to keep startup time manageable
    for i, story in stories[:300].iterrows():
        storiesSentences.append([])
        print(i)
        # columns 0-1 are the story id and title; the five sentences start at column 2
        for sind, sent in enumerate(story[2:]):
            sentence = Sentence(i, sind, sent)
            storiesSentences[i].append(sentence)
            sentences.append(sentence)
            sentencesjsons.append(sentence.__dict__)
    return indexSents(sentences)

if savejson:
    index = indexCorpus()
    # json.dump(sentencesjsons, open('filename.json', 'a'))
else:
    sentencesjsons = json.load(open('filename.json'))

# if indexing:
#     json.dump(index, open('index.json', 'w'))
# else:
#     index = json.load(open('index.json'))
def searchByRequest(words):
    # expand each query word with its WordNet synonyms
    synonyms = []
    for i, w in enumerate(words):
        synonyms.append(set([w]))
        for synset in wordnet.synsets(w):
            synonyms[i].update(synset.lemma_names())
    # for each query word, collect the posting lists of all its synonyms
    # and the set of stories in which any synonym occurs
    dictsForWords = []
    storiesForWords = []
    for i, w in enumerate(synonyms):
        dictsForWords.append([])
        storiesForWords.append(set())
        for synonym in w:
            currentDict = index[synonym]
            if currentDict:
                dictsForWords[i].append(currentDict)
                storiesForWords[i].update(set(currentDict.keys()))
    # keep only the stories that contain all query words (or their synonyms)
    paragraphs = set.intersection(*storiesForWords)
    sentencesClasses = set()
    for p in paragraphs:
        temporarySentences = []
        for wi, wordDictsList in enumerate(dictsForWords):
            # sentence numbers in story p that contain query word wi (or a synonym)
            temporarySentences.append(set())
            for dictionary in wordDictsList:
                if p in dictionary:
                    for sents in dictionary[p]:
                        temporarySentences[wi].update([sents[1]])
            # a sentence qualifies if it contains at least two of the query words
            if wi > 0 and len(words) > 1:
                for i in range(wi):
                    for s in temporarySentences[wi]:
                        if s in temporarySentences[i]:
                            sentencesClasses.update([storiesSentences[p][s]])
    return sentencesClasses

# m = searchByRequest(['play', 'fun', 'game'])
# m = searchByRequest(['present', 'Christmas', 'wake'])
def predIndex():
    # index of root predicates: lemma -> story id -> [sentence numbers]
    ind = defaultdict(defdict)
    for i, story in enumerate(storiesSentences):
        for j, sent in enumerate(story):
            for s in sent.preds:
                ind[s][i].append(j)
    return ind

preds = predIndex()
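# Hypothetical example of the predicate index: preds['decide'][10] == [0, 3] would
# mean 'decide' is the root of the 1st and 4th sentences of story 10.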
def powC(subj):
    # total number of sentences in which `subj` occurs as the root predicate
    c = 0
    for k, v in preds[subj].items():
        c += len(v)
    return c

def powCons(s1, s2):
    # number of times predicate s2 immediately follows predicate s1 within a story
    count = 0
    for i in (preds[s1].keys() & preds[s2].keys()):
        i1 = 0
        i2 = 0
        # merge-style scan over the two sorted sentence-number lists
        while i1 != len(preds[s1][i]) and i2 != len(preds[s2][i]):
            if preds[s1][i][i1] + 1 == preds[s2][i][i2]:
                count += 1
                i1 += 1
                i2 += 1
            elif preds[s1][i][i1] + 1 < preds[s2][i][i2]:
                i1 += 1
            else:
                i2 += 1
    return count

# print(powCons('decide', 'make'))
# print(powCons('know', 'buy'))
def synset_lesk(sent, word):
    sent_tok = nltk.tokenize.word_tokenize(sent)
    return lesk(sent_tok, word)

# comparison of wsd
# def wpsim():
# def wpsim_by_max():

def wpsim_lesk(word1, sent1, word2, sent2):
    # note: lesk expects tokenized sentences, so sent1/sent2 should be lists of tokens
    synset1 = lesk(sent1, word1)
    synset2 = lesk(sent2, word2)
    return synset1.wup_similarity(synset2)

x = ['punct', 'conj']

def args_of_pred(s):
    return s.args

alpha = 0.5
def FRelPred(sent1, sent2):
    # relatedness of the two root predicates, based on how often they occur
    # in consecutive sentences of the indexed corpus
    try:
        p1 = sent1.preds[0]
        p2 = sent2.preds[0]
        if powCons(p1, p2) == 0:
            return 0.0
        return math.log2(powCons(p1, p2) / (powC(p1) * powC(p2)))
    except Exception:
        # fall back to a small constant when a sentence has no detected predicate
        return 0.2

def FRelArgs(s1, s2):
    # average best Wu-Palmer similarity between the arguments of the two predicates,
    # with word senses disambiguated by the Lesk algorithm
    try:
        args1 = args_of_pred(s1)
        args2 = args_of_pred(s2)
        sent_tok1 = s1.lemmatized
        sent_tok2 = s2.lemmatized
        sum1 = 0
        sum2 = 0
        # for each argument of s1, find its best match among the arguments of s2
        for ni in args1:
            max1 = 0  # reset the best-match score for each argument
            synsetni = lesk(sent_tok1, ni)
            synsetnj = lesk(sent_tok2, args2[0])
            if not (synsetnj is None or synsetni is None):
                max1 = synsetni.wup_similarity(synsetnj) or 0
            for nj in args2[1:]:
                synsetnj = lesk(sent_tok2, nj)
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj) or 0
                    if wpsim > max1:
                        max1 = wpsim
            sum1 += max1
        # and symmetrically for each argument of s2
        for ni in args2:
            max2 = 0
            synsetni = lesk(sent_tok2, ni)
            synsetnj = lesk(sent_tok1, args1[0])
            if not (synsetnj is None or synsetni is None):
                max2 = synsetni.wup_similarity(synsetnj) or 0
            for nj in args1[1:]:
                synsetnj = lesk(sent_tok1, nj)
                if not (synsetnj is None or synsetni is None):
                    wpsim = synsetni.wup_similarity(synsetnj) or 0
                    if wpsim > max2:
                        max2 = wpsim
            sum2 += max2
        return 0.5 * ((1 / len(args1)) * sum1 + (1 / len(args2)) * sum2)
    except Exception:
        # fall back when a sentence has no arguments
        return 0.2

def FRel(s1, s2):
    # overall sentence relatedness: weighted mix of predicate and argument relatedness
    return alpha * FRelPred(s1, s2) + (1 - alpha) * FRelArgs(s1, s2)
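# Sanity check on two indexed sentences (actual scores depend on the corpus loaded above):
# s1, s2 = storiesSentences[0][0], storiesSentences[0][1]
# print(FRel(s1, s2))  # alpha-weighted mix of FRelPred and FRelArgs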
def hac(foundSentences, length=2):
    # greedy agglomerative clustering of the retrieved sentences by FRel score
    R = 0.1
    sentencePairs = []
    frelijs = []
    ind = 0
    maxind = 0
    maxval = 0
    # first pass: collect all cross-story sentence pairs that clear the threshold R,
    # remembering the index of the best-scoring pair
    for i in itertools.permutations(foundSentences, 2):
        if i[0].textid != i[1].textid:
            frelij = FRel(i[0], i[1])
            if frelij > R:
                sentencePairs.append(list(i))
                frelijs.append(frelij)
                if ind != 0:
                    if frelij > maxval:
                        maxval = frelij
                        maxind = ind
                    ind += 1
                else:
                    ind = 1
                    maxval = frelij
                    maxind = 0
    # second pass: try to extend each pair with a third sentence from yet another story,
    # scoring a triple as the mean of its two link scores
    maxvalThree = 0
    maxSentsThree = []
    threeSentsCluster = set()
    for pairind, pair in enumerate(sentencePairs):
        for sent in foundSentences:
            if sent.textid != pair[0].textid and sent.textid != pair[1].textid:
                frelij = FRel(sent, pair[0])
                if frelij > R:
                    threeSentsCluster.add(tuple([sent] + pair))
                    current = (frelijs[pairind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = [sent] + pair
                frelij = FRel(pair[1], sent)
                if frelij > R:
                    threeSentsCluster.add(tuple(pair + [sent]))
                    current = (frelijs[pairind] + frelij) / 2
                    if current > maxvalThree:
                        maxvalThree = current
                        maxSentsThree = pair + [sent]
    if len(sentencePairs) >= 1:
        return [sentencePairs[maxind], maxSentsThree]
    else:
        return []
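# Example (hypothetical query; hac returns [] when no pair clears the threshold R):
# for cluster in hac(searchByRequest(['wake', 'present', 'Christmas'])):
#     print(' '.join(s.sentence for s in cluster))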
# Note: FRelPred/FRel operate on Sentence objects, not raw strings, e.g.:
# print(FRel(Sentence(0, 0, 'David noticed he had put on a lot of weight recently.'),
#            Sentence(0, 1, 'He examined his habits to try and figure out the reason.')))
# print(FRel(Sentence(0, 0, 'He decided to buy a pair of khakis.'),
#            Sentence(0, 1, 'The pair he bought fit him perfectly.')))
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
import gensim.downloader

def tfidfTokenizer(x):
    return [w for words in [s.lemmatized for s in x] for w in words]

def preprocess(x):
    return x

# tfidfvectorizer = TfidfVectorizer(tokenizer=tfidfTokenizer, preprocessor=preprocess, use_idf=True)
# tfidfvectorizer_vectors = tfidfvectorizer.fit_transform(storiesSentences)
# wvModel = gensim.downloader.load("word2vec-google-news-300")
# feature_names = tfidfvectorizer.get_feature_names()

def setVectors(stories):
    # attach a word2vec embedding to every lemma (zero vector for out-of-vocabulary words)
    for doc in stories:
        for sentence in doc:
            vectors = []
            for lemma in sentence.lemmatized:
                try:
                    vectors.append(wvModel[lemma])
                except KeyError:
                    vectors.append([0] * 300)
            sentence.vectors = vectors

# setVectors(storiesSentences)

def setTfIdfs(documents):
    # attach per-lemma tf-idf scores and precompute the weighted sentence vectors
    for i, doc in enumerate(documents):
        feature = tfidfvectorizer_vectors[i, :].nonzero()[1]
        tfidfs = zip(feature, [tfidfvectorizer_vectors[i, x] for x in feature])
        tfidfsbyword = dict()
        for w, s in [(feature_names[j], s) for (j, s) in tfidfs]:
            tfidfsbyword[w] = s
        for sent in doc:
            sent.tfidfs = [tfidfsbyword[lemma] for lemma in sent.lemmatized]
            sent.calculateVector()

# setTfIdfs(storiesSentences)
# m = searchByRequest(['wake', 'present', 'Christmas'])
# for sent in m:
#     print(sent.lemmatized)
def generate(words):
    m = searchByRequest(words)
    return hac(m)
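# generate() is just the retrieval + clustering pipeline in one call, e.g. (hypothetical):
# for cluster in generate(['play', 'fun', 'game']):
#     print([s.sentence for s in cluster])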
def generate_and_choose(input_text):
    # generate three beam-search continuations and keep the most coherent one
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    beam_outputs = model.generate(input_ids, max_length=100, num_return_sequences=3,
                                  num_beams=3, no_repeat_ngram_size=2, early_stopping=True)
    return_list = []
    for beam_output in beam_outputs:
        return_list.append(tokenizer.decode(beam_output, skip_special_tokens=True,
                                            clean_up_tokenization_spaces=True))
    outputs_coherences = []
    for text in return_list:
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        # score coherence as the average FRel between adjacent sentences;
        # FRel expects Sentence objects, so wrap the raw strings
        for s in range(length - 1):
            coherence_cur += FRel(Sentence(0, s, sentencesTokenized[s]),
                                  Sentence(0, s + 1, sentencesTokenized[s + 1]))
        outputs_coherences.append(coherence_cur / max(length - 1, 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]
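# Example: generate three beam candidates and keep the most coherent one
# print(generate_and_choose("Tom wanted to learn to swim."))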
def greedy_generate(inp):
    input_ids = tokenizer.encode(inp, return_tensors='tf')
    # use '.' as both pad and eos token so generation stops at the end of a sentence
    greedy_output = model.generate(input_ids, pad_token_id=tokenizer.encode('.')[0],
                                   eos_token_id=tokenizer.encode('.')[0])
    return tokenizer.decode(greedy_output[0], skip_special_tokens=True)
def with_sampling(input_ids):
    tf.random.set_seed(0)
    # activate sampling and deactivate top_k by setting top_k sampling to 0
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=0,
        temperature=0.7)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)

def with_top_k_sampling(input_ids):
    tf.random.set_seed(0)
    # sample only from the 50 most likely next tokens
    sample_output = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50)
    return tokenizer.decode(sample_output[0], skip_special_tokens=True)
def with_nucleus_sampling(input_ids):
    tf.random.set_seed(0)
    # set top_k = 50, top_p = 0.95 and num_return_sequences = 3
    sample_outputs = model.generate(
        input_ids,
        do_sample=True,
        max_length=50,
        top_k=50,
        top_p=0.95,
        num_return_sequences=3)
    return_list = []
    for sample_output in sample_outputs:
        return_list.append(tokenizer.decode(sample_output, skip_special_tokens=True,
                                            clean_up_tokenization_spaces=True))
    # pick the candidate whose adjacent sentences are most related (as in generate_and_choose)
    outputs_coherences = []
    for text in return_list:
        sentencesTokenized = nltk.sent_tokenize(text)
        coherence_cur = 0
        length = len(sentencesTokenized)
        for s in range(length - 1):
            coherence_cur += FRel(Sentence(0, s, sentencesTokenized[s]),
                                  Sentence(0, s + 1, sentencesTokenized[s + 1]))
        outputs_coherences.append(coherence_cur / max(length - 1, 1))
    index_of_max = outputs_coherences.index(max(outputs_coherences))
    return return_list[index_of_max]
def generation_method(decoding_algorithm, input_text):
    # dispatch to one of the five decoding strategies selected in the UI
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    if decoding_algorithm == "Beam search":
        return generate_and_choose(input_text)
    elif decoding_algorithm == "Greedy search":
        return greedy_generate(input_text)
    elif decoding_algorithm == "With sampling":
        return with_sampling(input_ids)
    elif decoding_algorithm == "With top k sampling":
        return with_top_k_sampling(input_ids)
    elif decoding_algorithm == "With nucleus sampling":
        return with_nucleus_sampling(input_ids)
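# Example (any of the dropdown choices below works as the first argument):
# print(generation_method("Greedy search", "David noticed he had put on a lot of weight recently."))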
# gradio was already imported at the top of the file
in1 = gr.inputs.Dropdown(choices=["Beam search", "Greedy search", "With sampling",
                                  "With top k sampling", "With nucleus sampling"])
in2 = gr.inputs.Textbox()
iface = gr.Interface(fn=generation_method,
                     inputs=[in1, in2],
                     outputs="text")
iface.launch(debug=True)