from transformers import RobertaTokenizer, RobertaForMaskedLM, GPT2Tokenizer
import torch
import tensorflow as tf
import wikipedia
import re
import random
import nltk
from aitextgen import aitextgen
nltk.download('cmudict')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = aitextgen(tf_gpt2="355M")
#gpt2_model = aitextgen()
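# Model roles, as used further down in this file: GPT-2 (via aitextgen) drafts each line of the
# limerick, RoBERTa fills in the masked words that bridge a line's opening to its forced rhyming
# word, and the GPT-2 tokenizer is only used to measure prompt lengths in tokens.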
# Common English words (one per line) used to keep only familiar rhyming words.
frequent_words = set()
with open("wordFrequency.txt", 'r') as f:
    line = f.readline()
    while line != '':  # the EOF char is an empty string
        frequent_words.add(line.strip())
        line = f.readline()
def filter_rhymes(word):
    # Reject line endings that are stop words or notoriously hard to rhyme,
    # so that a rhyming partner can always be found for them.
    filter_list = ['to', 'on', 'has', 'but', 'the', 'in', 'and', 'a', 'aitch', 'angst', 'arugula', 'beige', 'blitzed', 'boing', 'bombed', 'cairn', 'chaos', 'chocolate', 'circle', 'circus', 'cleansed', 'coif', 'cusp', 'doth', 'else', 'eth', 'fiends', 'film', 'flange', 'fourths', 'grilse', 'gulf', 'kiln', 'loge', 'midst', 'month', 'music', 'neutron', 'ninja', 'oblige', 'oink', 'opus', 'orange', 'pint', 'plagued', 'plankton', 'plinth', 'poem', 'poet', 'purple', 'quaich', 'rhythm', 'rouged', 'silver', 'siren', 'soldier', 'sylph', 'thesp', 'toilet', 'torsk', 'tufts', 'waltzed', 'wasp', 'wharves', 'width', 'woman', 'yttrium']
    return word not in filter_list
def remove_punctuation(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.replace("\n", " ")
    return text
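# Illustrative example (not part of the app's flow): remove_punctuation strips anything that is
# not a word character or whitespace and flattens newlines to spaces, e.g.
#   remove_punctuation("Hello, world!\n")  ->  "Hello world "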
def get_rhymes(inp, level):
    entries = nltk.corpus.cmudict.entries()
    syllables = [(word, syl) for word, syl in entries if word == inp]
    rhymes = []
    filtered_rhymes = set()
    for (word, syllable) in syllables:
        # A word rhymes if its last `level` phonemes match those of the input word.
        rhymes += [word for word, pron in entries if pron[-level:] == syllable[-level:]]
    for word in rhymes:
        # Keep only common words, and never the input word itself.
        if (word in frequent_words) and (word != inp):
            filtered_rhymes.add(word)
    return filtered_rhymes
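# Illustrative example (not part of the app's flow): with level=3, get_rhymes("grace", 3) keeps
# CMUdict entries whose last three phonemes match those of "grace" and that also appear in
# wordFrequency.txt; the exact set returned depends on the contents of that file.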
def get_inputs_length(input):
    input_ids = gpt2_tokenizer(input)['input_ids']
    return len(input_ids)
def get_prediction(sent):
    token_ids = roberta_tokenizer.encode(sent, return_tensors='pt')
    masked_position = (token_ids.squeeze() == roberta_tokenizer.mask_token_id).nonzero()
    masked_pos = [mask.item() for mask in masked_position]
    with torch.no_grad():
        output = roberta_model(token_ids)
    last_hidden_state = output[0].squeeze()
    list_of_list = []
    for index, mask_index in enumerate(masked_pos):
        words = []
        k = 5
        while not words:
            mask_hidden_state = last_hidden_state[mask_index]
            idx = torch.topk(mask_hidden_state, k=k, dim=0)[1]
            for i in idx:
                word = roberta_tokenizer.decode(i.item()).strip()
                # Skip pure punctuation and special tokens.
                if (remove_punctuation(word) != "") and (word != '</s>'):
                    words.append(word)
            # Widen the search if every candidate so far was punctuation or a special token;
            # otherwise this loop would retry the same top-k forever.
            k += 5
        list_of_list.append(words)
        print(f"Mask {index+1} Guesses: {words}")
    best_guess = ""
    for j in list_of_list:
        best_guess = best_guess + " " + j[0]
    return best_guess
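# Illustrative example (not part of the app's flow): get_prediction expects RoBERTa's literal
# "<mask>" token for each blank, e.g.
#   get_prediction("The cat sat on the <mask> <mask> <mask> mat.")
# and returns the top guess for every mask joined into one space-prefixed string.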
def get_line(prompt, inputs_len):
    # Generate a short continuation; the slice drops the echoed prompt, the appended "."
    # and the following character (usually a space).
    line = gpt2_model.generate_one(prompt=prompt + ".", max_length=inputs_len + 7, min_length=4)[len(prompt)+2:]
    return line
def get_rhyming_line(prompt, rhyming_word, inputs_len):
    # Generate the first few words of the line, retrying until GPT-2 returns something non-empty.
    gpt2_sentence = gpt2_model.generate_one(prompt=prompt + ".", max_length=inputs_len + 4, min_length=2)[len(prompt)+2:]
    while len(gpt2_sentence) == 0:
        gpt2_sentence = gpt2_model.generate_one(prompt=prompt + ".", max_length=inputs_len + 4, min_length=2)[len(prompt)+2:]
    gpt2_sentence = gpt2_sentence.replace("\n", "")
    print(f"\nGetting rhyming line starting with '{gpt2_sentence}' and ending with rhyming word '{rhyming_word}'")
    # Bridge the GPT-2 opening and the forced rhyming word with three blanks for RoBERTa to fill.
    sentence = gpt2_sentence + " ___ ___ ___ " + rhyming_word
    print(f"Original Sentence: {sentence}")
    if sentence[-1] != ".":
        sentence = sentence.replace("___", "<mask>") + "."
    else:
        sentence = sentence.replace("___", "<mask>")
    print(f"Original Sentence replaced with mask: {sentence}")
    print("\n")
    predicted_blanks = get_prediction(sentence)
    print(f"\nBest guess for fill in the blanks: {predicted_blanks}")
    final_sentence = gpt2_sentence + predicted_blanks + " " + rhyming_word
    print(f"Final Sentence: {final_sentence}")
    return final_sentence
def gpt2_summary(topic):
    return gpt2_model.generate_one(prompt=f"Here is some information about {topic}.", top_k=100, top_p=0.95, min_length=200)
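# Method B: instead of querying Wikipedia, GPT-2 itself writes the topic "summary" by
# continuing the prompt "Here is some information about {topic}."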
def generate(topic, wiki=True):
    # Method A seeds the poem with a Wikipedia summary; Method B with a GPT-2 generated one.
    if wiki:
        try:
            topic_summary = remove_punctuation(wikipedia.summary(topic))
        except Exception:
            return f"Method A struggled to find information about {topic}, please try a different topic!"
    else:
        topic_summary = remove_punctuation(gpt2_summary(topic))
    word_list = topic_summary.split()
    topic_summary_len = len(topic_summary)
    no_of_words = len(word_list)
    inputs_len = get_inputs_length(topic_summary)
    print(f"Topic Summary: {topic_summary}")
    print(f"Topic Summary Length: {topic_summary_len}")
    print(f"No of Words in Summary: {no_of_words}")
    print(f"Length of Input IDs: {inputs_len}")

    # First line (A rhyme): retry until it ends in a word that passes filter_rhymes
    # and has at least three common rhyming words.
    rhyming_words_125 = []
    while len(rhyming_words_125) < 3 or not valid_rhyme or len(first_line) == 0:
        first_line = get_line(topic_summary, inputs_len)
        if first_line:
            end_word = remove_punctuation(first_line.split()[-1])
            valid_rhyme = filter_rhymes(end_word)
            if valid_rhyme:
                print(f"\nFirst Line: {first_line}")
                rhyming_words_125 = list(get_rhymes(end_word, 3))
                print(f"Rhyming words for '{end_word}' are {rhyming_words_125}")
    limerick = first_line + "\n"

    # Second line (A rhyme): force the ending onto a word that rhymes with line one.
    rhyming_word = random.choice(rhyming_words_125)
    rhyming_words_125.remove(rhyming_word)
    prompt = topic_summary + " " + first_line
    inputs_len = get_inputs_length(prompt)
    print(f"Prompt: {prompt}")
    print(f"Length of prompt: {inputs_len}")
    second_line = get_rhyming_line(prompt, rhyming_word, inputs_len)
    print(f"\nSecond Line: {second_line}")
    limerick += second_line + "\n"

    # Third line (B rhyme): retry until it ends in a word with at least two rhyming words.
    rhyming_words_34 = []
    prompt = prompt + " " + second_line
    inputs_len = get_inputs_length(prompt)
    print(f"Prompt: {prompt}")
    print(f"Length of prompt: {inputs_len}")
    while len(rhyming_words_34) < 2 or not valid_rhyme or len(third_line) == 0:
        third_line = get_line(prompt, inputs_len)
        if third_line:
            print(f"\nThird Line: {third_line}")
            end_word = remove_punctuation(third_line.split()[-1])
            valid_rhyme = filter_rhymes(end_word)
            print(f"Does '{end_word}' have valid rhymes: {valid_rhyme}")
            rhyming_words_34 = list(get_rhymes(end_word, 3))
            print(f"Rhyming words for '{end_word}' are {rhyming_words_34}")
            if valid_rhyme and len(rhyming_words_34) > 1:
                limerick += third_line + "\n"

    # Fourth line (B rhyme): rhymes with line three.
    rhyming_word = random.choice(rhyming_words_34)
    rhyming_words_34.remove(rhyming_word)
    prompt = prompt + " " + third_line
    inputs_len = get_inputs_length(prompt)
    print(f"Prompt: {prompt}")
    print(f"Length of prompt: {inputs_len}")
    fourth_line = get_rhyming_line(prompt, rhyming_word, inputs_len)
    print(f"\nFourth Line: {fourth_line}")
    limerick += fourth_line + "\n"

    # Fifth line (A rhyme): rhymes with lines one and two.
    rhyming_word = random.choice(rhyming_words_125)
    rhyming_words_125.remove(rhyming_word)
    prompt = prompt + " " + fourth_line
    inputs_len = get_inputs_length(prompt)
    print(f"Prompt: {prompt}")
    print(f"Length of prompt: {inputs_len}")
    fifth_line = get_rhyming_line(prompt, rhyming_word, inputs_len)
    print(f"\nFifth Line: {fifth_line}")
    limerick += fifth_line + "\n"

    print("\n")
    print(limerick)
    return limerick
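# Illustrative usage (the Gradio interface below calls this via compare_summaries):
#   generate("Glasgow")              # Method A: seeded with a Wikipedia summary
#   generate("Glasgow", wiki=False)  # Method B: seeded with a GPT-2 summary
# "Glasgow" is only an example topic, not one used anywhere in this project.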
def compare_summaries(topic):
    # Method A: Wikipedia-seeded limerick; Method B: GPT-2-seeded limerick.
    wiki_limerick = generate(topic)
    gpt2_limerick = generate(topic, wiki=False)
    output1 = wiki_limerick
    output2 = gpt2_limerick
    print(output1 + "\n" + output2)
    return output1, output2
import gradio as gr
description = "Generates limericks (five-line poems with a rhyme scheme of AABBA) via two different methods"
article = '<center><big><strong>Limerick Generation</strong></big></center>'\
          '<center><strong>By Ans Farooq</strong></center>'\
          '<center><small>Level 4 Individual Project</small></center>'\
          '<center><small>BSc Computing Science</small></center>'\
          '<center><small>University of Glasgow</small></center>'\
          '<strong>Description</strong><br>'\
          'Recent advances in natural language processing (NLP) have shown '\
          'incredible promise in generating human-quality language. Poetry '\
          'presents an additional challenge, as it often relies on the rhyme '\
          'and rhythm of language. Factoring these in poses an interesting '\
          'challenge for new deep learning-based methods. This text-generation '\
          'project examines the use of transformer-based deep learning methods '\
          'with added constraints on length, rhyme and rhythm, given example '\
          'words to seed a poem. This interface allows you to produce two '\
          'cohesive limericks automatically, using two different methods. The '\
          'results of this project are to be evaluated through human comparisons.'
input = gr.inputs.Textbox(label='Topic')
output1 = gr.outputs.Textbox(label='Method A')
output2 = gr.outputs.Textbox(label='Method B')
interface = gr.Interface(
    fn=compare_summaries,
    inputs=input,
    outputs=[output1, output2],
    title="Text-generation with rhyme and rhythm",
    layout="horizontal",
    theme="peach",
    description=description,
    article=article)
interface.launch(debug=True)