# Import required packages
import random
import re

import gradio as gr
import nltk
import torch
import wikipedia
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    RobertaForMaskedLM,
    RobertaTokenizer,
    pipeline,
)

nltk.download('cmudict')

# Use the RoBERTa model from HuggingFace for masked language modelling
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base')

# Use the GPT-2 from HuggingFace for causal language modelling
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=gpt2_tokenizer.eos_token_id)

# Initialise a text generation pipeline using HuggingFace Transformers and the pre-trained GPT-2 model
gpt2_pipeline = pipeline('text-generation', model=gpt2_model, tokenizer=gpt2_tokenizer)

# Hold all the words in wordFrequency.txt in a Python set (one stripped entry
# per line of the file).
with open("wordFrequency.txt", 'r', encoding="utf-8") as f:
    frequent_words = {line.strip() for line in f}

# Words that make poor line endings for rhyming purposes.
# NOTE: the original list spelled the last entry as 'it''s', which Python
# parses as the adjacent literals 'it' 's' == 'its'; it is written as "it's"
# here so the entry means what it says.
_RHYME_STOP_WORDS = frozenset([
    'an', 'to', 'on', 'has', 'but', 'the', 'in', 'and', 'a', 'are', 'or',
    'its', "it's",
])

# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


# Used alongside the word frequency list to filter out problematic words for rhyming
def filter_rhymes(word):
    """Return True if ``word`` is acceptable as a line-ending rhyme target.

    A word is rejected (False) when it appears in the stop-word list above.
    The comparison is case-sensitive.
    """
    return word not in _RHYME_STOP_WORDS


# Used to remove any punctuation and new line characters from generated text
def remove_punctuation(text):
    """Strip punctuation from ``text`` and flatten it onto one line.

    Every character that is not a word character or whitespace is removed,
    newlines are replaced by spaces, and leading/trailing whitespace is
    stripped from the result.
    """
    text = _PUNCTUATION_RE.sub('', text)
    text = text.replace("\n", " ")
    return text.strip()


# Used to find rhymes to a given word using NLTK
# where inp is a word and level means how good the rhyme should be.
# Adapted from the following Stack Overflow answer:
# https://stackoverflow.com/a/25714769/18559178
def get_rhymes(inp, level):
    """Return the set of frequent words that rhyme with ``inp``.

    Args:
        inp: The word to find rhymes for; it must match a cmudict entry
            exactly to produce any result.
        level: How many trailing pronunciation symbols must be identical for
            two words to count as rhyming (larger means a stricter rhyme).

    Returns:
        A set of words from ``frequent_words`` (excluding ``inp`` itself)
        whose pronunciation ends the same way as one of the pronunciations
        of ``inp``. Empty if ``inp`` is not in the dictionary.
    """
    entries = nltk.corpus.cmudict.entries()

    # Collect the endings of every pronunciation of the input word once, so
    # the dictionary is scanned twice in total rather than once per
    # pronunciation. Tuples are used because lists are not hashable.
    target_endings = {tuple(pron[-level:]) for word, pron in entries if word == inp}
    if not target_endings:
        return set()

    return {
        word
        for word, pron in entries
        if word != inp
        and word in frequent_words
        and tuple(pron[-level:]) in target_endings
    }


# Used to get the length of the topic summary, to then determine max length for
# the text generation pipeline
def get_inputs_length(input):
    """Return the number of GPT-2 tokens that ``input`` encodes to.

    The parameter name shadows the ``input`` builtin; it is kept unchanged so
    existing callers are unaffected.
    """
    input_ids = gpt2_tokenizer(input)['input_ids']
    return len(input_ids)


# Sized Fill-in-the-blank or Multi Mask filling with RoBERTa and Huggingface Transformers
# Used to fill in the blank words between the starting words of each line
# generated by GPT-2 and the end rhyming word
# Code adapted from the following Medium article:
# https://ramsrigoutham.medium.com/sized-fill-in-the-blank-or-multi-mask-filling-with-roberta-and-huggingface-transformers-58eb9e7fb0c
def get_prediction(sent):
    """Fill every RoBERTa mask token in ``sent`` and return the best guesses.

    Args:
        sent: A sentence containing zero or more RoBERTa mask tokens.

    Returns:
        A string made of the top guess for each mask, in order, each one
        preceded by a single space (so the result starts with a space when
        there is at least one mask). Returns "" when ``sent`` has no mask.
    """
    token_ids = roberta_tokenizer.encode(sent, return_tensors='pt')
    mask_positions = (token_ids.squeeze() == roberta_tokenizer.mask_token_id).nonzero()
    masked_pos = [position.item() for position in mask_positions]

    with torch.no_grad():
        output = roberta_model(token_ids)

    # First model output, squeezed to one row of vocabulary scores per input
    # position (the row index is the token position, the column the token id).
    token_scores = output[0].squeeze()

    guesses_per_mask = []
    for index, mask_index in enumerate(masked_pos):
        mask_scores = token_scores[mask_index]
        vocab_size = mask_scores.shape[0]

        # Start with the five best candidates, as before. If every one of
        # them is blank or pure punctuation, widen the search instead of
        # retrying the identical top-5 forever (the original loop could not
        # terminate in that case).
        words = []
        k = min(5, vocab_size)
        while not words:
            top_ids = torch.topk(mask_scores, k=k, dim=0)[1]
            # Discard predicted word if it is blank or end token
            for token_id in top_ids:
                word = roberta_tokenizer.decode(token_id.item()).strip()
                if remove_punctuation(word) != "":
                    words.append(word)
            if k == vocab_size:
                break  # whole vocabulary examined; nothing usable exists
            k = min(k * 2, vocab_size)

        guesses_per_mask.append(words)
        print(f"Mask {index+1} Guesses: {words}")

    # Join the single best guess for each mask; a mask with no usable
    # candidate at all contributes nothing.
    return "".join(" " + words[0] for words in guesses_per_mask if words)


# Used to generate the
# 1st and 3rd lines of the limerick
# these are full lines, without RoBERTa being used
def get_line(prompt, inputs_len):
    """Generate a full limerick line with GPT-2, continuing ``prompt``.

    Args:
        prompt: Text to continue; a "." is appended before generation.
        inputs_len: Token length of ``prompt``, used to cap the total length
            at ``inputs_len + 7`` so only a short continuation is produced.

    Returns:
        The generated continuation (prompt excluded) with punctuation and
        newlines removed. May be an empty string.
    """
    output = gpt2_pipeline(
        prompt + ".",
        min_length=4,
        max_length=inputs_len + 7,
        clean_up_tokenization_spaces=True,
        return_full_text=False
    )
    return remove_punctuation(output[0]['generated_text'])


# Used to generate the 2nd, 4th and 5th lines
# GPT-2 is used to generate starting few words of the lines
# RoBERTa is then used to fill in the rest of the words until the end rhyme word
def get_rhyming_line(prompt, rhyming_word, inputs_len):
    """Generate a line that ends with ``rhyming_word``.

    GPT-2 produces the opening words, three RoBERTa masks are placed between
    that opening and ``rhyming_word``, and the masks are filled by
    ``get_prediction``.

    Args:
        prompt: Text for GPT-2 to continue; a "." is appended.
        rhyming_word: The word the finished line must end with.
        inputs_len: Token length of ``prompt``; total generation length is
            capped at ``inputs_len + 3``.

    Returns:
        The assembled line: GPT-2 opening, predicted filler words, then
        ``rhyming_word``.
    """
    # Keep sampling until GPT-2 yields something that survives punctuation
    # removal.
    gpt2_sentence = ""
    while not gpt2_sentence:
        output = gpt2_pipeline(
            prompt + ".",
            min_length=4,
            max_length=inputs_len + 3,
            clean_up_tokenization_spaces=True,
            return_full_text=False
        )
        gpt2_sentence = remove_punctuation(output[0]['generated_text'])

    print(f"\nGetting rhyming line starting with '{gpt2_sentence}' and ending with rhyming word '{rhyming_word}'")
    sentence = gpt2_sentence + " ___ ___ ___ " + rhyming_word
    print(f"Original Sentence: {sentence}")

    # Swap each blank for RoBERTa's mask token. The previous code replaced
    # the blanks with an empty string, so get_prediction never found a mask
    # and no filler words were ever produced.
    ends_with_period = sentence[-1] == "."
    sentence = sentence.replace("___", roberta_tokenizer.mask_token)
    if not ends_with_period:
        sentence += "."
    print(f"Original Sentence replaced with mask: {sentence}")
    print("\n")

    predicted_blanks = get_prediction(sentence)
    print(f"\nBest guess for fill in the blanks: {predicted_blanks}")

    # predicted_blanks already carries its own leading space.
    final_sentence = gpt2_sentence + predicted_blanks + " " + rhyming_word
    print(f"Final Sentence: {final_sentence}")
    return final_sentence


# Used for the second method, Method B, of limerick generation
# Uses GPT-2 to get information about the user's given topic
def gpt2_summary(topic):
    """Return GPT-2 generated text about ``topic`` with punctuation removed."""
    output = gpt2_pipeline(
        f"Here is some information about {topic}.",
        min_length=100,
        max_length=300,
        clean_up_tokenization_spaces=True,
        return_full_text=False
    )
    return remove_punctuation(output[0]['generated_text'])


def _wikipedia_summary(topic):
    """Return a punctuation-free Wikipedia summary for ``topic``, or None.

    The first search hit is used. On a disambiguation error the first listed
    option is tried instead. Any other failure (no results, network error,
    or a failure of that fallback) yields None.
    """
    try:
        topic_search = wikipedia.search(topic, results=3)
        print(f"Wikipedia search results for {topic} are: {topic_search}")
        return remove_punctuation(wikipedia.summary(topic_search[0], auto_suggest=False))
    except wikipedia.DisambiguationError as e:
        # An exception raised inside this handler is not seen by the sibling
        # `except` below, so the fallback needs its own guard.
        try:
            print(f"Wikipedia returned a disambiguation error for {topic}. Selecting the first option {e.options[0]} instead.")
            page = e.options[0]
            return remove_punctuation(wikipedia.summary(page, auto_suggest=False))
        except Exception as fallback_error:
            print(f"Wikipedia fallback lookup failed: {fallback_error!r}")
            return None
    except Exception as lookup_error:
        print(f"Wikipedia lookup failed: {lookup_error!r}")
        return None


# Main logic for limerick generation is contained here
def generate(topic, wiki=True):
    """Generate a five-line AABBA limerick about ``topic``.

    Args:
        topic: The subject of the limerick.
        wiki: When True (Method A) the seed text is a Wikipedia summary;
            when False (Method B) it is text generated by GPT-2.

    Returns:
        The limerick as five newline-terminated lines, or an explanatory
        message if no Wikipedia information could be found for Method A.
    """
    # Search for the topic on Wikipedia and get an informational summary
    if wiki:
        topic_summary = _wikipedia_summary(topic)
        if topic_summary is None:
            return(f"Method A struggled to find information about {topic}, please try a different topic!")
    # Use GPT-2 to get info about the topic if the wiki parameter is false
    else:
        topic_summary = remove_punctuation(gpt2_summary(topic))

    # Log info about the topic summary data
    word_list = topic_summary.split()
    topic_summary_len = len(topic_summary)
    no_of_words = len(word_list)
    inputs_len = get_inputs_length(topic_summary)
    print(f"Topic Summary: {topic_summary}")
    print(f"Topic Summary Length: {topic_summary_len}")
    print(f"No of Words in Summary: {no_of_words}")
    print(f"Length of Input IDs: {inputs_len}")
    # NOTE(review): the whole summary plus every generated line is fed to
    # GPT-2 as the prompt; confirm long summaries stay within the model's
    # context window.

    # Generate the first line of the limerick
    # Retry until the line is non-empty, ends in an acceptable word, and that
    # word has at least three rhymes (two are consumed by lines 2 and 5).
    while True:
        first_line = get_line(topic_summary, inputs_len)
        if not first_line:
            continue
        end_word = remove_punctuation(first_line.split()[-1])
        if not filter_rhymes(end_word):
            continue
        print(f"\nFirst Line: {first_line}")
        rhyming_words_125 = list(get_rhymes(end_word, 3))
        print(f"Rhyming words for '{end_word}' are {rhyming_words_125}")
        if len(rhyming_words_125) >= 3:
            break
    limerick = first_line + "\n"

    # Generate the second line of the limerick
    rhyming_word = random.choice(rhyming_words_125)
    rhyming_words_125.remove(rhyming_word)
    prompt = topic_summary + " " + first_line
    inputs_len = get_inputs_length(prompt)
    print(f"Prompt: {prompt}")
    print(f"Length of prompt: {inputs_len}")
    second_line = get_rhyming_line(prompt, rhyming_word, inputs_len)
    print(f"\nSecond Line: {second_line}")
    limerick += second_line + "\n"

    # Generate the third line of the limerick
    prompt = prompt + " " + second_line
    inputs_len = get_inputs_length(prompt)
    print(f"Prompt: {prompt}")
    print(f"Length of prompt: {inputs_len}")
    # Retry until the line is non-empty, ends in an acceptable word, and that
    # word has at least two rhymes (one is consumed by line 4).
    while True:
        third_line = get_line(prompt, inputs_len)
        if not third_line:
            continue
        print(f"\nThird Line: {third_line}")
        end_word = remove_punctuation(third_line.split()[-1])
        valid_rhyme = filter_rhymes(end_word)
        print(f"Does '{end_word}' have valid rhymes: {valid_rhyme}")
        rhyming_words_34 = list(get_rhymes(end_word, 3))
        print(f"Rhyming words for '{end_word}' are {rhyming_words_34}")
        if valid_rhyme and len(rhyming_words_34) > 1:
            break
    limerick += third_line + "\n"

    # Generate the fourth line of the limerick
    rhyming_word = random.choice(rhyming_words_34)
    rhyming_words_34.remove(rhyming_word)
    prompt = prompt + " " + third_line
    inputs_len = get_inputs_length(prompt)
    print(f"Prompt: {prompt}")
    print(f"Length of prompt: {inputs_len}")
    fourth_line = get_rhyming_line(prompt, rhyming_word, inputs_len)
    print(f"\nFourth Line: {fourth_line}")
    limerick += fourth_line + "\n"

    # Generate the fifth line of the limerick
    rhyming_word = random.choice(rhyming_words_125)
    rhyming_words_125.remove(rhyming_word)
    prompt = prompt + " " + fourth_line
    inputs_len = get_inputs_length(prompt)
    print(f"Prompt: {prompt}")
    print(f"Length of prompt: {inputs_len}")
    fifth_line = get_rhyming_line(prompt, rhyming_word, inputs_len)
    print(f"\nFifth Line: {fifth_line}")
    limerick += fifth_line + "\n"

    print("\n")
    print(limerick)
    return limerick


# Helper function to generate two limericks via both methods to then compare
def compare_summaries(topic):
    """Generate one limerick per method for ``topic``.

    Returns:
        A ``(method_a, method_b)`` tuple: the Wikipedia-seeded result first,
        the GPT-2-seeded result second.
    """
    wiki_limerick = generate(topic)
    gpt2_limerick = generate(topic, wiki=False)
    output1 = wiki_limerick
    output2 = gpt2_limerick
    print(output1 + "\n" + output2)
    return output1, output2


# Use Gradio to create an interface, which can be hosted on HuggingFace spaces
# https://huggingface.co/spaces/ansfarooq7/l4-project
description = "Generates limericks (five-line poems with a rhyme scheme of AABBA) using two different methods, please be patient as it can take up to a minute to generate both limericks."
# Long-form text shown with the interface. The pieces are adjacent string
# literals, so they are joined with no separator between them; parentheses
# replace the fragile backslash continuations, and the line break after
# 'Description' is written as an explicit "\n" (a raw newline inside a
# single-quoted literal does not parse).
# NOTE(review): the heading pieces run together without spaces or markup;
# presumably formatting tags were lost from this text - confirm against the
# deployed version.
article = (
    'Limerick Generation'
    'By Ans Farooq'
    'Level 4 Individual Project'
    'BSc Computing Science'
    'University of Glasgow'
    'Description\n'
    'Recent advances in natural language processing (NLP) have shown '
    'incredible promise at generating human-quality language. Poetry '
    'presents an additional challenge as it often relies on rhyme and '
    'rhythm of language. Factoring these in presents an interesting '
    'challenge to new deep learning-based methods. This text-generation '
    'project examines the use of transformer-based deep learning methods '
    'and the addition of constraints for length, rhyme and rhythm given '
    'example words to seed a poem. This interface allows you to produce two '
    'cohesive limericks automatically, using two different methods. The '
    'results of this project are to be evaluated through human comparisons.'
)

# One text input (the topic) and one text output per generation method.
gr_input = gr.inputs.Textbox(label='Topic')
gr_output1 = gr.outputs.Textbox(label='Method A')
gr_output2 = gr.outputs.Textbox(label='Method B')

interface = gr.Interface(
    fn=compare_summaries,
    inputs=gr_input,
    outputs=[gr_output1, gr_output2],
    title="Text-generation with rhyme and rhythm",
    layout="horizontal",
    theme="peach",
    description=description,
    article=article,
)
interface.launch(debug=False)