ansfarooq7 committed • f5fde7c
Parent(s): a494241
Update app.py

app.py CHANGED
@@ -1,3 +1,4 @@
+# Import required packages
 from transformers import RobertaTokenizer, RobertaForMaskedLM, GPT2Tokenizer, GPT2LMHeadModel, pipeline
 import torch
 import wikipedia
@@ -7,22 +8,26 @@ import nltk
 import gradio as gr
 nltk.download('cmudict')
 
+# Use the RoBERTa model from HuggingFace for masked language modelling
 roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
 roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base')
 
+# Use the GPT-2 from HuggingFace for causal language modelling
 gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=gpt2_tokenizer.eos_token_id)
 
+# Initialise a text generation pipeline using HuggingFace Transformers and the pre-trained GPT-2 model
 gpt2_pipeline = pipeline('text-generation', model=gpt2_model, tokenizer=gpt2_tokenizer)
 
+# Hold all the words in wordFrequency.txt in a Python set
 frequent_words = set()
-
 with open("wordFrequency.txt", 'r') as f:
     line = f.readline()
     while line != '': # The EOF char is an empty string
         frequent_words.add(line.strip())
         line = f.readline()
 
+# Used alongside the word frequency list to filter out problematic words for rhyming
 def filter_rhymes(word):
     filter_list = ['an', 'to', 'on', 'has', 'but', 'the', 'in', 'and', 'a', 'are', 'or', 'its', 'it''s']
     if word in filter_list:
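For context on the pipeline set up above, here is a minimal, self-contained sketch of how a Transformers text-generation pipeline like gpt2_pipeline is typically called. The prompt and generation parameters below are illustrative and not taken from this commit.

from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

# Same pre-trained GPT-2 checkpoint as app.py uses
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Each returned item is a dict with a 'generated_text' key
result = generator("Here is some information about limericks.", max_length=40, num_return_sequences=1)
print(result[0]["generated_text"])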
@@ -30,11 +35,16 @@ def filter_rhymes(word):
     else:
         return True
 
+# Used to remove any punctuation and new line characters from generated text
 def remove_punctuation(text):
     text = re.sub(r'[^\w\s]', '', text)
     text = text.replace("\n", " ")
     return text.strip()
 
+# Used to find rhymes to a given word using NLTK
+# where inp is a word and level means how good the rhyme should be.
+# Adapted from the following Stack Overflow answer:
+# https://stackoverflow.com/a/25714769/18559178
 def get_rhymes(inp, level):
     entries = nltk.corpus.cmudict.entries()
     syllables = [(word, syl) for word, syl in entries if word == inp]
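As a quick illustration of remove_punctuation: the regex [^\w\s] deletes every character that is neither a word character nor whitespace, and the newline replacement flattens multi-line generations into one line. A small usage example (the sample string is illustrative):

import re

def remove_punctuation(text):
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation, keep letters, digits and whitespace
    text = text.replace("\n", " ")       # flatten newlines into spaces
    return text.strip()

print(remove_punctuation("Hello, world!\nIt's a test."))  # prints: Hello world Its a test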
@@ -48,10 +58,18 @@ def get_rhymes(inp, level):
             filtered_rhymes.add(word)
     return filtered_rhymes
 
+# Used to get the length of the topic summary, to then determine max length for
+# the text generation pipeline
 def get_inputs_length(input):
     input_ids = gpt2_tokenizer(input)['input_ids']
     return len(input_ids)
-
+
+# Sized Fill-in-the-blank or Multi Mask filling with RoBERTa and Huggingface Transformers
+# Used to fill in the blank words between the starting words of each line
+# generated by GPT-2 and the end rhyming word
+# Code adapted from the following Medium article:
+# https://ramsrigoutham.medium.com/sized-fill-in-the-blank-or-multi-mask-filling-with-roberta-and-huggingface-transformers-58eb9e7fb0c
+
 def get_prediction(sent):
     token_ids = roberta_tokenizer.encode(sent, return_tensors='pt')
     masked_position = (token_ids.squeeze() == roberta_tokenizer.mask_token_id).nonzero()
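get_rhymes follows the CMU Pronouncing Dictionary approach from the Stack Overflow answer credited in the comments: two words rhyme, at a chosen strictness level, when the last `level` phonemes of their pronunciations match. A minimal sketch of that idea, without the app's frequency and filter checks (function and variable names here are illustrative):

import nltk

nltk.download('cmudict')
entries = nltk.corpus.cmudict.entries()

def simple_rhymes(inp, level):
    # All pronunciations of the input word
    syllables = [(word, syl) for word, syl in entries if word == inp]
    rhymes = []
    for (word, syllable) in syllables:
        # A word rhymes if its final `level` phonemes match the input's
        rhymes += [word for word, pron in entries if pron[-level:] == syllable[-level:]]
    return set(rhymes)

print(sorted(simple_rhymes("cat", 2))[:10])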
@@ -68,6 +86,8 @@ def get_prediction(sent):
     while not words:
         mask_hidden_state = last_hidden_state[mask_index]
         idx = torch.topk(mask_hidden_state, k=5, dim=0)[1]
+
+        # Discard predicted word if it is blank or end token
         for i in idx:
             word = roberta_tokenizer.decode(i.item()).strip()
             if (remove_punctuation(word) != "") and (word != '</s>'):
@@ -80,7 +100,9 @@ def get_prediction(sent):
         best_guess = best_guess+" "+j[0]
 
     return best_guess
-
+
+# Used to generate the 1st and 3rd lines of the limerick
+# these are full lines, without RoBERTa being used
 def get_line(prompt, inputs_len):
     output = gpt2_pipeline(
         prompt + ".",
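get_prediction applies the sized fill-in-the-blank technique from the Medium article credited above: place one <mask> token per word to fill, run the masked language model once, and pick high-scoring tokens at each masked position. A simplified, self-contained sketch of that technique (the sentence and variable names are illustrative, and it keeps only the single top token per mask rather than the app's top-5 filtering):

import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')

sent = "The cat <mask> on the <mask>."
token_ids = tokenizer.encode(sent, return_tensors='pt')

# Positions of every <mask> token in the encoded sentence
masked_positions = (token_ids.squeeze() == tokenizer.mask_token_id).nonzero().flatten().tolist()

with torch.no_grad():
    logits = model(token_ids).logits.squeeze(0)

# Keep the single best token at each masked position
filled = []
for pos in masked_positions:
    top_id = torch.topk(logits[pos], k=1, dim=0).indices.item()
    filled.append(tokenizer.decode([top_id]).strip())

print(filled)  # output depends on the model, e.g. ['sat', 'floor']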
@@ -91,6 +113,9 @@ def get_line(prompt, inputs_len):
     )
     return remove_punctuation(output[0]['generated_text'])
 
+# Used to generate the 2nd, 4th and 5th lines
+# GPT-2 is used to generate starting few words of the lines
+# RoBERTa is then used to fill in the rest of the words until the end rhyme word
 def get_rhyming_line(prompt, rhyming_word, inputs_len):
     output = gpt2_pipeline(
         prompt + ".",
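The keyword arguments of these gpt2_pipeline calls fall outside the visible hunks, but the comment on get_inputs_length describes the general pattern: measure the prompt in GPT-2 tokens, then set max_length relative to that so only a short continuation is generated. A hedged sketch of that pattern, reusing the gpt2_tokenizer and gpt2_pipeline objects defined at the top of app.py (the +10 budget and the other arguments are assumptions, not values from this commit):

# Assumes gpt2_tokenizer and gpt2_pipeline from the top of app.py are in scope
def prompt_length_in_tokens(text):
    return len(gpt2_tokenizer(text)['input_ids'])

prompt = "There once was a cat from Peru"
inputs_len = prompt_length_in_tokens(prompt)

output = gpt2_pipeline(
    prompt + ".",
    max_length=inputs_len + 10,   # assumed budget: a few tokens beyond the prompt
    num_return_sequences=1,
    return_full_text=False)       # keep only the newly generated continuation
print(output[0]['generated_text'])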
@@ -126,6 +151,8 @@ def get_rhyming_line(prompt, rhyming_word, inputs_len):
     print(f"Final Sentence: {final_sentence}")
     return final_sentence
 
+# Used for the second method, Method B, of limerick generation
+# Uses GPT-2 to get information about the user's given topic
 def gpt2_summary(topic):
     output = gpt2_pipeline(
         f"Here is some information about {topic}.",
@@ -135,8 +162,11 @@ def gpt2_summary(topic):
         return_full_text=False
     )
     return remove_punctuation(output[0]['generated_text'])
-
+
+# Main logic for limerick generation is contained here
 def generate(topic, wiki=True):
+
+    # Search for the topic on Wikipedia and get an informational summary
     if wiki:
         try:
             topic_search = wikipedia.search(topic, results=3)
@@ -148,9 +178,12 @@ def generate(topic, wiki=True):
             topic_summary = remove_punctuation(wikipedia.summary(page, auto_suggest=False))
         except:
             return(f"Method A struggled to find information about {topic}, please try a different topic!")
+
+    # Use GPT-2 to get info about the topic if the wiki parameter is false
     else:
         topic_summary = remove_punctuation(gpt2_summary(topic))
 
+    # Log info about the topic summary data
     word_list = topic_summary.split()
     topic_summary_len = len(topic_summary)
     no_of_words = len(word_list)
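Method A uses the wikipedia package: search for candidate pages, then fetch a plain-text summary with auto_suggest=False so the query is not silently rewritten. A minimal sketch of that flow; picking the first search hit is an assumption here, since the app's actual page-selection logic sits in lines not shown in this hunk:

import wikipedia

topic = "limerick"
try:
    topic_search = wikipedia.search(topic, results=3)   # up to three candidate page titles
    page = topic_search[0]                               # assumption: take the first hit
    topic_summary = wikipedia.summary(page, auto_suggest=False)
    print(topic_summary[:200])
except Exception:
    print(f"Could not find information about {topic}, please try a different topic!")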
@@ -160,6 +193,7 @@
     print(f"No of Words in Summary: {no_of_words}")
     print(f"Length of Input IDs: {inputs_len}")
 
+    # Generate the first line of the limerick
     rhyming_words_125 = []
     while len(rhyming_words_125) < 3 or valid_rhyme == False or len(first_line) == 0:
         first_line = get_line(topic_summary, inputs_len)
@@ -172,6 +206,7 @@
     print(f"Rhyming words for '{end_word}' are {rhyming_words_125}")
     limerick = first_line + "\n"
 
+    # Generate the second line of the limerick
     rhyming_word = random.choice(rhyming_words_125)
     rhyming_words_125.remove(rhyming_word)
     prompt = topic_summary + " " + first_line
@@ -182,6 +217,7 @@
     print(f"\nSecond Line: {second_line}")
     limerick += second_line + "\n"
 
+    # Generate the third line of the limerick
     rhyming_words_34 = []
     prompt = prompt + " " + second_line
     inputs_len = get_inputs_length(prompt)
@@ -199,6 +235,7 @@
         if valid_rhyme and len(rhyming_words_34) > 1:
             limerick += third_line + "\n"
 
+    # Generate the fourth line of the limerick
     rhyming_word = random.choice(rhyming_words_34)
     rhyming_words_34.remove(rhyming_word)
     prompt = prompt + " " + third_line
@@ -209,6 +246,7 @@
     print(f"\nFourth Line: {fourth_line}")
     limerick += fourth_line + "\n"
 
+    # Generate the fifth line of the limerick
     rhyming_word = random.choice(rhyming_words_125)
     rhyming_words_125.remove(rhyming_word)
     prompt = prompt + " " + fourth_line
@@ -223,7 +261,8 @@
     print(limerick)
 
     return limerick
-
+
+# Helper function to generate two limericks via both methods to then compare
 def compare_summaries(topic):
     wiki_limerick = generate(topic)
     gpt2_limerick = generate(topic, wiki=False)
@@ -233,8 +272,11 @@
     print(output1 + "\n" + output2)
 
     return output1, output2
-
-
+
+# Use Gradio to create an interface, which can be hosted on HuggingFace spaces
+# https://huggingface.co/spaces/ansfarooq7/l4-project
+
+description = "Generates limericks (five-line poems with a rhyme scheme of AABBA) using two different methods, please be patient as it can take up to a minute to generate both limericks."
 article = '<center><big><strong>Limerick Generation</strong></big></center>'\
           '<center><strong>By Ans Farooq</strong></center>'\
           '<center><small>Level 4 Individual Project</small></center>'\
@@ -265,4 +307,4 @@ interface = gr.Interface(
     theme="peach",
     description=description,
     article=article)
-interface.launch(debug=
+interface.launch(debug=False)
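Only the theme, description and article arguments of the gr.Interface call are visible in this final hunk. As a rough sketch of how compare_summaries is typically wired into a two-output Gradio interface; the input and output component choices below are assumptions, not the app's exact configuration:

import gradio as gr

interface = gr.Interface(
    fn=compare_summaries,         # helper defined earlier in app.py
    inputs="text",                # topic entered by the user (assumed component)
    outputs=["text", "text"],     # one limerick per method (assumed components)
    theme="peach",
    description=description,
    article=article)
interface.launch(debug=False)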