Spaces:

ansfarooq7
/

l4-project

Sleeping

File size: 9,176 Bytes

from transformers import RobertaTokenizer, RobertaForMaskedLM, GPT2Tokenizer, GPTNeoForCausalLM
import torch
import wikipedia
import re
import random
import nltk
import syllables
from aitextgen import aitextgen
nltk.download('cmudict')

# RoBERTa tokenizer and masked-LM head; get_prediction uses them to fill <mask> slots.
masked_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
masked_model = RobertaForMaskedLM.from_pretrained('roberta-base')

# GPT-2 tokenizer, used by get_inputs_length to count the tokens of a prompt.
causal_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-Neo 1.3B tokenizer and model, used by gptneo_summary to write a topic summary.
gptneo_tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
gptneo_model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

# Without any parameters, aitextgen() will download, cache, and load the 124M GPT-2 "small" model
gpt2 = aitextgen()

# Filled from wordFrequency.txt below; get_rhymes only keeps rhymes that appear in this set.
frequent_words = set()

def set_seed(seed: int):
    """
    Seed PyTorch's random number generators for reproducible behavior.

    Only ``torch`` is seeded (the default generator plus every CUDA device).
    Python's ``random`` module and any other framework are left untouched.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    torch.manual_seed(seed)
    # Safe to call even if CUDA is not available.
    torch.cuda.manual_seed_all(seed)
        
# Populate frequent_words with one whitespace-stripped entry per line of the file.
with open("wordFrequency.txt", 'r') as f:
    frequent_words.update(entry.strip() for entry in f)

def filter_rhymes(word):
    """Return False if *word* is on the hard-coded exclusion list, True otherwise.

    The comparison is an exact, case-sensitive string match.
    """
    excluded = (
        'to', 'on', 'has', 'but', 'the', 'in', 'and', 'a', 'aitch', 'angst',
        'arugula', 'beige', 'blitzed', 'boing', 'bombed', 'cairn', 'chaos',
        'chocolate', 'circle', 'circus', 'cleansed', 'coif', 'cusp', 'doth',
        'else', 'eth', 'fiends', 'film', 'flange', 'fourths', 'grilse', 'gulf',
        'kiln', 'loge', 'midst', 'month', 'music', 'neutron', 'ninja', 'oblige',
        'oink', 'opus', 'orange', 'pint', 'plagued', 'plankton', 'plinth',
        'poem', 'poet', 'purple', 'quaich', 'rhythm', 'rouged', 'silver',
        'siren', 'soldier', 'sylph', 'thesp', 'toilet', 'torsk', 'tufts',
        'waltzed', 'wasp', 'wharves', 'width', 'woman', 'yttrium',
    )
    return word not in excluded

def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace,
    then turn each newline into a single space."""
    without_punct = re.sub(r'[^\w\s]', '', text)
    return " ".join(without_punct.split("\n"))

def get_rhymes(inp, level):
    """Return the set of frequent words that rhyme with *inp*.

    A word rhymes when the last *level* symbols of one of its CMUdict
    pronunciations equal the last *level* symbols of one of *inp*'s
    pronunciations. Only words present in ``frequent_words`` are kept, and
    *inp* itself is excluded.
    """
    entries = nltk.corpus.cmudict.entries()

    # Pronunciation endings of every dictionary entry spelled exactly like inp.
    target_endings = [pron[-level:] for word, pron in entries if word == inp]

    candidates = []
    for ending in target_endings:
        candidates.extend(word for word, pron in entries if pron[-level:] == ending)

    return {word for word in candidates if word in frequent_words and word != inp}

def get_inputs_length(input):
    """Return how many token ids ``causal_tokenizer`` produces for *input*."""
    encoded = causal_tokenizer(input)
    return len(encoded['input_ids'])
       
# Seed torch's random number generators with 0 once, when the module is loaded.
set_seed(0)
    
def get_prediction(sent):
    """Fill every ``<mask>`` token in *sent* using the RoBERTa masked LM.

    For each mask position the highest-scoring vocabulary tokens are decoded
    and those that are empty after punctuation removal, or equal to ``</s>``,
    are discarded. The candidates for each mask are printed.

    Args:
        sent: Sentence containing one or more ``<mask>`` tokens.

    Returns:
        The best guess for each mask, each preceded by a single space and
        concatenated in order (an empty string when *sent* has no mask).

    Raises:
        ValueError: If no usable token exists for a mask position even after
            considering the whole vocabulary.
    """
    token_ids = masked_tokenizer.encode(sent, return_tensors='pt')
    masked_position = (token_ids.squeeze() == masked_tokenizer.mask_token_id).nonzero()
    masked_pos = [mask.item() for mask in masked_position]

    with torch.no_grad():
        output = masked_model(token_ids)

    # First model output: one row of vocabulary scores per input position.
    scores = output[0].squeeze()

    list_of_list = []
    for index, mask_index in enumerate(masked_pos):
        mask_scores = scores[mask_index]
        vocab_size = mask_scores.shape[0]
        k = min(5, vocab_size)
        while True:
            idx = torch.topk(mask_scores, k=k, dim=0)[1]
            words = []
            for i in idx:
                word = masked_tokenizer.decode(i.item()).strip()
                if (remove_punctuation(word) != "") and (word != '</s>'):
                    words.append(word)
            if words or k >= vocab_size:
                break
            # Every candidate was filtered out: widen the search instead of
            # re-running the same top-5 query forever.
            k = min(k * 2, vocab_size)
        if not words:
            raise ValueError(f"No usable prediction for mask {index + 1}")
        list_of_list.append(words)
        print(f"Mask {index+1} Guesses: {words}")

    # The leading space before each word is relied on by get_rhyming_line.
    return "".join(" " + words[0] for words in list_of_list)
    
def get_line(prompt, inputs_len):
    """Generate a continuation of *prompt* with the GPT-2 model.

    The model is prompted with ``prompt + "."`` and allowed
    ``inputs_len + 7`` tokens in total; the first ``len(prompt) + 2``
    characters of the generated text are dropped before returning.
    """
    generated = gpt2.generate_one(prompt=prompt + ".", max_length=inputs_len + 7)
    skip = len(prompt) + 2
    return generated[skip:]

def get_rhyming_line(prompt, rhyming_word, inputs_len):
    """Build a line that continues *prompt* and ends with *rhyming_word*.

    GPT-2 supplies the opening of the line, three masked slots are placed
    between that opening and *rhyming_word*, and ``get_prediction`` fills the
    slots. Progress is printed along the way.

    Returns:
        The opening text, the predicted filler words and *rhyming_word*.
    """
    generated = gpt2.generate_one(prompt=prompt + ".", max_length=inputs_len + 4)
    gpt2_sentence = generated[len(prompt)+2:].replace("\n", "")
    print(f"\nGetting rhyming line starting with '{gpt2_sentence}' and ending with rhyming word '{rhyming_word}'")

    template = gpt2_sentence + " ___ ___ ___ " + rhyming_word
    print(f"Original Sentence: {template}")

    # Swap the blanks for mask tokens and make sure the sentence ends in a period.
    masked_sentence = template.replace("___","<mask>")
    if template[-1] != ".":
        masked_sentence += "."
    print(f"Original Sentence replaced with mask: {masked_sentence}")
    print("\n")

    predicted_blanks = get_prediction(masked_sentence)
    print(f"\nBest guess for fill in the blanks: {predicted_blanks}")

    final_sentence = " ".join([gpt2_sentence + predicted_blanks, rhyming_word])
    print(f"Final Sentence: {final_sentence}")
    return final_sentence

def gptneo_summary(topic):
    """Sample text about *topic* from the GPT-Neo model.

    The model is prompted with "Here is some information about <topic>" and
    sampled (temperature 0.9) up to a total length of 200 tokens. The decoded
    first generated sequence is returned.
    """
    seed_text = f"Here is some information about {topic}"
    encoded = gptneo_tokenizer(seed_text, return_tensors="pt")
    gen_tokens = gptneo_model.generate(
        encoded.input_ids,
        do_sample=True,
        temperature=0.9,
        max_length=200,
    )
    return gptneo_tokenizer.decode(gen_tokens[0])
    
def _extend_prompt(prompt, line):
    """Append *line* to *prompt*, log the result, and return ``(prompt, token_count)``."""
    prompt = prompt + " " + line
    inputs_len = get_inputs_length(prompt)
    print(f"Prompt: {prompt}")
    print(f"Length of prompt: {inputs_len}")
    return prompt, inputs_len


def _end_word(line):
    """Return the last word of *line* without punctuation, lower-cased.

    Lower-casing matters because the rhyme lookup and the exclusion list in
    filter_rhymes both compare against lowercase words. *line* must contain
    at least one non-whitespace word.
    """
    return remove_punctuation(line.split()[-1]).lower()


def generate(topic, wiki=True):
    """Generate a five-line limerick about *topic*.

    A summary of the topic (from Wikipedia when *wiki* is true, otherwise
    from GPT-Neo) is used as the prompt. Lines 1 and 3 are generated freely
    and regenerated until their last word passes ``filter_rhymes`` and has
    enough rhymes; lines 2 and 5 end with rhymes of line 1, line 4 ends with
    a rhyme of line 3. Progress is printed throughout.

    Args:
        topic: Subject of the limerick.
        wiki: Use the Wikipedia summary as prompt if true, else a GPT-Neo one.

    Returns:
        The limerick as a string with each line terminated by a newline.
    """
    if wiki:
        topic_summary = remove_punctuation(wikipedia.summary(topic))
    else:
        topic_summary = remove_punctuation(gptneo_summary(topic))
    word_list = topic_summary.split()
    topic_summary_len = len(topic_summary)
    no_of_words = len(word_list)
    inputs_len = get_inputs_length(topic_summary)
    print(f"Topic Summary: {topic_summary}")
    print(f"Topic Summary Length: {topic_summary_len}")
    print(f"No of Words in Summary: {no_of_words}")
    print(f"Length of Input IDs: {inputs_len}")

    # First line: retry until its end word is allowed and has at least 3 rhymes.
    rhyming_words_125 = []
    while len(rhyming_words_125) < 3:
        first_line = get_line(topic_summary, inputs_len)
        # A blank or whitespace-only line has no end word; generate another.
        if not first_line.split():
            continue
        end_word = _end_word(first_line)
        if not filter_rhymes(end_word):
            continue
        print(f"\nFirst Line: {first_line}")
        rhyming_words_125 = list(get_rhymes(end_word, 3))
        print(f"Rhyming words for '{end_word}' are {rhyming_words_125}")
    limerick = first_line + "\n"

    prompt, inputs_len = _extend_prompt(topic_summary, first_line)
    second_line = get_rhyming_line(prompt, rhyming_words_125[0], inputs_len)
    print(f"\nSecond Line: {second_line}")
    limerick += second_line + "\n"

    # Third line: retry until its end word is allowed and has at least 2 rhymes.
    prompt, inputs_len = _extend_prompt(prompt, second_line)
    while True:
        third_line = get_line(prompt, inputs_len)
        if not third_line.split():
            continue
        print(f"\nThird Line: {third_line}")
        end_word = _end_word(third_line)
        valid_rhyme = filter_rhymes(end_word)
        print(f"Does '{end_word}' have valid rhymes: {valid_rhyme}")
        rhyming_words_34 = list(get_rhymes(end_word, 3))
        print(f"Rhyming words for '{end_word}' are {rhyming_words_34}")
        if valid_rhyme and len(rhyming_words_34) > 1:
            break
    limerick += third_line + "\n"

    prompt, inputs_len = _extend_prompt(prompt, third_line)
    fourth_line = get_rhyming_line(prompt, rhyming_words_34[0], inputs_len)
    print(f"\nFourth Line: {fourth_line}")
    limerick += fourth_line + "\n"

    prompt, inputs_len = _extend_prompt(prompt, fourth_line)
    fifth_line = get_rhyming_line(prompt, rhyming_words_125[1], inputs_len)
    print(f"\nFifth Line: {fifth_line}")
    limerick += fifth_line + "\n"

    print("\n")
    print(limerick)

    return limerick

def compare_summaries(topic):
    """Generate two limericks about *topic* and return them in one labelled string.

    The first limerick is prompted with the Wikipedia summary, the second with
    a GPT-Neo generated summary.
    """
    wiki_limerick = generate(topic, wiki=True)
    gptneo_limerick = generate(topic, wiki=False)

    sections = [
        "Limerick with Wikipedia summary of topic as prompt: \n",
        wiki_limerick + "\n",
        "Limerick with GPT Neo summary of topic as prompt: \n",
        gptneo_limerick,
    ]
    return "".join(sections)
      
import gradio as gr

# Wrap compare_summaries in a Gradio UI with one text input (the topic) and
# one text output (the two limericks), then start serving it.
interface = gr.Interface(
    fn=compare_summaries, 
    inputs="text", 
    outputs="text")
interface.launch(debug=True)