import re

import gradio as gr
import nltk
import torch
from nltk.tokenize import sent_tokenize
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    PegasusForConditionalGeneration,
    PegasusTokenizer,
)

nltk.download('punkt')


def read_in_text(path):
    """Read the contents of a text file."""
    with open(path, 'r') as file:
        return file.read()


def clean_text(text):
    """Lowercase the text and strip byline/date/time patterns."""
    text = text.lower()
    # Remove author bylines followed by a date and time, e.g.
    # "by john doe - 01/02/21 10:30 am et" or "by john doe january 01, 2021"
    text = re.sub(
        r'(by[\s\w,|]+ - \d\d/\d\d/\d\d\s\d+:\d+\s\w{2}\s\w{2})|(by[\s\w|,]+\d\d,\s\d{4})',
        "",
        text,
    )
    return text


device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device", device)

# Load both paraphrasing models and their tokenizers once at startup.
T5_model = AutoModelForSeq2SeqLM.from_pretrained("jaimin/T5-Large").to(device)
T5_tokenizer = AutoTokenizer.from_pretrained("jaimin/T5-Large")

pegasus_model = PegasusForConditionalGeneration.from_pretrained('jaimin/pegasus').to(device)
pegasus_tokenizer = PegasusTokenizer.from_pretrained('jaimin/pegasus')


def my_paraphrase(sentence, model, tokenizer, beams):
    """Paraphrase a single sentence using diverse beam search."""
    text = "paraphrase: " + sentence + " "
    encoding = tokenizer.encode_plus(text, padding=True, return_tensors="pt", truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    model.eval()
    diverse_beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=512,
        early_stopping=True,
        num_beams=beams,
        num_beam_groups=5,        # num_beams must be divisible by num_beam_groups
        num_return_sequences=5,
        diversity_penalty=0.70,
    )
    # Keep only the first (highest-scoring) candidate.
    return tokenizer.decode(
        diverse_beam_outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )


def return_output(text, models, beams):
    """Clean the input text, paraphrase it sentence by sentence, and format the result."""
    sentence = clean_text(text)

    if models == 'T5':
        model = T5_model
        tokenizer = T5_tokenizer
    elif models == 'Pegasus':
        model = pegasus_model
        tokenizer = pegasus_tokenizer
    else:
        raise ValueError("Please select a model: 'Pegasus' or 'T5'.")

    output = " ".join(
        my_paraphrase(sent, model, tokenizer, beams) for sent in sent_tokenize(sentence)
    )
    new_output = output.replace('paraphrasedoutput:', "")
    new_output = new_output.replace('.', '.\n')
    return new_output


demo = gr.Interface(
    return_output,
    inputs=[
        gr.inputs.Textbox(label="Text", optional=False),
        gr.inputs.Dropdown(['Pegasus', 'T5'], type="value", default=None, label="Models", optional=False),
        gr.inputs.Slider(minimum=5, maximum=25, step=5, default=5, label="Number of Beams"),
    ],
    outputs=[gr.outputs.Textbox(label="Summary")],
)

if __name__ == "__main__":
    demo.launch()