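# Paraphrase-2 / app.py
# Hugging Face Space page header captured with the file (not part of the program):
#   jaimin's picture - "Update app.py" - commit 3cd6437
#   raw | history | blame | contribute | delete - No virus - 2.83 kB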
import pandas as pd
import numpy as np
import re
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
def read_in_text(url):
    """Return the entire contents of the text file at ``url``.

    Despite the parameter name, ``url`` is handed directly to ``open`` and
    is therefore treated as a filesystem path. The file is opened in text
    mode for reading.
    """
    with open(url, 'r') as handle:
        # Read inside the context manager so the handle is closed on exit.
        return handle.read()
def clean_text(url):
    """Lower-case the given text and strip byline/date stamps from it.

    Despite the parameter name, ``url`` is the raw text to clean; it is
    never treated as a URL.

    After lower-casing, two byline shapes are removed:
      * ``by <names> - dd/dd/dd h:mm xx xx``
        (for example ``by jo smith - 01/02/20 10:30 am et``)
      * ``by <names> dd, dddd``
        (for example ``by jo smith january 05, 2020``)

    Returns:
        The lower-cased text with every matching byline removed.
    """
    # converting the text to all lower case
    text = url.lower()
    # removing the dates, time and name of author.
    # Raw string literals are used so that regex escapes such as \s, \d and
    # \w reach the regex engine untouched instead of being parsed as
    # (invalid) Python string escape sequences.
    byline_pattern = (
        r'(by[\s\w,|]+ - \d\d\/\d\d\/\d\d\s\d+:\d+\s\w{2}\s\w{2})'
        r'|(by[\s\w|,]+\d\d,\s\d{4})'
    )
    return re.sub(byline_pattern, "", text)
# Use the GPU when PyTorch reports one is available; otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print ("device ",device)
# Both checkpoints are loaded once at import time and moved to `device`, so
# that either model can be fed tensors that live on that same device.
T5_model = AutoModelForSeq2SeqLM.from_pretrained("jaimin/T5-Large").to(device)
T5_tokenizer = AutoTokenizer.from_pretrained("jaimin/T5-Large")
pegasus_model = PegasusForConditionalGeneration.from_pretrained('jaimin/pegasus').to(device)
pegasus_tokenizer = PegasusTokenizer.from_pretrained('jaimin/pegasus')
# Diverse Beam search
def my_paraphrase(sentence, model, tokenizer,beams):
    """Paraphrase one sentence using diverse (grouped) beam search.

    Args:
        sentence: The text to paraphrase. It is prefixed with
            ``"paraphrase: "`` and suffixed with ``" </s>"`` before encoding.
        model: A seq2seq model exposing ``eval()``, ``generate()`` and a
            ``device`` attribute.
        tokenizer: The tokenizer paired with ``model``.
        beams: Total number of beams. Generation uses 5 beam groups and asks
            for 5 returned sequences, so this presumably has to be a
            multiple of 5 and at least 5 -- confirm against the generation
            API of the installed transformers version.

    Returns:
        The decoded text of the first returned sequence, with special tokens
        removed. The other returned sequences are discarded.
    """
    text = "paraphrase: " + sentence + " </s>"
    encoding = tokenizer.encode_plus(text, padding=True, return_tensors="pt", truncation=True)
    # Send the inputs to the device the model's own weights are on, rather
    # than to a module-level setting, so model and inputs can never end up
    # on different devices.
    target_device = model.device
    input_ids = encoding["input_ids"].to(target_device)
    attention_mask = encoding["attention_mask"].to(target_device)
    model.eval()
    diverse_beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=512,
        early_stopping=True,
        num_beams=beams,
        num_beam_groups=5,
        num_return_sequences=5,
        diversity_penalty=0.70,
    )
    # Only the first returned sequence is used.
    return tokenizer.decode(
        diverse_beam_outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
def return_output(file, models,beams):
    """Paraphrase the given text sentence by sentence with the chosen model.

    Args:
        file: The input text itself (a string, despite the parameter name).
        models: Which model to use; must be ``'T5'`` or ``'Pegasus'``.
        beams: Number of beams, forwarded unchanged to ``my_paraphrase``.

    Returns:
        The paraphrased sentences joined by single spaces, with any
        ``'paraphrasedoutput:'`` marker removed and ``'.<n>'`` sequences
        turned into a period followed by a newline.

    Raises:
        ValueError: If ``models`` is not one of the supported names (for
            example ``None`` when no model has been selected).
    """
    # Resolve the model first so an unsupported choice fails immediately with
    # a clear message instead of an unbound-variable error later on.
    if models == 'T5':
        model, tokenizer = T5_model, T5_tokenizer
    elif models == 'Pegasus':
        model, tokenizer = pegasus_model, pegasus_tokenizer
    else:
        raise ValueError(
            f"Unsupported model {models!r}; expected 'T5' or 'Pegasus'."
        )

    sentence = clean_text(file)
    paraphrased = [
        my_paraphrase(sent, model, tokenizer, beams)
        for sent in sent_tokenize(sentence)
    ]
    output = " ".join(paraphrased)
    # Strip marker text and the '<n>' newline placeholder from the output.
    new_output = output.replace('paraphrasedoutput:', "")
    new_output = new_output.replace('.<n>', '.\n')
    return new_output
# Gradio UI wiring: three inputs (text, model choice, beam count) are passed
# positionally to return_output, and its return value is shown in one textbox.
# NOTE(review): this mixes the `gr.inputs` / `gr.outputs` namespaces with the
# top-level `gr.Slider` component -- confirm the installed Gradio version
# accepts every keyword used below.
demo = gr.Interface(
    fn=return_output,
    inputs=[
        gr.inputs.Textbox(label="Text", optional=False),
        gr.inputs.Dropdown(
            ['Pegasus', 'T5'],
            type="value",
            default=None,
            label="Models",
            optional=False,
        ),
        # Range 5..25 in steps of 5.
        gr.Slider(
            label="Number of Beams",
            minimum=5,
            maximum=25,
            step=5,
            randomize=True,
            type="value",
            default=5,
            optional=False,
        ),
    ],
    outputs=[gr.outputs.Textbox(label="Summary")],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()