File size: 2,834 Bytes
0eae565
 
 
 
 
 
 
232ad72
 
 
 
0eae565
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ee0e48
 
 
 
 
 
0eae565
232ad72
0eae565
 
 
 
 
 
 
 
 
 
232ad72
0eae565
 
 
 
 
 
 
232ad72
0eae565
 
 
 
 
3ee0e48
 
0eae565
 
3ee0e48
 
0eae565
232ad72
0eae565
 
 
 
 
1eb1f5f
3cd6437
0eae565
 
 
5bfd0c2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
import numpy as np
import re
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

import torch

def read_in_text(url):
  """Read the whole text file at path *url* and return its contents as a string.

  Explicit UTF-8 so the result does not depend on the platform's default
  locale encoding.
  """
  with open(url, 'r', encoding='utf-8') as file:
    return file.read()
    
def clean_text(url):
  """Lower-case the article text *url* and strip author/date bylines.

  Removes patterns such as "by john doe - 01/02/21 5:30 pm et" and
  "by john doe june 12, 2020". (Parameter is named *url* for
  backward-compatibility with existing callers, but it receives raw text.)
  """
  text = url.lower()

  # Raw string: the original non-raw pattern relied on invalid escape
  # sequences (\s, \d, \w) that Python will eventually reject.
  text = re.sub(
      r'(by[\s\w,|]+ - \d\d/\d\d/\d\d\s\d+:\d+\s\w{2}\s\w{2})|(by[\s\w|,]+\d\d,\s\d{4})',
      "",
      text,
  )
  return text
  
# Run on GPU when available; the Pegasus model is moved to this device below,
# and my_paraphrase() moves its input tensors to the same device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print ("device ",device)

# NOTE(review): the T5 model is NOT moved to `device` here, unlike Pegasus —
# on a CUDA machine T5 inputs are sent to the GPU while the model stays on
# CPU, which would error at generate(); confirm intended only for CPU use.
T5_model = AutoModelForSeq2SeqLM.from_pretrained("jaimin/T5-Large")
T5_tokenizer = AutoTokenizer.from_pretrained("jaimin/T5-Large")

pegasus_model = PegasusForConditionalGeneration.from_pretrained('jaimin/pegasus').to(device)
pegasus_tokenizer = PegasusTokenizer.from_pretrained('jaimin/pegasus')

# Diverse Beam search
def my_paraphrase(sentence, model, tokenizer, beams):
  """Paraphrase one *sentence* with diverse beam search.

  Prefixes the T5-style "paraphrase:" task marker, generates 5 candidate
  sequences in 5 beam groups, and returns only the top-ranked decode.
  `beams` must be a multiple of `num_beam_groups` (5) for generate().
  """
  text = "paraphrase: " + sentence + " </s>"
  encoding = tokenizer.encode_plus(text, padding=True, return_tensors="pt", truncation=True)
  input_ids = encoding["input_ids"].to(device)
  attention_mask = encoding["attention_mask"].to(device)

  model.eval()
  # Inference only: no_grad avoids building the autograd graph, saving
  # memory and time (the original tracked gradients needlessly).
  with torch.no_grad():
    diverse_beam_outputs = model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_length=512,
      early_stopping=True,
      num_beams=beams,
      num_beam_groups=5,
      num_return_sequences=5,
      diversity_penalty=0.70,
    )
  # Only the first (highest-scoring) of the 5 returned sequences is used.
  sent = tokenizer.decode(diverse_beam_outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
  return sent
  
def return_output(file, models, beams):
  """Paraphrase *file* (raw text) sentence by sentence with the chosen model.

  models: 'T5' or 'Pegasus' (the Gradio dropdown values).
  beams:  beam count forwarded to my_paraphrase().
  Raises ValueError for an unknown model name.
  """
  sentence = clean_text(file)

  if models == 'T5':
    model, tokenizer = T5_model, T5_tokenizer
  elif models == 'Pegasus':
    model, tokenizer = pegasus_model, pegasus_tokenizer
  else:
    # Previously an unknown choice fell through to an UnboundLocalError on
    # `model`; fail early with a clear message instead.
    raise ValueError(f"Unknown model choice: {models!r} (expected 'T5' or 'Pegasus')")

  output = " ".join(my_paraphrase(sent, model, tokenizer, beams) for sent in sent_tokenize(sentence))
  # Strip the model's prefix artifact and restore newlines after sentences.
  new_output = output.replace('paraphrasedoutput:', "")
  new_output = new_output.replace('.<n>', '.\n')
  return new_output
  
# The file already uses the gradio>=3 component API (gr.Slider above was
# called directly); this normalizes the deprecated gr.inputs/gr.outputs
# namespace and drops kwargs gr.Slider does not accept (`type`, `optional`)
# plus the `randomize=True` flag that contradicted a fixed default of 5.
demo = gr.Interface(
    return_output,
    inputs=[
        gr.Textbox(label="Text"),
        gr.Dropdown(['Pegasus', 'T5'], label="Models"),
        gr.Slider(label="Number of Beams", minimum=5, maximum=25, step=5, value=5),
    ],
    outputs=gr.Textbox(label="Summary"),
)

if __name__ == "__main__":
    demo.launch()