File size: 1,345 Bytes
e6d0f4c
9eb4d18
3eec261
e6d0f4c
 
9eb4d18
 
3d3dd88
9eb4d18
 
e6d0f4c
 
 
 
d7989d6
e6d0f4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9eb4d18
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import re
from extractdata import extract_text
from wordcloudplot import plot_wordcloud
from transformers import  AutoModelForSeq2SeqLM, AutoTokenizer

def summarize(input_, model):
  if input_.split("/")[0] == "https:":
    text = extract_text(input_)
  else:
    text = input_

  if model == "T5":
    checkpoint = "csebuetnlp/mT5_multilingual_XLSum"
  elif model == "BART":
    checkpoint = "ai4bharat/IndicBART-XLSum"

  WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))

  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


  input_ids = tokenizer(
                  [WHITESPACE_HANDLER(text)],
                  return_tensors="pt",
                  padding="max_length",
                  truncation=True,
                  max_length=512 )["input_ids"]

  output_ids = model.generate(
                    input_ids=input_ids,
                    max_length=70,
                    min_length=30,
                    no_repeat_ngram_size=2,
                    num_beams=4 )[0]     

  
  summary = tokenizer.decode(
                      output_ids,
                      skip_special_tokens=True,
                      clean_up_tokenization_spaces=False)
                      
  figure = plot_wordcloud(text)
  
  return summary, figure