d0r1h committed
Commit 9eb4d18 (parent: c798528)

Update summarizer.py


Updated with a word cloud and an input option that accepts both a link and plain text.

Files changed (1)
  1. summarizer.py  +12 −5
summarizer.py CHANGED
@@ -1,7 +1,13 @@
 import re
+from extractdata import extract_text
+from wordcloud import plot_wordcloud
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 
-def summarize(text, model):
+def summarize(input_, model):
+    if input_.split("/")[0] == "https:":
+        text = extract_text(input_)
+    else:
+        text = input_
 
     if model == "T5":
         checkpoint = "csebuetnlp/mT5_multilingual_XLSum"
@@ -10,7 +16,6 @@ def summarize(text, model):
 
     WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
 
-
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
 
@@ -33,6 +38,8 @@ def summarize(text, model):
     summary = tokenizer.decode(
         output_ids,
         skip_special_tokens=True,
-        clean_up_tokenization_spaces=False )
-
-    return summary
+        clean_up_tokenization_spaces=False)
+
+    figure = plot_wordcloud(text)
+
+    return summary, figure
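
For context, a minimal usage sketch of the updated function. It assumes extractdata.extract_text and wordcloud.plot_wordcloud are this repo's own helpers as imported in the diff, and that plot_wordcloud returns a matplotlib figure; those details are assumptions, not verified against the rest of the repo.

    from summarizer import summarize

    # Plain-text input is summarized directly.
    summary, figure = summarize(
        "Transformers are a family of neural networks built around self-attention ...",
        model="T5",
    )
    print(summary)

    # Link input: anything whose first "/"-separated token is "https:" is
    # routed through extract_text before summarization.
    summary, figure = summarize(
        "https://en.wikipedia.org/wiki/Machine_learning",
        model="T5",
    )

    # Assuming plot_wordcloud returns a matplotlib figure, it can be saved.
    figure.savefig("wordcloud.png")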