sdhanabal1 commited on
Commit
c98407b
1 parent: c2b444a

Revert to nltk word tokenizer

Browse files
Files changed (1) hide show
  1. Summarizer.py +3 -3
Summarizer.py CHANGED
@@ -1,4 +1,5 @@
1
  import string
 
2
 
3
  from sumy.parsers import DocumentParser
4
  from sumy.parsers.html import HtmlParser
@@ -7,13 +8,12 @@ from sumy.nlp.tokenizers import Tokenizer
7
  from sumy.nlp.stemmers import Stemmer
8
  from sumy.summarizers.lsa import LsaSummarizer
9
  from sumy.utils import get_stop_words
10
- from transformers import Pipeline, BertTokenizer
11
 
12
 
13
  class Summarizer:
14
  DEFAULT_LANGUAGE = "english"
15
  DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
16
- TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')
17
  STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
18
 
19
  def __init__(self, pipeline: Pipeline):
@@ -40,7 +40,7 @@ class Summarizer:
40
  cumulative_token_length = 0
41
  for sentence in summary_sentences:
42
  result_list.append(sentence)
43
- token_list = Summarizer.TOKENIZER.tokenize(sentence)
44
  token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
45
  token_length = len(token_words)
46
  if token_length + cumulative_token_length >= max_token_length:
 
1
  import string
2
+ import nltk
3
 
4
  from sumy.parsers import DocumentParser
5
  from sumy.parsers.html import HtmlParser
 
8
  from sumy.nlp.stemmers import Stemmer
9
  from sumy.summarizers.lsa import LsaSummarizer
10
  from sumy.utils import get_stop_words
11
+ from transformers import Pipeline
12
 
13
 
14
  class Summarizer:
15
  DEFAULT_LANGUAGE = "english"
16
  DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
 
17
  STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)
18
 
19
  def __init__(self, pipeline: Pipeline):
 
40
  cumulative_token_length = 0
41
  for sentence in summary_sentences:
42
  result_list.append(sentence)
43
+ token_list = nltk.word_tokenize(sentence)
44
  token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
45
  token_length = len(token_words)
46
  if token_length + cumulative_token_length >= max_token_length: