pleonova committed
Commit: ebb04f6
1 Parent(s): d15705b

Remove dependence on Spacy

Files changed (1)
  1. models.py +1 -4
models.py CHANGED

@@ -4,9 +4,6 @@ import streamlit as st
 from keybert import KeyBERT
 
 
-import spacy
-nlp = spacy.load('en_core_web_sm')
-
 # Reference: https://discuss.huggingface.co/t/summarization-on-long-documents/920/7
 def create_nest_sentences(document:str, token_max_length = 1024):
   nested = []
@@ -15,7 +12,7 @@ def create_nest_sentences(document:str, token_max_length = 1024):
   tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
   tokens = nlp(document)
 
-  for sentence in tokens.sents:
+  for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
     tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
     length += len(tokens_in_sentence)
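For readers following the change: the spaCy sentence iterator (tokens.sents) is replaced with a plain regex split on sentence-ending punctuation followed by an uppercase letter. Below is a minimal, self-contained sketch of that splitting and nesting logic. The nest_sentences_sketch function, its whitespace word-count stand-in for the BART tokenizer length check, and the sample text are illustrative assumptions, not part of the commit.

import re

# Regex from the commit: split on spaces that follow a sentence-ending '.' or '?'
# (the character two positions back must not be uppercase, so abbreviations like
# "Dr." survive) and that are followed by an uppercase letter starting the next sentence.
SENTENCE_SPLIT = r'(?<=[^A-Z].[.?]) +(?=[A-Z])'

def nest_sentences_sketch(document: str, token_max_length: int = 1024):
    """Group regex-split sentences into chunks that stay under token_max_length.

    Illustrative stand-in for create_nest_sentences: the real models.py measures
    length with the facebook/bart-large-mnli tokenizer; here a whitespace word
    count is used so the sketch runs without downloading a model.
    """
    nested, sent_chunk, length = [], [], 0
    for sentence in re.split(SENTENCE_SPLIT, document.replace("\n", ' ')):
        approx_tokens = len(sentence.split())  # stand-in for the tokenizer token count
        if length + approx_tokens < token_max_length:
            sent_chunk.append(sentence)
            length += approx_tokens
        else:
            nested.append(sent_chunk)
            sent_chunk, length = [sentence], approx_tokens
    if sent_chunk:
        nested.append(sent_chunk)
    return nested

if __name__ == "__main__":
    text = "Dr. Smith reviewed the draft. It was late.\nShe signed off the next morning."
    print(re.split(SENTENCE_SPLIT, text.replace("\n", ' ')))
    # ['Dr. Smith reviewed the draft.', 'It was late.', 'She signed off the next morning.']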