Merge pull request #4 from pleonova/pleonova-patch-nested
Browse files
- models.py +1 -4
- requirements.txt +0 -2
models.py
CHANGED
@@ -4,9 +4,6 @@ import streamlit as st
|
|
4 |
from keybert import KeyBERT
|
5 |
|
6 |
|
7 |
-
import spacy
|
8 |
-
nlp = spacy.load('en_core_web_sm')
|
9 |
-
|
10 |
# Reference: https://discuss.huggingface.co/t/summarization-on-long-documents/920/7
|
11 |
def create_nest_sentences(document:str, token_max_length = 1024):
|
12 |
nested = []
|
@@ -15,7 +12,7 @@ def create_nest_sentences(document:str, token_max_length = 1024):
|
|
15 |
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
|
16 |
tokens = nlp(document)
|
17 |
|
18 |
-
for sentence in tokens.sents:
|
19 |
tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
|
20 |
length += len(tokens_in_sentence)
|
21 |
|
|
|
4 |
from keybert import KeyBERT
|
5 |
|
6 |
|
|
|
|
|
|
|
7 |
# Reference: https://discuss.huggingface.co/t/summarization-on-long-documents/920/7
|
8 |
def create_nest_sentences(document:str, token_max_length = 1024):
|
9 |
nested = []
|
|
|
12 |
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
|
13 |
tokens = nlp(document)
|
14 |
|
15 |
+
for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
|
16 |
tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
|
17 |
length += len(tokens_in_sentence)
|
18 |
|
requirements.txt
CHANGED
@@ -5,5 +5,3 @@ plotly
|
|
5 |
torch
|
6 |
sklearn
|
7 |
KeyBERT
|
8 |
-
spacy>=2.2.0,<3.0.0
|
9 |
-
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz#egg=en_core_web_sm
|
|
|
5 |
torch
|
6 |
sklearn
|
7 |
KeyBERT
|
|
|
|