File size: 2,420 Bytes
b04763d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
from typing import List
from spacy.lang.en import English
class SentenceHandler(object):
    """
    Splits raw text into sentences using spaCy's ``sentencizer`` component.

    The handler registers the component with whichever ``add_pipe`` calling
    convention the installed spaCy accepts and records the outcome in
    ``is_spacy_3``.
    """

    def __init__(self, language=English):
        """
        Base Sentence Handler with Spacy support.

        :param language: Determines the language to use with spacy. It is
            called with no arguments to build the pipeline object.
        """
        self.nlp = language()

        try:
            # spaCy 2.x convention: create the component object, then add it.
            self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
        except Exception:
            # Deliberate best-effort fallback: when the 2.x convention is
            # rejected, register the component by name (spaCy 3.x convention).
            # An error raised here propagates to the caller.
            self.nlp.add_pipe("sentencizer")
            self.is_spacy_3 = True
        else:
            self.is_spacy_3 = False

    def sentence_processor(self, doc,
                           min_length: int = 40,
                           max_length: int = 600) -> List[str]:
        """
        Processes a given spacy document and turns it into sentences.

        :param doc: The document to use from spacy.
        :param min_length: Sentences must be strictly longer than this many
            characters (measured after stripping whitespace).
        :param max_length: Sentences must be strictly shorter than this many
            characters (measured after stripping whitespace).
        :return: The stripped text of every sentence within the bounds, in
            document order.
        """
        sentences = []
        for sent in doc.sents:
            # ``Span.text`` is available on both spaCy 2.x and 3.x, so the
            # deprecated ``Span.string`` is not needed; strip only once.
            text = sent.text.strip()
            if min_length < len(text) < max_length:
                sentences.append(text)
        return sentences

    def process(self, body: str,
                min_length: int = 40,
                max_length: int = 600) -> List[str]:
        """
        Processes the content sentences.

        :param body: The raw string body to process.
        :param min_length: Minimum length that the sentences must exceed.
        :param max_length: Maximum length that the sentences must fall under.
        :return: Returns a list of sentences.
        """
        doc = self.nlp(body)
        return self.sentence_processor(doc, min_length, max_length)

    def __call__(self, body: str,
                 min_length: int = 40,
                 max_length: int = 600) -> List[str]:
        """
        Processes the content sentences; shorthand for :meth:`process`.

        :param body: The raw string body to process.
        :param min_length: Minimum length that the sentences must exceed.
        :param max_length: Maximum length that the sentences must fall under.
        :return: Returns a list of sentences.
        """
        return self.process(body, min_length, max_length)