from typing import List

from spacy.lang.en import English


class SentenceHandler:
    """Split raw text into sentences using spaCy's rule-based ``sentencizer``."""

    def __init__(self, language=English):
        """
        Base Sentence Handler with Spacy support.

        :param language: Determines the language to use with spacy. It is called with no
            arguments to build the pipeline stored on ``self.nlp``.
        """
        self.nlp = language()

        # The two supported spaCy major versions register the sentencizer differently,
        # so try the 2.x call style first and fall back to the 3.x one.
        # NOTE(review): the broad ``except Exception`` is kept on purpose, because the exact
        # error raised by the other major version is not pinned down in this file.
        try:
            # Supports spacy 2.0: build the component object, then add it.
            self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
            self.is_spacy_3 = False
        except Exception:
            # Supports spacy 3.0: components are added by their registered name.
            self.nlp.add_pipe("sentencizer")
            self.is_spacy_3 = True

    def sentence_processor(self, doc, min_length: int = 40, max_length: int = 600) -> List[str]:
        """
        Processes a given spacy document and turns them into sentences.

        Each sentence is stripped of surrounding whitespace and kept only when its stripped
        length lies strictly between ``min_length`` and ``max_length`` (both bounds exclusive).

        :param doc: The document to use from spacy. Only ``doc.sents`` and the ``text``
            attribute of each sentence are read.
        :param min_length: The minimum length a sentence should be to be considered (exclusive).
        :param max_length: The maximum length a sentence should be to be considered (exclusive).
        :return: Sentences, in document order.
        """
        # ``text`` is read for both spaCy 2 and 3 so that the string being returned is exactly
        # the string whose length was checked. The previous spaCy 2 branch returned
        # ``string.strip()``; presumably identical after stripping, since ``Span.string`` is
        # the text plus trailing whitespace - confirm against the spaCy 2 Span API.
        stripped_sentences = (sentence.text.strip() for sentence in doc.sents)
        return [text for text in stripped_sentences if min_length < len(text) < max_length]

    def process(self, body: str, min_length: int = 40, max_length: int = 600) -> List[str]:
        """
        Processes the content sentences.

        :param body: The raw string body to process.
        :param min_length: Minimum length that the sentences must be (exclusive).
        :param max_length: Max length that the sentences must fall under (exclusive).
        :return: Returns a list of sentences.
        """
        doc = self.nlp(body)
        return self.sentence_processor(doc, min_length, max_length)

    def __call__(self, body: str, min_length: int = 40, max_length: int = 600) -> List[str]:
        """
        Processes the content sentences; shorthand for :meth:`process`.

        :param body: The raw string body to process.
        :param min_length: Minimum length that the sentences must be (exclusive).
        :param max_length: Max length that the sentences must fall under (exclusive).
        :return: Returns a list of sentences.
        """
        return self.process(body, min_length, max_length)