"""Split text to sentences. Use sentence_splitter if supported, else use polyglot.text.Text !apt install libicu-dev !install pyicu pycld2 Morfessor !pip install polyglot sentence_splitter """ # pylint: disable= from typing import List, Optional, Union import re from tqdm.auto import tqdm # from polyglot.detect.base import logger as polyglot_logger from polyglot.text import Detector, Text from sentence_splitter import split_text_into_sentences from logzero import logger # turn of polyglot.text.Detector warning # polyglot_logger.setLevel("ERROR") # fmt: off # use sentence_splitter if supported LANG_S = ["ca", "cs", "da", "nl", "en", "fi", "fr", "de", "el", "hu", "is", "it", "lv", "lt", "no", "pl", "pt", "ro", "ru", "sk", "sl", "es", "sv", "tr"] def _seg_text( text: str, lang: Optional[str] = None, # qmode: bool = False, maxlines: int = 1000 ) -> List[str]: # fmt: on """Split text to sentences. Use sentence_splitter if supported, else use polyglot.text.Text.sentences Blank lines will be removed. qmode: quick mode, skip split_text_into_sentences if True, default False vectors for all books are based on qmode=False. qmode=True is for quick test purpose only maxlines (default 1000), threshold for turn on tqdm progressbar set to <1 or a large number to turn it off """ if lang is None: try: lang = Detector(text).language.code except Exception as exc: logger.info("text[:30]: %s", text[:30]) logger.warning( "polyglot.text.Detector exc: %s, setting to 'en'", exc ) lang = "en" # if not qmode and lang in LANG_S: if lang in LANG_S: _ = [] lines = text.splitlines() # if maxlines > 1 and len(lines) > maxlines: if len(lines) > maxlines > 1: for para in tqdm(lines): if para.strip(): _.extend(split_text_into_sentences(para, lang)) else: for para in lines: if para.strip(): _.extend(split_text_into_sentences(para, lang)) return _ # return split_text_into_sentences(text, lang) # empty "" text or blank to avoid Exception if not text.strip(): return [] return [elm.string for elm in Text(text, lang).sentences] # fmt: off def seg_text( lst: Union[str, List[str]], lang: Optional[str] = None, maxlines: int = 1000, extra: Optional[str] = None, ) -> List[str]: # fmt:on """Split a list of text. Arguments: lst: text or text list extra: re.split(rf"{extra}, text) first Returns: list of splitted text. """ if isinstance(lst, str): lst = [lst] if extra: # insert \n lst = [re.sub(rf"({extra})", r"\1\n", elm) for elm in lst] res = [] for elm in lst: res.extend(_seg_text( elm, lang=lang, maxlines=maxlines, )) return res