|
"""Split text to sentences. |
|
|
|
Use sentence_splitter if supported, |
|
else use polyglot.text.Text |
|
|
|
!apt install libicu-dev |
|
!pip install pyicu pycld2 Morfessor
|
!pip install polyglot sentence_splitter |
|
""" |
|
|
|
|
|
from typing import List, Optional, Union |
|
|
|
import re |
|
from tqdm.auto import tqdm |
|
|
|
from polyglot.text import Detector, Text |
|
from sentence_splitter import split_text_into_sentences |
|
|
|
from logzero import logger |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Language codes that _seg_text routes to sentence_splitter's
# split_text_into_sentences; any other code goes to polyglot.text.Text.
LANG_S = [
    "ca", "cs", "da", "nl", "en", "fi",
    "fr", "de", "el", "hu", "is", "it",
    "lv", "lt", "no", "pl", "pt", "ro",
    "ru", "sk", "sl", "es", "sv", "tr",
]
|
|
|
|
|
def _seg_text( |
|
text: str, |
|
lang: Optional[str] = None, |
|
|
|
maxlines: int = 1000 |
|
) -> List[str]: |
|
|
|
"""Split text to sentences. |
|
|
|
Use sentence_splitter if supported, |
|
else use polyglot.text.Text.sentences |
|
Blank lines will be removed. |
|
|
|
qmode: quick mode, skip split_text_into_sentences if True, default False |
|
vectors for all books are based on qmode=False. |
|
qmode=True is for quick test purpose only |
|
|
|
maxlines (default 1000), threshold for turn on tqdm progressbar |
|
set to <1 or a large number to turn it off |
|
""" |
|
if lang is None: |
|
try: |
|
lang = Detector(text).language.code |
|
except Exception as exc: |
|
logger.info("text[:30]: %s", text[:30]) |
|
logger.warning( |
|
"polyglot.text.Detector exc: %s, setting to 'en'", |
|
exc |
|
) |
|
lang = "en" |
|
|
|
|
|
if lang in LANG_S: |
|
_ = [] |
|
lines = text.splitlines() |
|
|
|
if len(lines) > maxlines > 1: |
|
for para in tqdm(lines): |
|
if para.strip(): |
|
_.extend(split_text_into_sentences(para, lang)) |
|
else: |
|
for para in lines: |
|
if para.strip(): |
|
_.extend(split_text_into_sentences(para, lang)) |
|
return _ |
|
|
|
|
|
|
|
|
|
if not text.strip(): |
|
return [] |
|
|
|
return [elm.string for elm in Text(text, lang).sentences] |
|
|
|
|
|
|
|
def seg_text(
    lst: Union[str, List[str]],
    lang: Optional[str] = None,
    maxlines: int = 1000,
    extra: Optional[str] = None,
) -> List[str]:
    """Split a text, or each text in a list, into sentences.

    Arguments:
        lst: a single text or a list of texts.
        lang: language code, passed through to _seg_text.
        maxlines: progress-bar threshold, passed through to _seg_text.
        extra: optional regex; when given, a newline is inserted after
            every match in each text before the text is segmented.
    Returns:
        one flat list holding the pieces of all texts, in input order.
    """
    texts = [lst] if isinstance(lst, str) else lst

    if extra:
        # Keep the matched delimiter and put a line break right after it.
        texts = [re.sub(rf"({extra})", r"\1\n", chunk) for chunk in texts]

    return [
        sent
        for chunk in texts
        for sent in _seg_text(chunk, lang=lang, maxlines=maxlines)
    ]
|
|