Spaces:
Running
Running
""" | |
Split text to sentences.

Modified from seg_text.seg_text.py; the vtext sentence_splitter has been removed.

NOTE: the code in this module segments with pysbd and detects the language
with fastlid; the remaining notes describe earlier approaches.

Use sentence_splitter if supported,
else use polyglot.text.Text

!apt install libicu-dev
!pip install pyicu pycld2
!pip install polyglot sentence_splitter

Use vtext and fastlid to get rid of polyglot?
from vtext.tokenize_sentence import UnicodeSentenceTokenizer, PunctuationTokenizer
tok = UnicodeSentenceTokenizer()
seg = tok.tokenize(''' Text ''') for langs not in LANG_S
""" | |
# pylint: disable=invalid-name | |
import re | |
from typing import List, Optional, Union | |
import pysbd | |
from fastlid import fastlid | |
from loguru import logger | |
from tqdm.auto import tqdm | |
def _seg_text( | |
text: str, | |
lang: Optional[str] = None, | |
) -> List[str]: | |
""" | |
Split text to sentences. | |
Switched to pysbd | |
Args: | |
---- | |
text: string to split | |
lang: language, two-letter ISO (22 languages) | |
Returns: | |
------- | |
List of segmented sentences | |
""" | |
if lang is None: | |
try: | |
lang, _ = fastlid(text) | |
except Exception as exc: | |
logger.warning(" fastlid: %s, setting lang='en'", exc) | |
lang = "en" | |
if not text.strip(): | |
return [] | |
seg = pysbd.Segmenter(language=lang, clean=True) | |
try: | |
# _ = tok.tokenize(text) | |
_ = seg.segment(text) | |
except Exception as exc: | |
logger.exception(f"pysbd.Segmenter, {exc=}") | |
raise | |
return _ | |
def seg_text(
    lst: Union[str, List[str]],
    lang: Optional[str] = None,
    maxlines: int = 1000,
    extra: Optional[str] = None,
) -> List[str]:
    """Split a text, or a list of texts, into sentences.

    Args:
        lst: text or list of texts.
        lang: optional language code, forwarded to ``_seg_text`` for every
            item.
        maxlines: (default 1000) threshold for turning on the tqdm progress
            bar; the bar is shown when ``lst`` holds more than ``maxlines``
            items. Set to <1 or a large number to turn it off.
        extra: optional regular expression; a newline is inserted after
            every match in each text before it is segmented.

    Returns:
        Flat list of the sentences of all items, in input order.
    """
    texts = [lst] if isinstance(lst, str) else list(lst)

    if extra:
        # insert \n after each match of ``extra``
        pattern = re.compile(rf"({extra})")
        texts = [pattern.sub(r"\1\n", elm) for elm in texts]

    # ``maxlines`` only controls the progress bar. It must not be forwarded:
    # ``_seg_text`` accepts just (text, lang), so passing it raised TypeError.
    items = tqdm(texts) if 0 < maxlines < len(texts) else texts

    res: List[str] = []
    for elm in items:
        res.extend(_seg_text(elm, lang=lang))
    return res