mlbee / st_mlbee /split_text.py
ffreemt
Update split_text.py
4913387
raw
history blame
2.29 kB
"""
Split text to sentences.
Modified from seg_text.seg_text.py, vtext sentence_splitter removed.
Use sentence_splitter if supported,
else use polyglot.text.Text
!apt install libicu-dev
!pip install pyicu pycld2
!pip install polyglot sentence_splitter
Use vtext and fastlid to get rid of polyglot?
from vtext.tokenize_sentence import UnicodeSentenceTokenizer, PunctuationTokenizer
tok = UnicodeSentenceTokenizer()
seg = tok.tokenize(''' Text ''') for langs not in LANG_S
"""
# pylint: disable=invalid-name
import re
from typing import List, Optional, Union
import pysbd
from fastlid import fastlid
from loguru import logger
from tqdm.auto import tqdm
def _seg_text(
text: str,
lang: Optional[str] = None,
) -> List[str]:
"""
Split text to sentences.
Switched to pysbd
Args:
----
text: string to split
lang: language, two-letter ISO (22 languages)
Returns:
-------
List of segmented sentences
"""
if lang is None:
try:
lang, _ = fastlid(text)
except Exception as exc:
logger.warning(" fastlid: %s, setting lang='en'", exc)
lang = "en"
if not text.strip():
return []
seg = pysbd.Segmenter(language=lang, clean=True)
try:
# _ = tok.tokenize(text)
_ = seg.segment(text)
except Exception as exc:
logger.exception(f"pysbd.Segmenter, {exc=}")
raise
return _
def seg_text(
    lst: Union[str, List[str]],
    lang: Optional[str] = None,
    maxlines: int = 1000,
    extra: Optional[str] = None,
) -> List[str]:
    """Split a text or a list of texts into sentences.

    Arguments:
        lst: text or text list; a single string is treated as a
            one-element list
        lang: optional lang code, forwarded to ``_seg_text`` for every
            element
        maxlines: (default 1000), threshold for turning on the tqdm
            progress bar: it is shown when the number of texts exceeds
            ``maxlines``; set to <1 or a large number to turn it off
        extra: optional regex; a newline is inserted after every match
            in each text before it is segmented

    Returns:
        list of split sentences, in input order.
    """
    if isinstance(lst, str):
        lst = [lst]

    if extra:
        # Insert "\n" right after each match of the extra pattern.
        marker = re.compile(rf"({extra})")
        lst = [marker.sub(r"\1\n", elm) for elm in lst]

    # Only wrap in tqdm for long inputs; maxlines < 1 disables it.
    items = tqdm(lst) if 0 < maxlines < len(lst) else lst

    res = []
    for elm in items:
        # _seg_text accepts only (text, lang); forwarding maxlines to
        # it would raise TypeError.
        res.extend(_seg_text(elm, lang=lang))
    return res