Spaces:
Build error
Build error
File size: 3,118 Bytes
71a7230 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
"""Split text to sentences.
Use sentence_splitter if supported,
else use polyglot.text.Text
!apt install libicu-dev
!pip install pyicu pycld2 Morfessor
!pip install polyglot sentence_splitter
"""
# pylint: disable=
from typing import List, Optional, Union
import re
from tqdm.auto import tqdm
from polyglot.detect.base import logger as polyglot_logger
from polyglot.text import Detector, Text
from sentence_splitter import split_text_into_sentences
from logzero import logger
# Turn off polyglot.text.Detector warnings: only messages at level ERROR
# or above are emitted by polyglot's detector logger from here on.
polyglot_logger.setLevel("ERROR")
# fmt: off
# Language codes routed to sentence_splitter; any other language code
# falls back to polyglot.text.Text for sentence segmentation.
LANG_S = (
    "ca cs da nl en fi fr de "
    "el hu is it lv lt no pl "
    "pt ro ru sk sl es sv tr"
).split()
def _seg_text(
text: str,
lang: Optional[str] = None,
# qmode: bool = False,
maxlines: int = 1000
) -> List[str]:
# fmt: on
"""Split text to sentences.
Use sentence_splitter if supported,
else use polyglot.text.Text.sentences
Blank lines will be removed.
qmode: quick mode, skip split_text_into_sentences if True, default False
vectors for all books are based on qmode=False.
qmode=True is for quick test purpose only
maxlines (default 1000), threshold for turn on tqdm progressbar
set to <1 or a large number to turn it off
"""
if lang is None:
try:
lang = Detector(text).language.code
except Exception as exc:
logger.info("text[:30]: %s", text[:30])
logger.warning(
"polyglot.text.Detector exc: %s, setting to 'en'",
exc
)
lang = "en"
# if not qmode and lang in LANG_S:
if lang in LANG_S:
_ = []
lines = text.splitlines()
# if maxlines > 1 and len(lines) > maxlines:
if len(lines) > maxlines > 1:
for para in tqdm(lines):
if para.strip():
_.extend(split_text_into_sentences(para, lang))
else:
for para in lines:
if para.strip():
_.extend(split_text_into_sentences(para, lang))
return _
# return split_text_into_sentences(text, lang)
# empty "" text or blank to avoid Exception
if not text.strip():
return []
return [elm.string for elm in Text(text, lang).sentences]
# fmt: off
def seg_text(
        lst: Union[str, List[str]],
        lang: Optional[str] = None,
        maxlines: int = 1000,
        extra: Optional[str] = None,
) -> List[str]:
    # fmt:on
    """Split a text, or each text in a list, to sentences.

    Arguments:
        lst: text or text list
        lang: language code handed to _seg_text for every text
        maxlines: progressbar threshold handed to _seg_text
        extra: regex pattern; when given, a newline is inserted after
            every match in each text before it is split

    Returns:
        list of split text, in input order.
    """
    texts = [lst] if isinstance(lst, str) else lst

    if extra:
        # keep the matched text and append \n to it
        pattern = rf"({extra})"
        texts = [re.sub(pattern, r"\1\n", item) for item in texts]

    sentences = []
    for item in texts:
        sentences += _seg_text(item, lang=lang, maxlines=maxlines)
    return sentences
|