File size: 3,268 Bytes
71a7230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c04f50
 
71a7230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""Split text to sentences.

Use sentence_splitter if supported,
else use polyglot.text.Text

!apt install libicu-dev
!pip install pyicu pycld2 Morfessor
!pip install polyglot sentence_splitter
"""
# pylint: disable=

from typing import List, Optional, Union

import re
from tqdm.auto import tqdm
from polyglot.detect.base import logger as polyglot_logger
from polyglot.text import Detector, Text
from sentence_splitter import split_text_into_sentences

from logzero import logger

# turn off polyglot.text.Detector warnings
polyglot_logger.setLevel("ERROR")


# fmt: off
# Language codes for which sentence_splitter is used to split sentences.
LANG_S = (
    "ca cs da nl en fi fr de "
    "el hu is it lv lt no pl "
    "pt ro ru sk sl es sv tr"
).split()


def _seg_text(
        text: str,
        lang: Optional[str] = None,
        # qmode: bool = False,
        maxlines: int = 1000
) -> List[str]:
    # fmt: on
    """Split text to sentences.

    Use sentence_splitter if supported,
    else use polyglot.text.Text.sentences
    Blank lines will be removed.

    qmode: quick mode, skip split_text_into_sentences if True, default False
        vectors for all books are based on qmode=False.
        qmode=True is for quick test purpose only

    maxlines (default 1000), threshold for turn on tqdm progressbar
        set to <1 or a large number to turn it off
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as exc:
            logger.info("text[:30]: %s", text[:30])
            logger.warning(
                "polyglot.text.Detector exc: %s, setting to 'en'",
                exc
            )
            lang = "en"

    # if not qmode and lang in LANG_S:
    if lang in LANG_S:
        _ = []
        lines = text.splitlines()
        # if maxlines > 1 and len(lines) > maxlines:
        if len(lines) > maxlines > 1:
            for para in tqdm(lines):
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        else:
            for para in lines:
                if para.strip():
                    _.extend(split_text_into_sentences(para, lang))
        return _

        # return split_text_into_sentences(text, lang)

    # empty "" text or blank to avoid Exception
    if not text.strip():
        return []

    return [elm.string for elm in Text(text, lang).sentences]


# fmt: off
def seg_text(
        lst: Union[str, List[str]],
        lang: Optional[str] = None,
        maxlines: int = 1000,
        extra: Optional[str] = None,
) -> List[str]:
    # fmt:on
    """Split one text, or each text in a list, into sentences.

    Arguments:
        lst: a single text or a list of texts
        lang: optional language code, passed on to _seg_text
        maxlines: (default 1000) passed on to _seg_text, where it is the
            threshold for turning on the tqdm progress bar
        extra: optional regex; a newline is inserted after every match of
            it in each text before the text is split
    Returns:
        a flat list with the sentences of all texts, in input order.
    """
    texts = [lst] if isinstance(lst, str) else lst

    if extra:
        # keep the matched text and append "\n" right after it
        texts = [re.sub(rf"({extra})", r"\1\n", text) for text in texts]

    return [
        sent
        for text in texts
        for sent in _seg_text(text, lang=lang, maxlines=maxlines)
    ]