File size: 1,176 Bytes
6cbcdf3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09239dd
6cbcdf3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efedef5
 
6cbcdf3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""Separate text to zh en lists."""
# pylint: disable=

from typing import Iterable, List, Tuple, Union  # noqa
from fastlid import fastlid
from logzero import logger


def text2lists(text: Union[Iterable[str], str]) -> List[Tuple[str, str]]:
    """Separate text to zh en lists.

    Blank lines are dropped. Consecutive lines for which ``fastlid``
    reports the same language are merged into one newline-joined segment,
    and the segments are distributed alternately over two lists, starting
    with the first list.

    Args:
        text: A string, or an iterable of strings that is joined with
            newlines before processing.

    Returns:
        Currently always ``[("", "")]``. The step that pairs the two
        segment lists ("find offset") is not implemented yet, so the
        collected segments are not part of the result.

    Raises:
        Exception: Any error raised while joining a non-str iterable
            (e.g. ``TypeError`` for non-string items) is logged and
            re-raised unchanged.
    """
    if not isinstance(text, str) and isinstance(text, Iterable):
        try:
            text = "\n".join(text)
        except Exception as e:
            logger.error(e)
            raise

    fastlid.set_languages = ["en", "zh"]

    # Language reported for the head of the text (first 15000 characters).
    # The first segment is attributed to it, so when the first line is in
    # the other language an empty segment is stored first.
    lang0, _ = fastlid(text[:15000])

    list1: List[str] = []
    list2: List[str] = []  # for determining en-zh or zh-en
    targets = (list1, list2)
    target_idx = 0  # the first finished segment goes to list1

    segment: List[str] = []  # lines of the segment being accumulated
    for line in text.splitlines():
        if not line.strip():
            continue
        lang, _ = fastlid(line)
        if lang != lang0:
            # Language switched: store the finished segment and alternate
            # the destination list for the next one.
            targets[target_idx].append("\n".join(segment).strip())
            target_idx = 1 - target_idx
            segment = []
            lang0 = lang
        segment.append(line)

    # Store the trailing segment as well; otherwise the last run of lines
    # would be silently dropped.
    if segment:
        targets[target_idx].append("\n".join(segment).strip())

    # TODO: find offset and pair list1/list2 into (str, str) tuples.
    # Until that is implemented the placeholder result is kept.
    return [("", "")]