"""Separate text to zh en lists."""
# pylint: disable=unused-import, too-many-locals, invalid-name, too-many-branches, too-many-statements,


from typing import Iterable, List, Optional, Tuple, Union  # noqa

import numpy as np

from polyglot.text import Detector
from logzero import logger

from radiobee.lists2cmat import lists2cmat
from radiobee.detect import detect


def text2lists(
    text: Union[Iterable[str], str],
    set_languages: Optional[List[str]] = None,
) -> Tuple[List[str], List[str]]:
    """Separate text to zh en lists.

    Args:
        text: mixed text
        set_languages: no default (open-end)
            use polyglot.text.Detector to pick two languages

    Attributes:
        cmat: correlation matrix (len(list_l) x len(list_r))
            before adjusting (shifting)
        offset: plus, [""] * offset + list2
                minus, [""] * (-offset) + list1
    Returns:
        two lists, best effort alignment
    """
    if not isinstance(text, str) and isinstance(text, Iterable):
        try:
            text = "\n".join(text)
        except Exception as e:
            logger.error(e)
            raise

    # set_languages not given: detect the two dominant languages
    if set_languages is None:
        lang12 = [elm.code for elm in Detector(text).languages]

        # map 'un' (undetermined) to 'en'
        set_languages = []
        for elm in lang12[:2]:
            if elm in ["un"]:
                logger.warning("Unknown language detected, falling back to en")
                set_languages.append("en")
            else:
                set_languages.append(elm)

    list1 = []
    list2 = []

    # language of the text as a whole; used as the running reference language
    lang0 = detect(text, set_languages)

    res = []
    left = True  # start by filling list1

    for elm in [_ for _ in text.splitlines() if _.strip()]:
        lang = detect(elm, set_languages)
        if lang == lang0:
            res.append(elm)
        else:
            # language switched: flush the accumulated run to the current side
            if left:
                list1.extend(res)
            else:
                list2.extend(res)
            left = not left

            res = [elm]
            lang0 = lang

    # flush the last run
    if left:
        list1.extend(res)
    else:
        list2.extend(res)

    # detect the language of each list; fall back to en on failure (e.g. empty list)
    try:
        lang1 = detect(" ".join(list1), set_languages)
    except Exception as exc:
        logger.error(exc)
        lang1 = "en"
    try:
        lang2 = detect(" ".join(list2), set_languages)
    except Exception as exc:
        logger.error(exc)
        lang2 = "en"

    # locate the best alignment offset via the diagonals of the correlation matrix
    len1, len2 = len(list1), len(list2)

    # cmat is treated as ylim x xlim, i.e. len2 rows by len1 columns
    ylim, xlim = len2, len1

    cmat = lists2cmat(list1, list2, lang1, lang2)

    # mean of squared correlations along each diagonal k of cmat;
    # valid diagonals run from k = 1 - ylim to k = xlim - 1 (xlim + ylim - 1 in total)
    sq_mean = [np.square(cmat.diagonal(elm)).mean() for elm in range(1 - ylim, xlim)]

    # the diagonal with the largest mean squared correlation gives the offset:
    # index 0 of sq_mean corresponds to k = 1 - ylim, hence the -(ylim - 1) shift
    offset = np.argmax(sq_mean) - (ylim - 1)

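    # expose intermediates as function attributes for downstream inspection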
    text2lists.cmat = cmat
    text2lists.offset = offset
    text2lists.lang1 = lang1
    text2lists.lang2 = lang2

    # pad list2 if offset >= 0, else pad list1, so the two lists line up
    if offset > -1:
        list2 = [""] * offset + list2
    else:
        list1 = [""] * (-offset) + list1

    return list1, list2
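

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the library API):
    # the sample lines below are made up; real input would be a longer document
    # with interleaved zh/en paragraphs. Results depend on radiobee.detect,
    # radiobee.lists2cmat and polyglot being installed and working.
    sample_text = "\n".join(
        [
            "This is the first English sentence.",
            "这是第一句中文。",
            "This is the second English sentence.",
            "这是第二句中文。",
        ]
    )
    lst1, lst2 = text2lists(sample_text)
    logger.info("offset: %s", text2lists.offset)
    logger.info("lang1 (%s): %s", text2lists.lang1, lst1)
    logger.info("lang2 (%s): %s", text2lists.lang2, lst2)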