File size: 9,357 Bytes
bf87f43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7cdc67
89d669f
f537c4c
0905493
d7cdc67
 
5ae3f92
d7cdc67
89d669f
16195e5
89d669f
d7cdc67
 
 
89d669f
7d6526a
f537c4c
c843262
89d669f
 
6663376
 
 
 
03be791
 
89d669f
 
 
 
 
 
 
 
03be791
 
7d6526a
99cd496
efbffad
 
 
37fb9cf
92cb1c3
4be3c52
abc8266
4be3c52
37fb9cf
4be3c52
79618dd
 
 
 
 
7d6526a
0b7bd15
efbffad
 
 
 
 
 
16195e5
 
6b29c07
6663376
89d669f
 
 
42b9713
 
 
 
89d669f
4aca0df
 
89d669f
5fb870c
 
 
844aef2
 
 
 
7d6526a
844aef2
0905493
 
 
 
 
3812263
 
 
 
0905493
771426e
ac0951c
6663376
71fa1a4
 
 
6663376
 
89d669f
 
16195e5
 
0c6d923
70f5c39
0905493
5ae3f92
 
 
844aef2
 
0905493
844aef2
 
5fb870c
 
 
844aef2
c843262
44c4eaa
5fb870c
16195e5
6663376
 
5ae3f92
 
 
da8f9c2
5ae3f92
da8f9c2
 
6663376
 
c978e0b
 
6663376
 
 
 
 
 
 
 
c978e0b
 
6663376
 
 
c843262
6663376
 
16195e5
c978e0b
5821b23
 
16195e5
c843262
c978e0b
 
5821b23
c978e0b
 
6663376
 
c978e0b
6ed04ff
89d669f
1ccfc22
899f8ea
c843262
899f8ea
c843262
6663376
 
4360582
c843262
 
 
1b2837a
130a100
6663376
 
c843262
 
 
 
771426e
edd5899
c843262
771426e
c843262
 
34c6270
899f8ea
 
c843262
cd374f0
c843262
 
 
6663376
771426e
c843262
 
 
 
 
 
 
 
771426e
6663376
c843262
771426e
 
c843262
 
 
 
 
 
 
771426e
c843262
 
 
 
 
 
 
771426e
6663376
312ea97
 
7d6526a
c843262
 
 
 
 
7f331da
c843262
 
130a100
c843262
 
6663376
c843262
16195e5
efbffad
 
 
16195e5
 
899f8ea
16195e5
1ccfc22
 
 
 
 
 
 
 
c843262
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
"""Gen ubee main.

private
url = 'https://hf.space/embed/mikeee/zero-shot/+/api/predict'
resp = httpx.post(
    url,
    json={"data": ["love", ",".join(["liebe", "this is test", "hate you"]), False]},
    timeout=httpx.Timeout(None, connect=3),
)
resp.json()
{'data': [{'label': 'liebe',
   'confidences': [{'label': 'liebe', 'confidence': 0.8688847422599792},
    {'label': 'this is test', 'confidence': 0.12558135390281677},
    {'label': 'hate you', 'confidence': 0.005533925257623196}]}],
 'duration': 0.265749454498291,
 'average_duration': 4.639325571060181}

"""
# pylint: disable=unused-import, wrong-import-position, wrong-import-order, too-many-locals, broad-except, line-too-long

import sys
from itertools import zip_longest
from pathlib import Path
from random import shuffle
from textwrap import dedent
from typing import Optional, Tuple

import gradio as gr
import logzero
import pandas as pd
from icecream import ic
from icecream import install as ic_install
from logzero import logger
from set_loglevel import set_loglevel

from ubee import __version__
from ubee.ubee import ubee

# for embeddable python
# if "." not in sys.path: sys.path.insert(0, ".")

# Logging setup: whatever set_loglevel() returns becomes the logzero level.
# NOTE(review): presumably set_loglevel() derives the level from the
# environment -- confirm against the set_loglevel package.
logzero.loglevel(set_loglevel())
logger.debug(" debug on ")

# icecream setup: call icecream's install() (imported as ic_install) and route
# ic output through logger.info, with call context requested.
ic_install()
ic.configureOutput(
    includeContext=True,
    outputFunction=logger.info,
)
ic.enable()
# ic.disable()  # to turn off

# Sanity message emitted through ic itself.
ic(" ic.enabled ")

# Disabled smoke test for model_pool. The code is parked inside a string
# literal bound to the throwaway name `_`, so none of it is executed. The
# closing line is written as `# """` so that commenting out the opening
# `_ = """` line turns the whole block back into live code.
_ = """
ic("Testing...")
import model_pool
from model_pool import fetch_check_aux
print("model-pool version", model_pool.__version__)
print("gradio version", gr.__version__)

try:
    fetch_check_aux.fetch_check_aux()
except Exception as _:
    ic(["fetch_check_aux.fetch_check_aux", _])

from model_pool.load_model import load_model
try:
    clas = load_model("clas-l-user")
except Exception as _:
    ic(["load_model(\"clas-l-user\")", _])
# """

# _ = clas("love", ["liebe", "hate you", "test"])
# print(_)
# raise SystemExit("Exit by intention")
# {'sequence': 'love', 'labels': ['liebe', 'test', 'hate you'],
# 'scores': [0.8885253667831421, 0.10581762343645096, 0.005657028406858444]}
# Runs OK


# segment: str
def ifn(text1: str, text2: str, thresh: float) -> Tuple[str, Optional[str]]:
    """Align two newline-separated texts and render the results as HTML.

    Each input is split into lines; every line is stripped and blank lines
    are dropped before the two lists are handed to ``ubee``.

    Args:
        text1: first text, one block per line.
        text2: second text, one block per line.
        thresh: threshold value, forwarded unchanged to ``ubee``.

    Returns:
        A pair ``(aligned_html, leftover_html)``. ``aligned_html`` is the
        first ``ubee`` result rendered as an HTML table with the columns
        ``text1``, ``text2`` and ``likelihood``. ``leftover_html`` is the
        second ``ubee`` result rendered with the columns ``text1`` and
        ``text2``, or ``None`` when that result is empty.
    """
    # Local names are kept as res1/res2 because ic() echoes the argument
    # expression in its log output.
    res1 = [elm.strip() for elm in text1.splitlines() if elm.strip()]
    res2 = [elm.strip() for elm in text2.splitlines() if elm.strip()]

    ic(res1)
    ic(res2)

    aligned, leftover = ubee(res1, res2, thresh)

    # Only build the leftover table when there is something to show; the
    # caller receives None otherwise.
    if leftover:
        leftover_html = pd.DataFrame(leftover, columns=["text1", "text2"]).to_html()
    else:
        leftover_html = None

    aligned_html = pd.DataFrame(
        aligned, columns=["text1", "text2", "likelihood"]
    ).to_html()

    return aligned_html, leftover_html


def main() -> None:
    """Build the Gradio Blocks demo for the ubee aligner and launch it.

    Reads ``data/test_zh.txt`` and ``data/test_en.txt`` (relative to the
    current working directory) to prepare sample texts, lays out two text
    inputs, a threshold slider and a "Run" button wired to ``ifn``, then
    starts the app with ``blocks.launch``.
    """
    # Sample texts: the first 10 non-blank Chinese lines, in file order ...
    text_zh = Path("data/test_zh.txt").read_text(encoding="utf8")
    text_zh = [elm.strip() for elm in text_zh.splitlines() if elm.strip()][:10]
    text_zh = "\n\n".join(text_zh)

    # ... and the first 9 non-blank English lines, shuffled so that the two
    # sides are not in the same order.
    text_en = [
        elm.strip()
        for elm in Path("data/test_en.txt").read_text(encoding="utf8").splitlines()
        if elm.strip()
    ]
    head_en = text_en[:9]
    shuffle(head_en)
    text_en = "\n\n".join(head_en)

    # NOTE(review): title, theme, description, article and ``examples``
    # (further down) are assigned but not passed to any component of the
    # Blocks layout below.
    title = "Ultimatumbee"
    theme = "grass"
    description = """WIP showcasing a novel aligner"""
    article = dedent(
        """
        ## NB

        *   The ultimatumbee aligner (``ubee`` for short) is intended for aligning text blocks (be it paragraphs, sentences or words). Since it is rather slow (30 para pairs (Wuthering Height ch1. for example) can take 10 to 20 mniutes), anything more than 50 blocks should probably be avaoided. Nevertheless, you are welcome to try. No big brother is watching.

        *   ``thresh``: longer text blocks justify a larger value; `.5` appears to be just right for paragraphs for Wuthering Height ch1.

        Stay tuned for more details coming soon...
        """
    ).strip()

    # Default content of the two text boxes: sentence-level sample, with the
    # English side shuffled.
    ex1_zh = [
        "雪开始下大了。",
        "我握住门柄又试一回。",
        "这时一个没穿外衣的年轻人,扛着一根草耙,在后面院子里出现了。",
        "他招呼我跟着他走,穿过了一个洗衣房和一片铺平的地,那儿有煤棚、抽水机和鸽笼,我们终于到了我上次被接待过的那间温暖的、热闹的大屋子。",
        "煤、炭和木材混合在一起燃起的熊熊炉火,使这屋子放着光彩。",
        "在准备摆上丰盛晚餐的桌旁,我很高兴地看到了那位“太太”,以前我从未料想到会有这么一个人存在的。",
        "我鞠躬等候,以为她会叫我坐下。",
        "她望望我,往她的椅背一靠,不动,也不出声。",
    ]
    ex1_en = [
        "The snow began to drive thickly.",
        "I seized the handle to essay another trial; when a young man without coat, and shouldering a pitchfork, appeared in the yard behind.",
        "He hailed me to follow him, and, after marching through a wash-house, and a paved area containing a coal shed, pump, and pigeon cot, we at length arrived in the huge, warm, cheerful apartment, where I was formerly received.",
        "It glowed delightfully in the radiance of an immense fire, compounded of coal, peat, and wood; and near the table, laid for a plentiful evening meal, I was pleased to observe the `missis', an individual whose existence I had never previously suspected.",
        "I bowed and waited, thinking she would bid me take a seat.",
        "She looked at me, leaning back in her chair, and remained motionless and mute.",
    ]
    shuffle(ex1_en)
    ex1_zh = "\n".join(ex1_zh)
    ex1_en = "\n".join(ex1_en)

    # Word-level sample, one word per line, English side shuffled.
    ex2_zh = "她\n望望\n我\n往\n她的\n椅背\n一靠\n不\n动\n也\n不\n出声"
    ex2_en = "She looked at me leaning back in her chair and remained motionless and mute".split()
    shuffle(ex2_en)
    ex2_en = "\n".join(ex2_en)

    examples = [
        [ex2_zh, ex2_en, 0.3],
        [text_zh, text_en, 0.5],
    ]
    lines = 15
    placeholder = "Type or paste text here"

    with gr.Blocks() as blocks:
        gr.Markdown(
            dedent(
                f"""
            ## Ultimatumbee {__version__}

            Align non-sequential dualtexts.

            可对词、句、段,每个词(或句或段)一行。可对任意语言对(英中、英德、德法、中日……等等)。建议 threshold 门槛值 -- 词: 0.3,句:0.5, 段: 0.7。如果太多 leftover,可适当调小 threshold。 如果太多误对则可以适当调大 threshold。

            """
            ).strip()
        )
        with gr.Column():
            with gr.Row():
                # gr.Textbox(value=...) replaces the deprecated
                # gr.inputs.Textbox(default=...), matching the component API
                # already used for gr.Slider below.
                text1 = gr.Textbox(
                    lines=lines, placeholder=placeholder, value=ex1_zh, label="text1"
                )
                text2 = gr.Textbox(
                    lines=lines, placeholder=placeholder, value=ex1_en, label="text2"
                )
            with gr.Row():
                thresh = gr.Slider(
                    minimum=0.1,
                    maximum=0.9,
                    step=0.1,
                    value=0.4,
                    label="threshold",
                )
                btn = gr.Button("Run")

            # NOTE(review): this Row has no children; the HTML outputs below
            # are created outside it. It is kept so the layout is unchanged --
            # confirm whether the outputs were meant to sit side by side
            # inside the Row.
            with gr.Row():
                pass

            aligned = gr.HTML(label="Aligned")
            leftover = gr.HTML(label="Leftover")

            # ifn returns (aligned_html, leftover_html), in that order.
            btn.click(
                fn=ifn,
                inputs=[
                    text1,
                    text2,
                    thresh,
                ],
                outputs=[
                    aligned,
                    leftover,
                ],
            )

    # NOTE(review): newer gradio releases replace the ``enable_queue`` launch
    # argument with ``Blocks.queue()`` -- confirm against the pinned version
    # before changing it.
    blocks.launch(debug=True, enable_queue=True)


# Build and launch the demo only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    # logger.info(" Start main()")
    main()

# Disabled snippet: a Radio input labelled "segment" offering "para", "sent"
# and "word". It is kept inside a string literal bound to the throwaway name
# `_`, so it is never executed.
_ = """

        gr.inputs.Radio(
            ["para", "sent", "word"],
            default="para",
            label="segment"
        )
# """