File size: 1,135 Bytes
844aef2
 
 
d7cdc67
89d669f
d7cdc67
844aef2
d7cdc67
844aef2
89d669f
844aef2
 
 
 
 
 
 
 
 
 
d7cdc67
844aef2
 
 
 
 
 
 
 
 
 
 
5ae3f92
844aef2
 
c978e0b
844aef2
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""Align via ubee,"""
# pylint: disable=
from itertools import zip_longest
from typing import Iterable, List, Tuple

from icecream import ic
from logzero import logger

from ubee.uclas import uclas


def ubee(
    sents_zh: Iterable,
    sents_en: Iterable,
    thresh: float = 0.5,
) -> Tuple[List[Tuple[str, str, float]], List[Tuple[str, str]]]:
    """Align blocks.

    Every item of ``sents_zh`` is handed to ``uclas`` together with all items
    of ``sents_en`` as candidate labels. When ``uclas`` returns a truthy
    label, the pair is recorded as aligned; otherwise the item is kept as a
    leftover.

    Args:
        sents_zh: texts to align; can be in any language supported by
            clas-l-user.
        sents_en: candidate texts to align against; ditto.
        thresh: passed through unchanged to ``uclas`` as ``thresh``
            (presumably a minimum likelihood for accepting a match --
            confirm against ``uclas``).

    Returns:
        A 2-tuple ``(aligned, leftovers)``:

        * ``aligned``: list of ``(seq, label, likelihood)`` triples, with
          ``likelihood`` converted to ``float`` and rounded to 2 decimals.
        * ``leftovers``: unmatched items of ``sents_zh`` paired positionally
          (via ``zip_longest``, padded with ``None``) with the items of
          ``sents_en`` that were never claimed by a match.
    """
    # Materialize once: the inputs may be one-shot iterators, and the full
    # candidate list is needed on every loop iteration.
    labels = [*sents_en]

    # Without candidate labels nothing can be aligned: every source item is
    # a leftover, so there is no point in invoking the classifier at all.
    if not labels:
        return [], [(seq, None) for seq in sents_zh]

    res = []
    lo1 = []  # items of sents_zh without a match
    lo2 = labels[:]  # items of sents_en not yet claimed by a match

    for seq in sents_zh:
        ic(seq)  # debug trace of the item being classified
        label, likelihood = uclas(seq, labels, thresh=thresh)
        if not label:
            lo1.append(seq)
            continue

        likelihood = round(float(likelihood), 2)
        res.append((seq, label, likelihood))
        try:
            lo2.remove(label)
        except ValueError as exc:
            # The label is no longer in lo2, e.g. because an earlier item
            # already matched it. Keep the pair in res and just report it.
            logger.error(exc)
            logger.info("seq: %s, lable: %s", seq, label)

    return res, [*zip_longest(lo1, lo2)]