File size: 1,585 Bytes
71a7230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""Align sents via gale-church."""
# pylint: disable=

from typing import List, Tuple  # noqa

import re

# from itertools import tee
# from more_itertools import ilen
from nltk.translate.gale_church import align_blocks

from radiobee.amend_avec import amend_avec


def align_sents(lst1: List[str], lst2: List[str]) -> List[Tuple[str, str]]:
    """Align two lists of sentences via Gale-Church block alignment.

    The length of every sentence is measured with all whitespace removed.
    The two length sequences are handed to ``align_blocks``; its result is
    post-processed by ``amend_avec`` (together with the number of sentences
    on each side) and finally mapped back to the sentence texts.

    Args:
        lst1: Source sentences. A bare ``str`` is treated as a one-item list.
        lst2: Target sentences. A bare ``str`` is treated as a one-item list.

    Returns:
        A list of ``(source_sentence, target_sentence)`` tuples, one per
        pair produced by ``amend_avec``. A side whose entry in that pair is
        a ``str`` instead of an index is rendered as ``""``.

    >>> lst1, lst2 = ['a', 'bs',], ['aaa', '34', 'a', 'b']
    """
    # Be lenient with callers that pass a single sentence as a plain string.
    if isinstance(lst1, str):
        lst1 = [lst1]

    if isinstance(lst2, str):
        lst2 = [lst2]

    # Whitespace is excluded from the length used for alignment.
    src_blocks = [len(re.sub(r"\s+", "", elm)) for elm in lst1]
    tgt_blocks = [len(re.sub(r"\s+", "", elm)) for elm in lst2]

    avec = align_blocks(src_blocks, tgt_blocks)

    # NOTE(review): amend_avec presumably fills in sentences that
    # align_blocks left unpaired, marking the missing side with a string
    # placeholder -- confirm against radiobee.amend_avec.
    amended_avec = amend_avec(avec, len(lst1), len(lst2))

    def _text_at(idx, sents):
        """Return ``sents[idx]``, or ``""`` when *idx* is a str placeholder."""
        return "" if isinstance(idx, str) else sents[int(idx)]

    return [
        (_text_at(src_idx, lst1), _text_at(tgt_idx, lst2))
        for src_idx, tgt_idx in amended_avec
    ]