radiobee-aligner / radiobee /align_sents.py
freemt
Update sphinx docs
71a7230
raw
history blame
No virus
1.59 kB
"""Align sents via gale-church."""
# pylint: disable=
from typing import List, Tuple # noqa
import re
# from itertools import tee
# from more_itertools import ilen
from nltk.translate.gale_church import align_blocks
from radiobee.amend_avec import amend_avec
def align_sents(lst1: List[str], lst2: List[str]) -> List[Tuple[str, str]]:
    """Align two lists of sentences via the Gale-Church algorithm.

    Each sentence is reduced to its length in non-whitespace characters,
    the two length sequences are aligned with ``align_blocks`` and the
    resulting alignment is post-processed by ``amend_avec`` before being
    mapped back to the sentence texts.

    Args:
        lst1: source sentences; a bare string is treated as a one-item list.
        lst2: target sentences; a bare string is treated as a one-item list.

    Returns:
        A list of ``(source, target)`` text pairs, one per entry yielded by
        ``amend_avec``. A side whose entry is a string (rather than an
        index) is rendered as ``""``.

    >>> lst1, lst2 = ['a', 'bs',], ['aaa', '34', 'a', 'b']
    >>> align_sents(lst1, lst2)  # doctest: +SKIP
    """
    # Tolerate a single sentence passed as a plain string.
    if isinstance(lst1, str):
        lst1 = [lst1]
    if isinstance(lst2, str):
        lst2 = [lst2]

    # Sentence "size" is the character count with all whitespace removed.
    src_blocks = [len(re.sub(r"\s+", "", elm)) for elm in lst1]
    tgt_blocks = [len(re.sub(r"\s+", "", elm)) for elm in lst2]

    avec = align_blocks(src_blocks, tgt_blocks)

    # NOTE(review): amend_avec presumably fills in positions left unaligned
    # by align_blocks, marking the missing side with a string placeholder --
    # confirm against radiobee.amend_avec.
    amended_avec = amend_avec(avec, len(lst1), len(lst2))

    # A string entry means "no sentence on this side"; anything else is an
    # index into the corresponding input list.
    return [
        (
            "" if isinstance(elm0, str) else lst1[int(elm0)],
            "" if isinstance(elm1, str) else lst2[int(elm1)],
        )
        for elm0, elm1 in amended_avec
    ]