radiobee-aligner / radiobee /shuffle_sents.py
freemt
Update sphinx docs
71a7230
raw
history blame
No virus
1.59 kB
"""Shuffle sents."""
# pylint: disable=
from typing import List, Optional, Tuple, Union
from fastlid import fastlid
from logzero import logger # noqa
from radiobee.lists2cmat import lists2cmat
from radiobee.gen_pset import gen_pset
from radiobee.gen_aset import gen_aset
from radiobee.align_texts import align_texts
# fmt: off
def shuffle_sents(
        lst1: List[str],
        lst2: List[str],
        eps: float = 6,
        min_samples: int = 4,
        tf_type: str = "linear",
        idf_type: Optional[str] = None,
        dl_type: Optional[str] = None,
        norm: Optional[str] = None,
        lang1: Optional[str] = None,
        lang2: Optional[str] = None,
) -> List[Tuple[str, str, Union[str, float]]]:
    # fmt: on
    """Shuffle sents to the right positions.

    Based on __main__.py.

    Pipeline: detect languages (when not supplied) -> lists2cmat ->
    gen_pset -> gen_aset -> align_texts.

    Args:
        lst1: First list of text segments.
        lst2: Second list of text segments.
        eps: Forwarded to gen_pset.
        min_samples: Forwarded to gen_pset.
        tf_type: Forwarded to lists2cmat.
        idf_type: Forwarded to lists2cmat.
        dl_type: Forwarded to lists2cmat.
        norm: Forwarded to lists2cmat.
        lang1: Language of lst1; when None it is detected with fastlid
            on the space-joined text, restricted to "en"/"zh".
        lang2: Language of lst2; detected the same way when None.

    Returns:
        Whatever align_texts produces; per the annotation, a list of
        3-tuples (two texts plus a str or float).
    """
    if lang1 is None or lang2 is None:
        # fastlid.set_languages is process-wide state: narrow it only for
        # the duration of the detection and always put the caller's value
        # back, even if detection raises.
        saved_languages = fastlid.set_languages
        fastlid.set_languages = ["en", "zh"]
        try:
            if lang1 is None:
                lang1, _ = fastlid(" ".join(lst1))
            if lang2 is None:
                lang2, _ = fastlid(" ".join(lst2))
        finally:
            # restore fastlid.set_languages
            fastlid.set_languages = saved_languages

    cmat = lists2cmat(
        lst1,
        lst2,
        tf_type=tf_type,
        idf_type=idf_type,
        dl_type=dl_type,
        norm=norm,
        lang1=lang1,
        lang2=lang2,
    )

    pset = gen_pset(
        cmat,
        eps=eps,
        min_samples=min_samples,
        delta=7,
    )

    src_len, tgt_len = cmat.shape
    aset = gen_aset(pset, src_len, tgt_len)

    # NOTE(review): lst2 is passed before lst1 here -- presumably this
    # matches the (src, tgt) axis order of cmat; confirm against align_texts.
    final_list = align_texts(aset, lst2, lst1)

    return final_list