Spaces:
Build error
Build error
File size: 1,591 Bytes
71a7230 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
"""Shuffle sents."""
# pylint: disable=
from typing import List, Optional, Tuple, Union
from fastlid import fastlid
from logzero import logger # noqa
from radiobee.lists2cmat import lists2cmat
from radiobee.gen_pset import gen_pset
from radiobee.gen_aset import gen_aset
from radiobee.align_texts import align_texts
# fmt: off
def shuffle_sents(
    lst1: List[str],
    lst2: List[str],
    eps: float = 6,
    min_samples: int = 4,
    tf_type: str = "linear",
    idf_type: Optional[str] = None,
    dl_type: Optional[str] = None,
    norm: Optional[str] = None,
    lang1: Optional[str] = None,
    lang2: Optional[str] = None,
) -> List[Tuple[str, str, Union[str, float]]]:
    # fmt: on
    """Shuffle sents to the right positions.

    Based on __main__.py.

    The pipeline is: detect languages (if not given) -> ``lists2cmat`` ->
    ``gen_pset`` -> ``gen_aset`` -> ``align_texts``.

    Args:
        lst1: first list of sentences.
        lst2: second list of sentences.
        eps: forwarded unchanged to ``gen_pset``.
        min_samples: forwarded unchanged to ``gen_pset``.
        tf_type: forwarded unchanged to ``lists2cmat``.
        idf_type: forwarded unchanged to ``lists2cmat``.
        dl_type: forwarded unchanged to ``lists2cmat``.
        norm: forwarded unchanged to ``lists2cmat``.
        lang1: language of ``lst1``; when None it is detected with
            ``fastlid`` (restricted to "en"/"zh") on the joined text.
        lang2: language of ``lst2``; detected the same way when None.

    Returns:
        Whatever ``align_texts`` returns for the generated alignment set
        (annotated here as a list of 3-tuples).
    """
    if lang1 is None or lang2 is None:
        # fastlid.set_languages is process-wide state: override it only for
        # the duration of the detection and always put the old value back,
        # even if fastlid raises (e.g. on unexpected input).
        saved_languages = fastlid.set_languages
        fastlid.set_languages = ["en", "zh"]
        try:
            if lang1 is None:
                lang1, _ = fastlid(" ".join(lst1))
            if lang2 is None:
                lang2, _ = fastlid(" ".join(lst2))
        finally:
            # restore fastlid.set_languages
            fastlid.set_languages = saved_languages

    cmat = lists2cmat(
        lst1,
        lst2,
        tf_type=tf_type,
        idf_type=idf_type,
        dl_type=dl_type,
        norm=norm,
        lang1=lang1,
        lang2=lang2,
    )

    # delta is fixed at 7 here; it is not exposed through this function.
    pset = gen_pset(
        cmat,
        eps=eps,
        min_samples=min_samples,
        delta=7,
    )

    src_len, tgt_len = cmat.shape
    aset = gen_aset(pset, src_len, tgt_len)

    # NOTE(review): lst2 is passed before lst1, as in the original code --
    # confirm this matches the parameter order align_texts expects.
    final_list = align_texts(aset, lst2, lst1)

    return final_list
# return [("", "")]
|