"""Generate align set (aset) based on pset (pair set), src_len and tgt_len."""
from typing import List, Tuple, Union
from itertools import zip_longest

# from logzero import logger


def _fill_gap(prev0: int, prev1: int, next0: int, next1: int):
    """Yield placeholder rows for the indices strictly between two anchors.

    Each row is ``(i, j, "")``; when one side runs out of indices before the
    other, its slot is filled with ``""`` as well. Nothing is yielded when
    both index ranges are empty (including when an anchor does not advance).
    """
    # The empty third iterable forces every row to be a triple whose last
    # field is the fill value.
    return zip_longest(
        range(prev0 + 1, next0),
        range(prev1 + 1, next1),
        (),
        fillvalue="",
    )


def gen_aset(
    pset: List[Tuple[Union[str, float], Union[str, float], Union[str, float]]],
    src_len: int,  # n_rows
    tgt_len: int,  # n_cols
) -> List[Tuple[Union[str, float], Union[str, float], Union[str, float]]]:
    """Generate align set (aset) based on pset, src_len and tgt_len.

    The anchors in ``pset`` are kept as-is, and every index that is not
    covered by an anchor is emitted as a placeholder row so that the result
    walks through ``0..tgt_len-1`` in the first column and ``0..src_len-1``
    in the second column.

    Per the original notes: ``src_len, tgt_len = cmat.shape``.

    Args:
        pset: anchor rows ``[x, y, score]``; ``x`` and ``y`` are converted
            with ``int()``. ``x`` indexes the ``tgt_len`` axis and ``y`` the
            ``src_len`` axis (original notes: x is lang2/zh, y is lang1/en,
            score is a cosine similarity). ``[]`` and ``[[]]`` are both
            treated as "no anchors".
        src_len: number of items on the second-column axis (lang1, en).
        tgt_len: number of items on the first-column axis (lang2, zh).

    Returns:
        A list of rows. Anchor rows are the original ``pset`` elements;
        filler rows are ``(i, j, "")`` tuples where a missing index is
        ``""``. All rows, including those for empty ``pset``, have three
        fields.
    """
    # No anchors at all ([] or [[]]): everything is one big gap. Goes through
    # the same helper so these rows are triples like on every other path.
    if not pset or (len(pset) == 1 and not pset[0]):
        return list(_fill_gap(-1, -1, tgt_len, src_len))

    aset: list = []
    prev0, prev1 = -1, -1
    for anchor in pset:
        raw0, raw1, *_ = anchor
        idx0, idx1 = int(raw0), int(raw1)
        # Pad the indices skipped since the previous anchor, then the anchor.
        aset.extend(_fill_gap(prev0, prev1, idx0, idx1))
        aset.append(anchor)
        prev0, prev1 = idx0, idx1

    # Trailing indices after the last anchor, if any.
    aset.extend(_fill_gap(prev0, prev1, tgt_len, src_len))

    return aset