"""Convert two lists of str (texts) to correlation matrix.""" # pylint: disable=too-many-arguments, too-many-locals, unused-import from typing import Dict, Iterable, List, Optional, Union # noqa import numpy as np from textacy.representations import Vectorizer from fastlid import fastlid from radiobee.en2zh_tokens import en2zh_tokens from radiobee.insert_spaces import insert_spaces from radiobee.gen_model import gen_model from radiobee.smatrix import smatrix # fmt: off def lists2cmat( text1: Union[str, Iterable[str]], text2: Union[str, Iterable[str]], # text1: Union[str, List[str]], # text2: Union[str, List[str]], lang1: Optional[str] = None, lang2: Optional[str] = None, model: Vectorizer = None, tf_type: str = "linear", idf_type: Optional[str] = "smooth", # dl_type: Optional[str] = "sqrt", # "lucene-style tfidf" dl_type: Optional[str] = None, # norm: Optional[str] = "l2", # + "l2" min_df: Union[int, float] = 1, max_df: Union[int, float] = 1.0, max_n_terms: Optional[int] = None, vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None ) -> np.ndarray: # fmt: on """Convert two lists to cmat. Args: text1: refer smatrix text2: refer smatrix lang1: optional 1st lang code lang2: optional 2nd lang code dl_type: doc lenth idf_type: idf tyoe max_df: max doc freq max_n_terms: max n terms min_df: min doc freq model: optional model norm: norm tf_type: term freq type vocabulary_terms: vocab refer smatrix Returs cmat """ if isinstance(text1, str): text1 = [text1] if isinstance(text2, str): text2 = [text2] set_languages = fastlid.set_languages fastlid.set_languages = ["en", "zh"] if lang1 is None: lang1, _ = fastlid(" ".join(text1)) if lang2 is None: lang2, _ = fastlid(" ".join(text2)) # restore fastlid.set_languages fastlid.set_languages = set_languages # en2zh_tokens def zh_tokens(textzh): return [insert_spaces(elm).split() for elm in textzh] if lang1 in ["zh"] and lang2 in ["en"]: vec1 = zh_tokens(text1) vec2 = en2zh_tokens(text2) elif lang1 in ["zh"] and lang2 in ["zh"]: vec1 = zh_tokens(text1) vec2 = zh_tokens(text2) elif lang1 in ["en"] and lang2 in ["en"]: vec1 = en2zh_tokens(text1) vec2 = en2zh_tokens(text2) # if lang1 in ["en"] and lang2 in ["zh"]: else: vec1 = en2zh_tokens(text1) vec2 = zh_tokens(text2) if model is None: model = gen_model(vec1) cmat = smatrix( vec1, vec2, model=model, tf_type=tf_type, idf_type=idf_type, dl_type=dl_type, norm=norm, min_df=min_df, max_df=max_df, max_n_terms=max_n_terms, vocabulary_terms=vocabulary_terms, ) return np.array(cmat)