Spaces:
Build error
Build error
File size: 3,070 Bytes
dab2de2 4c04f50 2f6222b dab2de2 51055fa dab2de2 4c04f50 dab2de2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
"""Convert two lists of str (texts) to correlation matrix."""
# pylint: disable=too-many-arguments, too-many-locals, unused-import
from typing import Dict, Iterable, List, Optional, Union # noqa
import numpy as np
from textacy.representations import Vectorizer
from fastlid import fastlid
from radiobee.en2zh_tokens import en2zh_tokens
from radiobee.insert_spaces import insert_spaces
from radiobee.gen_model import gen_model
from radiobee.smatrix import smatrix
# fmt: off
def lists2cmat(
    text1: Union[str, Iterable[str]],
    text2: Union[str, Iterable[str]],
    lang1: Optional[str] = None,
    lang2: Optional[str] = None,
    model: Optional[Vectorizer] = None,
    tf_type: str = "linear",
    idf_type: Optional[str] = "smooth",
    # dl_type: Optional[str] = "sqrt",  # "lucene-style tfidf"
    dl_type: Optional[str] = None,
    norm: Optional[str] = "l2",
    min_df: Union[int, float] = 1,
    max_df: Union[int, float] = 1.0,
    max_n_terms: Optional[int] = None,
    vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None,
) -> np.ndarray:
    # fmt: on
    """Convert two lists of texts to a correlation matrix (cmat).

    Both inputs are tokenized according to their language codes and the
    token lists are handed to ``smatrix``; its result is returned wrapped
    in ``np.array``.

    Args:
        text1: a single string or an iterable of strings (refer smatrix).
            A bare string is treated as a one-element list.
        text2: same as ``text1`` for the second text.
        lang1: optional language code of ``text1``; detected with
            ``fastlid`` (restricted to "en"/"zh") when None.
        lang2: optional language code of ``text2``; detected the same way
            when None.
        model: optional model; when None one is built with
            ``gen_model`` from the tokens of ``text1``.
        tf_type: term frequency type, forwarded to smatrix.
        idf_type: idf type, forwarded to smatrix.
        dl_type: doc length type, forwarded to smatrix.
        norm: norm, forwarded to smatrix.
        min_df: min doc freq, forwarded to smatrix.
        max_df: max doc freq, forwarded to smatrix.
        max_n_terms: max n terms, forwarded to smatrix.
        vocabulary_terms: vocab, refer smatrix.

    Returns:
        cmat as an ``np.ndarray``.
    """
    # Materialize the inputs: language detection below joins the texts,
    # which would exhaust a one-shot iterable (e.g. a generator) and leave
    # nothing for the tokenizers.
    text1 = [text1] if isinstance(text1, str) else list(text1)
    text2 = [text2] if isinstance(text2, str) else list(text2)

    if lang1 is None or lang2 is None:
        # Temporarily restrict detection to en/zh; always restore the
        # previous setting, even if detection raises.
        saved_languages = fastlid.set_languages
        fastlid.set_languages = ["en", "zh"]
        try:
            if lang1 is None:
                lang1, _ = fastlid(" ".join(text1))
            if lang2 is None:
                lang2, _ = fastlid(" ".join(text2))
        finally:
            fastlid.set_languages = saved_languages

    def zh_tokens(textzh):
        """Tokenize each text by inserting spaces and splitting on whitespace."""
        return [insert_spaces(elm).split() for elm in textzh]

    lang_pair = (lang1, lang2)
    if lang_pair == ("zh", "en"):
        vec1 = zh_tokens(text1)
        vec2 = en2zh_tokens(text2)
    elif lang_pair == ("zh", "zh"):
        vec1 = zh_tokens(text1)
        vec2 = zh_tokens(text2)
    elif lang_pair == ("en", "en"):
        vec1 = en2zh_tokens(text1)
        vec2 = en2zh_tokens(text2)
    else:
        # ("en", "zh") -- and, as in the original code, every other
        # combination of language codes also lands here.
        vec1 = en2zh_tokens(text1)
        vec2 = zh_tokens(text2)

    if model is None:
        model = gen_model(vec1)

    cmat = smatrix(
        vec1,
        vec2,
        model=model,
        tf_type=tf_type,
        idf_type=idf_type,
        dl_type=dl_type,
        norm=norm,
        min_df=min_df,
        max_df=max_df,
        max_n_terms=max_n_terms,
        vocabulary_terms=vocabulary_terms,
    )
    return np.array(cmat)
|