File size: 3,070 Bytes
dab2de2
4c04f50
 
2f6222b
dab2de2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51055fa
dab2de2
 
 
 
 
 
 
 
 
 
4c04f50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dab2de2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""Convert two lists of str (texts) to correlation matrix."""
# pylint: disable=too-many-arguments, too-many-locals, unused-import

from typing import Dict, Iterable, List, Optional, Union  # noqa

import numpy as np
from textacy.representations import Vectorizer
from fastlid import fastlid

from radiobee.en2zh_tokens import en2zh_tokens
from radiobee.insert_spaces import insert_spaces
from radiobee.gen_model import gen_model
from radiobee.smatrix import smatrix


# fmt: off
def lists2cmat(
        text1: Union[str, Iterable[str]],
        text2: Union[str, Iterable[str]],
        # text1: Union[str, List[str]],
        # text2: Union[str, List[str]],
        lang1: Optional[str] = None,
        lang2: Optional[str] = None,
        model: Vectorizer = None,
        tf_type: str = "linear",
        idf_type: Optional[str] = "smooth",
        # dl_type: Optional[str] = "sqrt",  # "lucene-style tfidf"
        dl_type: Optional[str] = None,  #
        norm: Optional[str] = "l2",  # + "l2"
        min_df: Union[int, float] = 1,
        max_df: Union[int, float] = 1.0,
        max_n_terms: Optional[int] = None,
        vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
) -> np.ndarray:
    # fmt: on
    """Convert two lists of texts to a correlation matrix (cmat).

    Each input is tokenized according to its language ("en" or "zh"),
    a model is generated from the first input when none is supplied,
    and the result of ``smatrix`` is returned as a numpy array.

    Args:
        text1: a str or an iterable of str, refer smatrix
        text2: a str or an iterable of str, refer smatrix
        lang1: optional 1st lang code; detected with fastlid
            (restricted to "en"/"zh") when None
        lang2: optional 2nd lang code; detected with fastlid
            (restricted to "en"/"zh") when None
        dl_type: doc length type, passed on to smatrix
        idf_type: idf type, passed on to smatrix
        max_df: max doc freq, passed on to smatrix
        max_n_terms: max n terms, passed on to smatrix
        min_df: min doc freq, passed on to smatrix
        model: optional model; generated from the tokens of text1
            via gen_model when None
        norm: norm, passed on to smatrix
        tf_type: term freq type, passed on to smatrix
        vocabulary_terms: vocab, refer smatrix

    Returns:
        cmat, the output of smatrix wrapped in ``np.array``
    """
    # Materialize the inputs: a one-shot iterable (e.g. a generator) would
    # otherwise be exhausted by language detection and reach the tokenizers
    # empty.
    text1 = [text1] if isinstance(text1, str) else list(text1)
    text2 = [text2] if isinstance(text2, str) else list(text2)

    if lang1 is None or lang2 is None:
        # fastlid.set_languages is process-wide state: narrow it to en/zh
        # for detection and always restore it, even if detection raises.
        saved_languages = fastlid.set_languages
        fastlid.set_languages = ["en", "zh"]
        try:
            if lang1 is None:
                lang1, _ = fastlid(" ".join(text1))
            if lang2 is None:
                lang2, _ = fastlid(" ".join(text2))
        finally:
            fastlid.set_languages = saved_languages

    def zh_tokens(textzh):
        """Split each text into tokens after insert_spaces."""
        return [insert_spaces(elm).split() for elm in textzh]

    # NOTE(review): en2zh_tokens presumably maps English text to Chinese
    # tokens so that both sides share one vocabulary -- confirm in
    # radiobee.en2zh_tokens.
    if lang1 == "zh" and lang2 == "en":
        vec1 = zh_tokens(text1)
        vec2 = en2zh_tokens(text2)
    elif lang1 == "zh" and lang2 == "zh":
        vec1 = zh_tokens(text1)
        vec2 = zh_tokens(text2)
    elif lang1 == "en" and lang2 == "en":
        vec1 = en2zh_tokens(text1)
        vec2 = en2zh_tokens(text2)
    else:
        # lang1 "en" / lang2 "zh"; any other combination of language
        # codes also falls through to this branch.
        vec1 = en2zh_tokens(text1)
        vec2 = zh_tokens(text2)

    if model is None:
        model = gen_model(vec1)

    cmat = smatrix(
        vec1,
        vec2,
        model=model,
        tf_type=tf_type,
        idf_type=idf_type,
        dl_type=dl_type,
        norm=norm,
        min_df=min_df,
        max_df=max_df,
        max_n_terms=max_n_terms,
        vocabulary_terms=vocabulary_terms,
    )

    return np.array(cmat)