File size: 1,076 Bytes
811503f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c04f50
811503f
 
 
4c04f50
811503f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""Test lists2cmat."""
# pylint: disable=invalid-name

from itertools import zip_longest
from fastlid import fastlid
from radiobee.loadtext import loadtext
from radiobee.lists2cmat import lists2cmat

file1 = "data/test_en.txt"
file2 = "data/test_zh.txt"
file1 = "data/hlm-ch1-en.txt"
file2 = "data/hlm-ch1-zh.txt"

# assume English or Chinese
fastlid.set_languages = ["en", "zh", ]

text1 = loadtext(file1)
text2 = loadtext(file2)

lang1, _ = fastlid(text1)
lang2, _ = fastlid(text2)


def test_lists2cmat_hlm():
    """Test lists2cmat."""

    lst1, lst2 = [], []

    if text1:
        lst1 = [_.strip() for _ in text1.splitlines() if _.strip()]
    if text2:
        lst2 = [_.strip() for _ in text2.splitlines() if _.strip()]

    # en                zh
    len(lst1) == 135, len(lst2) == 55

    # cmat = texts2cmat(lst1, lst2, lang1, lang2)
    cmat = lists2cmat(lst1, lst2, lang1, lang2)

    assert cmat.shape == (55, 135)

    cmat21 = lists2cmat(lst2, lst1, lang2, lang1)

    assert cmat21.shape == (135, 55)
    assert lists2cmat(lst2, lst1).mean() > 0.05  # 0.09