# Copyright 2024    Hung-Shin Lee (hungshinlee@gmail.com)
# Apache 2.0
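
"""Helpers for normalizing Chinese text and segmenting it with jieba."""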

import re
from pathlib import Path
from unicodedata import normalize

import jieba
import opencc

# Silence jieba's startup logging (20 == logging.INFO).
jieba.setLogLevel(20)

# Widen jieba's default token pattern so CJK characters, Latin letters,
# digits, and a few in-word symbols are kept together during segmentation.
jieba.re_han_default = re.compile(r"([\u2e80-\U000e01efa-zA-Z0-9+#&._%\-']+)", re.U)

# Simplified-to-Traditional (Taiwan standard) converter.
s2tw_converter = opencc.OpenCC("s2tw.json")
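# Illustrative example: s2tw_converter.convert("汉字") == "ζΌ’ε­—".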


def update_jieba_dict(
    lexicon: list,
    jieba_dict_path: Path,
    high_freq_words: list | None = None,
    high_freq_words_weight: int = 10,
) -> list:
    """Write the deduplicated lexicon to jieba_dict_path as a jieba dictionary.

    Words in high_freq_words get their frequency multiplied by
    high_freq_words_weight so jieba prefers to keep them whole.
    """
    # Avoid the mutable-default-argument pitfall; use a set for O(1) lookups.
    high_freq_words = set(high_freq_words or ())
    lexicon = sorted(set(lexicon))

    # Drop the old dictionary and jieba's on-disk cache so the rewritten
    # dictionary is actually reloaded on the next cut.
    jieba_dict_path.unlink(missing_ok=True)
    Path("/tmp/jieba.cache").unlink(missing_ok=True)

    with jieba_dict_path.open("w", encoding="utf-8") as file:
        for word in lexicon:
            if word in high_freq_words:
                file.write(f"{word} {len(word) * high_freq_words_weight}\n")
            else:
                file.write(f"{word} {len(word)}\n")

    # Force jieba to re-initialize so the new dictionary takes effect.
    jieba.dt.initialized = False

    return lexicon
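
# The written dictionary holds one "word frequency" pair per line; with the
# default weight of 10 and illustrative words, the file would contain e.g.:
#   ε°η£ 2
#   ε°η£ε€§ε­Έ 40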


def run_jieba(line: str) -> list:
    """Segment one line with jieba (dictionary-only: cut_all=False, HMM=False)."""
    # NOTE: jieba loses the original line structure when given multi-line
    # text, so callers should pass a single line at a time.

    seg_list = list(jieba.cut(line, cut_all=False, HMM=False))

    return seg_list


def normalize_text(text: str, replace_dict: dict, replace_regex: str) -> str:
    """Strip control characters, apply NFKC normalization and the
    replace_dict substitutions, collapse whitespace, and uppercase."""

    def replace_match(match):
        return replace_dict[match.group(0)]

    text = re.sub("\x08", "", text)  # backspace
    text = re.sub("\ufeff", "", text)  # byte-order mark
    text = re.sub("\u0010", "", text)  # data link escape
    text = normalize("NFKC", text)
    if replace_regex:  # guard: an empty pattern would match at every position
        text = re.sub(replace_regex, replace_match, text)
    text = " ".join(text.split()).upper()

    return text


def apply_v2f(word_list: list, v2f_dict: dict, v2f_regex: str) -> list:
    """Apply the v2f character mapping to every word in word_list."""
    if not v2f_regex:  # guard: an empty pattern would match at every position
        return list(word_list)

    result = []
    for word in word_list:
        result.append(re.sub(v2f_regex, lambda x: v2f_dict[x.group(0)], word))

    return result


def prep_regex(
    delimiter_list: list,
    replace_dict: dict | None = None,
    v2f_dict: dict | None = None,
) -> tuple[str, str, str]:
    """Build the alternation regexes used for splitting on delimiters,
    normalize_text replacement, and apply_v2f rewriting."""
    delimiter_regex = "|".join(map(re.escape, delimiter_list))

    replace_regex = ""
    if replace_dict:
        # Longest keys first so that longer replacements take precedence
        # over any of their prefixes.
        sorted_keys = sorted(replace_dict.keys(), key=len, reverse=True)
        replace_regex = "|".join(map(re.escape, sorted_keys))

    v2f_regex = ""
    if v2f_dict:
        v2f_regex = "|".join(map(re.escape, v2f_dict.keys()))

    return delimiter_regex, replace_regex, v2f_regex
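

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original recipe: the sample
    # lexicon, strings, and /tmp path below are illustrative assumptions.
    demo_lexicon = ["ε°η£", "ε°η£ε€§ε­Έ", "ε€§ε­Έ", "θͺžιŸ³"]
    demo_dict_path = Path("/tmp/demo_jieba_dict.txt")

    update_jieba_dict(demo_lexicon, demo_dict_path, high_freq_words=["ε°η£ε€§ε­Έ"])
    jieba.set_dictionary(str(demo_dict_path))

    replace_dict = {",": " ", "。": " "}
    _, replace_regex, _ = prep_regex(
        delimiter_list=[",", "。"], replace_dict=replace_dict
    )
    line = normalize_text("ε°η£ε€§ε­Έηš„θͺžιŸ³,η ”η©Άγ€‚", replace_dict, replace_regex)
    # The boosted frequency should keep "ε°η£ε€§ε­Έ" as a single token.
    print(run_jieba(line))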