Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,300 Bytes
5e8e534 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# Copyright 2024 Hung-Shin Lee (hungshinlee@gmail.com)
# Apache 2.0
import re
from pathlib import Path
from typing import Optional, Tuple
from unicodedata import normalize

import jieba
import opencc
# Set jieba's logger to level 20 (the numeric value of logging.INFO).
jieba.setLogLevel(20)

# Replace jieba's default token pattern with one that treats runs of
# characters in U+2E80..U+E01EF, ASCII letters/digits and the symbols
# + # & . _ % - ' as a single block.
# The literal is split in two on purpose: the first part must NOT be raw so
# that \u2e80 and \U000e01ef become real characters, while the second part is
# raw so that "\." and "\-" are not invalid string escapes (SyntaxWarning on
# Python 3.12+). The concatenated pattern is byte-identical to the original.
jieba.re_han_default = re.compile(
    "([\u2e80-\U000e01ef" r"a-zA-Z0-9+#&\._%\-']+)", re.U
)

# OpenCC converter loaded from the "s2tw.json" configuration
# (Simplified Chinese -> Traditional Chinese, Taiwan standard, per OpenCC docs).
s2tw_converter = opencc.OpenCC("s2tw.json")
def update_jieba_dict(
    lexicon: list,
    jieba_dict_path: Path,
    high_freq_words: Optional[list] = None,
    high_freq_words_weight: int = 10,
) -> list:
    """Rewrite the jieba dictionary file from ``lexicon`` and force a reload.

    Every unique word is written on its own line as ``"<word> <freq>"``.
    The frequency is ``len(word)``, multiplied by ``high_freq_words_weight``
    when the word appears in ``high_freq_words``.

    Args:
        lexicon: Words to write; duplicates are dropped and the rest sorted.
        jieba_dict_path: Destination dictionary file (replaced if present).
        high_freq_words: Words whose frequency should be boosted. ``None``
            (the default) means no word is boosted.
        high_freq_words_weight: Multiplier applied to boosted words.

    Returns:
        The de-duplicated, sorted lexicon that was written.
    """
    # A set gives O(1) membership tests inside the write loop.
    boosted = frozenset(high_freq_words or ())
    lexicon = sorted(set(lexicon))

    # Remove the old dictionary and the hard-coded cache file so stale
    # entries cannot be picked up again.
    jieba_dict_path.unlink(missing_ok=True)
    Path("/tmp/jieba.cache").unlink(missing_ok=True)

    # Write explicitly as UTF-8: the platform default encoding may not be
    # able to represent the words, and jieba expects UTF-8 dictionaries.
    with jieba_dict_path.open("w", encoding="utf-8") as file:
        for word in lexicon:
            weight = high_freq_words_weight if word in boosted else 1
            file.write(f"{word} {len(word) * weight}\n")

    # Mark the default tokenizer as uninitialized so it re-initializes
    # on next use.
    jieba.dt.initialized = False
    return lexicon
def run_jieba(line: str) -> list:
    """Segment ``line`` with jieba and return the tokens as a list.

    ``jieba.cut`` is called with ``cut_all=False`` and ``HMM=False``.
    """
    # NOTE(review): the original comment here was mis-encoded. It appears to
    # warn that passing multi-line text to jieba loses the original line
    # structure - confirm against the upstream source.
    tokens = jieba.cut(line, cut_all=False, HMM=False)
    return list(tokens)
def normalize_text(text: str, replace_dict: dict, replace_regex: str) -> str:
    """Clean, NFKC-normalize, substitute, collapse whitespace and upper-case.

    Steps, in order:
      1. Delete U+0008 (backspace), U+FEFF (BOM) and U+0010 (DLE).
      2. Apply Unicode NFKC normalization.
      3. Replace every match of ``replace_regex`` with
         ``replace_dict[<matched text>]``.
      4. Collapse all whitespace runs to single spaces, strip the ends,
         and upper-case the result.

    Args:
        text: Input text.
        replace_dict: Maps matched substrings to their replacements.
        replace_regex: Pattern whose matches are keys of ``replace_dict``.
            An empty pattern means "nothing to replace".

    Returns:
        The normalized text.

    Raises:
        KeyError: If ``replace_regex`` matches text that is not a key of
            ``replace_dict``.
    """
    # One pass instead of three separate regex substitutions.
    text = text.translate({0x08: None, 0xFEFF: None, 0x10: None})
    text = normalize("NFKC", text)

    # An empty pattern matches the empty string at every position, which
    # would look up replace_dict[""] and raise KeyError; skip it instead.
    if replace_regex:
        text = re.sub(
            replace_regex, lambda match: replace_dict[match.group(0)], text
        )

    return " ".join(text.split()).upper()
def apply_v2f(word_list: list, v2f_dict: dict, v2f_regex: str) -> list:
    """Apply the v2f substitution to every word in ``word_list``.

    Each match of ``v2f_regex`` inside a word is replaced with
    ``v2f_dict[<matched text>]``.

    Args:
        word_list: Words to process; the input list is not modified.
        v2f_dict: Maps matched substrings to their replacements.
        v2f_regex: Pattern whose matches are keys of ``v2f_dict``. An empty
            pattern means "nothing to replace".

    Returns:
        A new list with the substituted words, in the original order.

    Raises:
        KeyError: If ``v2f_regex`` matches text that is not a key of
            ``v2f_dict``.
    """
    # An empty pattern matches the empty string at every position, which
    # would look up v2f_dict[""] and raise KeyError; return a copy instead.
    if not v2f_regex:
        return list(word_list)

    # Compile once rather than re-resolving the pattern for every word.
    pattern = re.compile(v2f_regex)
    return [
        pattern.sub(lambda match: v2f_dict[match.group(0)], word)
        for word in word_list
    ]
def prep_regex(
    delimiter_list: list,
    replace_dict: Optional[dict] = None,
    v2f_dict: Optional[dict] = None,
) -> Tuple[str, str, str]:
    """Build the alternation patterns used for splitting and substitution.

    All items are passed through ``re.escape`` so they match literally.

    Args:
        delimiter_list: Delimiter strings, joined in the given order.
        replace_dict: Mapping whose keys form ``replace_regex``. ``None`` or
            an empty mapping yields an empty pattern.
        v2f_dict: Mapping whose keys form ``v2f_regex``. ``None`` or an
            empty mapping yields an empty pattern.

    Returns:
        ``(delimiter_regex, replace_regex, v2f_regex)``. Either of the last
        two is ``""`` when its mapping is empty.
    """

    def _longest_first(mapping: Optional[dict]) -> str:
        # Regex alternation tries branches left to right, so longer keys must
        # come first or a shorter key that is a prefix would shadow them.
        # sorted() is stable, so equal-length keys keep their mapping order.
        if not mapping:
            return ""
        ordered = sorted(mapping, key=len, reverse=True)
        return "|".join(map(re.escape, ordered))

    delimiter_regex = "|".join(map(re.escape, delimiter_list))
    replace_regex = _longest_first(replace_dict)
    # The original joined the v2f keys in mapping order; they are now ordered
    # the same way as the replace keys for consistency.
    v2f_regex = _longest_first(v2f_dict)
    return delimiter_regex, replace_regex, v2f_regex
|