Spaces:
Sleeping
Sleeping
import opencc | |
from typing import Literal | |
import re | |
class Corrector:
    """Post-process SenseVoice transcripts into Traditional Chinese.

    SenseVoice model outputs Simplified Chinese only; this class converts the
    output to Traditional Chinese (OpenCC ``s2hk`` configuration) and then
    fixes common Cantonese spelling errors with an ordered list of regex
    substitutions.
    """

    # Compiled once at class-definition time and applied, in order, to the
    # text returned by the OpenCC converter.
    _REGULAR_ERRORS: tuple[tuple[re.Pattern, str], ...] = (
        # 俾 -> 畀, except when followed by 路支 / 斯麥 / 益.
        (re.compile(r"俾(?!(?:路支|斯麥|益))"), r"畀"),
        # 系 / 繫 -> 係, except in 聯系 / 聯繫 and 系統 / 繫統.
        (re.compile(r"(?<!(?:聯))[系繫](?!(?:統))"), r"係"),
        (re.compile(r"噶"), r"㗎"),
        # 咁 -> 噉 only when directly followed by one of the listed
        # characters or punctuation marks.
        (re.compile(r"咁(?=[我你佢就樣話係啊呀嘅,。])"), r"噉"),
        # 曬 -> 晒, except after 曝 / 晾 or before one of the listed
        # characters. This must be a zero-width negative lookahead: a
        # consuming group here would delete the character after 曬 and would
        # rewrite only the excepted contexts.
        (re.compile(r"(?<![曝晾])曬(?![衣太衫褲被命嘢相])"), r"晒"),
        # 翻 -> 返 only between 好 and 去 / 到 / 嚟.
        (re.compile(r"(?<=[好])翻(?=[去到嚟])"), r"返"),
        # Remove special tokens of the form <|name|>.
        (re.compile(r"<\|\w+\|>"), r""),
    )

    def __init__(self, corrector: Literal["opencc"] = "opencc"):
        """Set up the conversion backend.

        Args:
            corrector: Name of the backend. Only ``"opencc"`` is handled by
                this class; any other value makes :meth:`correct` raise.
        """
        self.corrector = corrector
        self.converter = None
        # Kept for attribute compatibility; nothing in this class assigns it.
        self.bert_model = None
        if corrector == "opencc":
            self.converter = opencc.OpenCC("s2hk")
        # Per-instance copy so callers may extend or reorder the rules
        # without affecting other instances.
        self.regular_errors: list[tuple[re.Pattern, str]] = list(
            self._REGULAR_ERRORS
        )

    def correct(self, text: str) -> str:
        """Return the corrected form of ``text``.

        Args:
            text: Input text to correct. Leading and trailing whitespace is
                stripped before processing.

        Returns:
            The corrected text, or the stripped (empty) string when the
            input contains nothing but whitespace.

        Raises:
            ValueError: If the configured corrector is not ``"opencc"``.
        """
        text = text.strip()
        if not text:  # Early return for empty string
            return text
        if self.corrector == "opencc":
            return self.opencc_correct(text)
        raise ValueError(
            f"Unsupported corrector {self.corrector!r}; only 'opencc' is available"
        )

    def opencc_correct(self, text: str) -> str:
        """Convert ``text`` with OpenCC, then apply the regex corrections.

        Args:
            text: Input text to convert.

        Returns:
            The converted text after every rule in ``self.regular_errors``
            has been applied in list order.
        """
        opencc_text = self.converter.convert(text)
        for pattern, replacement in self.regular_errors:
            opencc_text = pattern.sub(replacement, opencc_text)
        return opencc_text