Spaces:
Running
Running
File size: 2,187 Bytes
1d7163f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import opencc
from typing import Literal
import re
class Corrector:
    """
    Post-process SenseVoice transcripts into Traditional Chinese (Hong Kong).

    SenseVoice model outputs Simplified Chinese only; this class converts the
    output to Traditional Chinese with OpenCC (``s2hk`` configuration) and then
    applies a table of regex substitutions that fix common Cantonese spelling
    errors and strip ``<|...|>`` special tokens.

    Attributes are plain instance attributes:
      - ``corrector``: name of the selected backend (only ``"opencc"``).
      - ``converter``: the ``opencc.OpenCC`` instance, or ``None`` when the
        backend is not ``"opencc"``.
      - ``bert_model``: always ``None`` here; no code in this class uses it.
      - ``regular_errors``: per-instance list of ``(compiled pattern,
        replacement)`` pairs applied in order after conversion.
    """

    # Compiled once at class-definition time and copied into each instance.
    # Order matters: the rules are applied sequentially to the converted text.
    _DEFAULT_REGULAR_ERRORS = (
        # 俾 -> 畀, except when followed by 路支 / 斯麥 / 益.
        (re.compile(r"俾(?!(?:路支|斯麥|益))"), "畀"),
        # 系 / 繫 -> 係, except when preceded by 聯 or followed by 統.
        (re.compile(r"(?<!(?:聯))[系繫](?!(?:統))"), "係"),
        # 噶 -> 㗎 everywhere.
        (re.compile(r"噶"), "㗎"),
        # 咁 -> 噉 only when directly followed by one of the listed characters.
        (re.compile(r"咁(?=[我你佢就樣話係啊呀嘅,。])"), "噉"),
        # 曬 -> 晒 unless preceded by 曝 / 晾 or followed by one of the listed
        # characters (presumably the contexts where 曬 is the verb and must be
        # kept). This must be a zero-width lookahead: a consuming group here
        # would replace the following character as well and delete it.
        (re.compile(r"(?<![曝晾])曬(?![衣太衫褲被命嘢相])"), "晒"),
        # 翻 -> 返 only between 好 and one of 去 / 到 / 嚟.
        (re.compile(r"(?<=[好])翻(?=[去到嚟])"), "返"),
        # Remove special tokens of the form <|word|>.
        (re.compile(r"<\|\w+\|>"), ""),
    )

    def __init__(self, corrector: Literal["opencc"] = "opencc"):
        """
        Create a corrector.

        Args:
            corrector: Backend name. Only ``"opencc"`` is implemented; any
                other value is stored as-is and rejected later by ``correct``.
        """
        self.corrector = corrector
        self.converter = None
        self.bert_model = None
        if corrector == "opencc":
            self.converter = opencc.OpenCC("s2hk")
        # Copy into a list so each instance can customise its own rules
        # without affecting the shared class-level defaults.
        self.regular_errors: list[tuple[re.Pattern, str]] = list(
            self._DEFAULT_REGULAR_ERRORS
        )

    def correct(self, text: str) -> str:
        """
        Correct the output text using the configured backend.

        Args:
            text: Input text to correct. Leading and trailing whitespace is
                stripped before processing.

        Returns:
            The corrected text, or the stripped (empty) string when nothing
            remains after stripping.

        Raises:
            ValueError: If the text is non-empty and ``self.corrector`` is
                not ``"opencc"``.
        """
        text = text.strip()
        if not text:  # Early return for empty string
            return text
        if self.corrector == "opencc":
            return self.opencc_correct(text)
        raise ValueError(
            f"Unsupported corrector {self.corrector!r}; only 'opencc' is supported"
        )

    def opencc_correct(self, text: str) -> str:
        """
        Convert text using OpenCC, then apply the regex correction table.

        Args:
            text: Input text to convert.

        Returns:
            The converted text after every ``(pattern, replacement)`` pair in
            ``self.regular_errors`` has been applied in order.
        """
        opencc_text = self.converter.convert(text)
        for pattern, replacement in self.regular_errors:
            opencc_text = pattern.sub(replacement, opencc_text)
        return opencc_text
|