File size: 2,187 Bytes
1d7163f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import opencc
from typing import Literal
import re



class Corrector:
    """
    Post-process SenseVoice transcripts into Traditional Chinese (Hong Kong).

    SenseVoice model outputs Simplified Chinese only; this class converts the
    output to Traditional Chinese with OpenCC ("s2hk") and then fixes common
    Cantonese spelling errors with an ordered list of regex substitutions.
    """

    # (pattern, replacement) pairs applied in order after the OpenCC
    # conversion. Compiled once at class creation and shared by all instances.
    _REGULAR_ERRORS = (
        # 俾 -> 畀, unless followed by 路支 / 斯麥 / 益.
        (re.compile(r"俾(?!(?:路支|斯麥|益))"), r"畀"),
        # 系 / 繫 -> 係, unless preceded by 聯 or followed by 統.
        (re.compile(r"(?<!(?:聯))[系繫](?!(?:統))"), r"係"),
        (re.compile(r"噶"), r"㗎"),
        # 咁 -> 噉 only when followed by one of the listed characters.
        (re.compile(r"咁(?=[我你佢就樣就話係啊呀嘅,。])"), r"噉"),
        # 曬 -> 晒 (Cantonese particle), except where 曬 is the verb
        # "to sun / to expose": after 曝 / 晾, or before one of the listed
        # objects (曬衣, 曬太陽, 曬相, ...). The exclusion must be a negative
        # lookahead: a consuming group here would match only those verb
        # contexts and delete the following character ("曬衣服" -> "晒服").
        (re.compile(r"(?<![曝晾])曬(?![衣太衫褲被命嘢相])"), r"晒"),
        # 翻 -> 返 only between 好 and 去 / 到 / 嚟.
        (re.compile(r"(?<=[好])翻(?=[去到嚟])"), r"返"),
        # Strip special tokens of the form <|word|>.
        (re.compile(r"<\|\w+\|>"), r""),
    )

    def __init__(self, corrector: Literal["opencc"] = "opencc"):
        """
        Set up the conversion backend.

        Args:
            corrector: Name of the backend. Only "opencc" is implemented; any
                other value is stored but makes `correct` raise ValueError on
                non-empty input.
        """
        self.corrector = corrector
        self.converter = None
        # Never assigned elsewhere in this class; kept so the attribute
        # remains available to existing callers.
        self.bert_model = None
        # Always defined so the attribute exists whatever backend was chosen.
        self.regular_errors: list[tuple[re.Pattern, str]] = []

        if corrector == "opencc":
            self.converter = opencc.OpenCC("s2hk")
            self.regular_errors = list(self._REGULAR_ERRORS)

    def correct(self, text: str) -> str:
        """
        Correct the output text with the configured backend.

        Args:
            text: Input text to correct. Leading and trailing whitespace is
                stripped first.
        Returns:
            Corrected text string; an empty string if nothing is left after
            stripping.
        Raises:
            ValueError: If the configured corrector is not "opencc" and the
                stripped text is non-empty.
        """
        text = text.strip()
        if not text:  # Early return for empty string
            return text

        if self.corrector == "opencc":
            return self.opencc_correct(text)
        raise ValueError(
            f"unsupported corrector {self.corrector!r}: only 'opencc' is available"
        )

    def opencc_correct(self, text: str) -> str:
        """
        Convert text using OpenCC, then apply the regex fixes in order.

        Args:
            text: Input text to convert
        Returns:
            Converted text string
        """
        opencc_text = self.converter.convert(text)
        for pattern, replacement in self.regular_errors:
            opencc_text = pattern.sub(replacement, opencc_text)

        return opencc_text