Spaces:

laubonghaudoi
/

cantonese-srt

Sleeping

App Files Files Community

cantonese-srt / corrector /Corrector.py

laubonghaudoi

Initial commit

1d7163f 3 months ago

raw

history blame

2.19 kB

	import opencc
	from typing import Literal
	import re



	class Corrector:
	    """Post-process SenseVoice transcripts into Traditional Chinese.

	    The SenseVoice model outputs Simplified Chinese only. This class converts
	    that output to Traditional Chinese with OpenCC (``s2hk`` configuration)
	    and then fixes common Cantonese spelling errors with an ordered list of
	    regex substitutions.

	    Attributes set by ``__init__``:
	        corrector: Name of the selected backend.
	        converter: The ``opencc.OpenCC`` instance, or ``None`` when the
	            backend is not ``"opencc"``.
	        bert_model: Always ``None`` here; no BERT backend is implemented.
	        regular_errors: Per-instance list of ``(pattern, replacement)``
	            pairs; only set when the backend is ``"opencc"``.
	    """

	    # (compiled pattern, replacement) pairs, applied in this order after the
	    # OpenCC conversion.
	    _REGULAR_ERRORS = (
	        # 俾 -> 畀, except in 俾路支, 俾斯麥 and 俾益. The alternation uses a
	        # plain "|": an escaped "\|" would match a literal pipe character and
	        # the exclusions would never apply.
	        (re.compile(r"俾(?!路支|斯麥|益)"), "畀"),
	        # 系/繫 -> 係, unless preceded by 聯 or followed by 統.
	        (re.compile(r"(?<!聯)[系繫](?!統)"), "係"),
	        (re.compile(r"噶"), "㗎"),
	        # 咁 -> 噉 only when directly followed by one of these characters.
	        (re.compile(r"咁(?=[我你佢就樣話係啊呀嘅，。])"), "噉"),
	        # 曬 -> 晒, unless preceded by 曝/晾 or followed by one of the listed
	        # characters.
	        # NOTE(review): the previous pattern used a consuming group "(?:...)"
	        # here, which deleted the following character and only fired in the
	        # listed contexts. A negative lookahead mirrors the negative
	        # lookbehind and is assumed to be the intent -- confirm.
	        (re.compile(r"(?<![曝晾])曬(?![衣太衫褲被命嘢相])"), "晒"),
	        # 翻 -> 返 between 好 and 去/到/嚟.
	        (re.compile(r"(?<=好)翻(?=[去到嚟])"), "返"),
	        # Strip "<|...|>" tags such as "<|yue|>". The pipes are escaped once
	        # so they are literal characters, not alternation.
	        (re.compile(r"<\|\w+\|>"), ""),
	    )

	    def __init__(self, corrector: Literal["opencc"] = "opencc"):
	        """Create a corrector.

	        Args:
	            corrector: Backend to use. Only ``"opencc"`` is implemented; any
	                other value is stored as-is and rejected later by
	                :meth:`correct`.
	        """
	        self.corrector = corrector
	        self.converter = None
	        self.bert_model = None

	        if corrector == "opencc":
	            self.converter = opencc.OpenCC("s2hk")
	            # Copy into a fresh list so that mutating one instance's rules
	            # does not affect other instances.
	            self.regular_errors: list[tuple[re.Pattern, str]] = list(
	                self._REGULAR_ERRORS
	            )

	    def correct(self, text: str) -> str:
	        """Correct ``text`` with the configured backend.

	        Args:
	            text: Input text to correct. Leading and trailing whitespace is
	                stripped first.

	        Returns:
	            The corrected text, or the empty string if ``text`` is empty or
	            whitespace-only.

	        Raises:
	            ValueError: If the configured corrector is not ``"opencc"``
	                (only checked for non-empty input).
	        """
	        text = text.strip()
	        if not text:  # Nothing to convert.
	            return text

	        if self.corrector == "opencc":
	            return self.opencc_correct(text)
	        raise ValueError(
	            f"Unsupported corrector {self.corrector!r}: only 'opencc' is supported"
	        )

	    def opencc_correct(self, text: str) -> str:
	        """Convert ``text`` with OpenCC, then apply the regex fix-ups.

	        Args:
	            text: Input text to convert.

	        Returns:
	            The converted text after every rule in ``self.regular_errors``
	            has been applied in order.
	        """
	        return self._apply_regular_errors(self.converter.convert(text))

	    def _apply_regular_errors(self, text: str) -> str:
	        """Apply each ``(pattern, replacement)`` rule to ``text`` in order."""
	        for pattern, replacement in self.regular_errors:
	            text = pattern.sub(replacement, text)
	        return text