Spaces:

united-link
/

taiwanese-hakka-tts

Running on Zero

App Files Files Community

taiwanese-hakka-tts / ipa /convert_digits.py

txya900619

feat: init upload

5e8e534 7 months ago

raw

history blame

5.78 kB

	# Copyright 2024 Hung-Shin Lee (hungshinlee@gmail.com)
	# Apache 2.0

	import itertools
	import re

	# Chinese numerals indexed by digit value: c_basic[3] == "三".
	c_basic = "零一二三四五六七八九"
	# Character-level lookup: each ASCII digit maps to its numeral, and the
	# decimal point maps to "點".
	d2c = dict(zip("0123456789", c_basic))
	d2c["."] = "點"


	def num4year(matched):
	    """Spell out every captured group of a year match digit by digit.

	    Args:
	        matched: A regex match object whose capture groups are digit strings.

	    Returns:
	        The full matched text with each captured group replaced by its
	        digit-by-digit Chinese numerals; the text outside the groups is
	        kept as is.
	    """
	    spoken = matched.group(0)
	    for digits in matched.groups():
	        # Years are read one digit at a time, so no place-value units.
	        numerals = "".join(c_basic[int(digit)] for digit in digits)
	        spoken = spoken.replace(digits, numerals)
	    return spoken


	def num2chines_simple(matched):
	    """Transliterate a digit string character by character.

	    Args:
	        matched: A string made of the keys of ``d2c`` (ASCII digits and ".").

	    Returns:
	        The string with every character replaced through ``d2c``.

	    Raises:
	        KeyError: If a character is not a key of ``d2c``.
	    """
	    return "".join(d2c[char] for char in matched)


	def num4percent(matched):
	    """Turn a matched percentage into its "百分之…" reading.

	    Args:
	        matched: A regex match object whose group 1 is a number followed by
	            a one-character percent sign.

	    Returns:
	        "百分之" followed by the ``num2chinese`` reading of the number.
	    """
	    percent_text = matched.group(1)
	    # Drop the trailing percent sign; only the number itself is converted.
	    number_text = percent_text[:-1]
	    return "百分之{}".format(num2chinese(number_text))


	def num4cellphone(matched):
	    """Read a matched phone number out digit by digit.

	    Args:
	        matched: A regex match object whose group 1 is the phone number,
	            possibly containing a hyphen and whitespace separators.

	    Returns:
	        The Chinese numerals for the digits of the number, in order, with
	        every separator dropped.
	    """
	    number = matched.group(1)
	    # Keep only ASCII digits. The caller's pattern uses ``\s``, which also
	    # matches tabs, newlines and the ideographic space (U+3000); stripping
	    # just " " and "-" would leave those in and make ``int()`` raise.
	    return "".join(c_basic[int(ch)] for ch in number if ch in "0123456789")


	def num4er(matched):  # 2 to 二
	    """Rewrite the digit 2 in a match as "二".

	    Args:
	        matched: A regex match object; only group 1 is used.

	    Returns:
	        Group 1 with every "2" replaced by "二".
	    """
	    captured = matched.group(1)
	    # Splitting on "2" and re-joining with "二" swaps every occurrence.
	    return "二".join(captured.split("2"))


	def num4liang(matched):  # 2 to 兩
	    """Rewrite the digit 2 in a match as "兩".

	    Args:
	        matched: A regex match object; only group 1 is used.

	    Returns:
	        Group 1 with every "2" replaced by "兩".
	    """
	    captured = matched.group(1)
	    return captured.translate({ord("2"): "兩"})


	def num4general(matched):
	    """Convert a generic number match, keeping any one-character prefix.

	    Group 1 of the match is a number that may be preceded by a single
	    non-digit character. The prefix, when present, is copied through
	    unchanged and decides how the digits after it are read.

	    Args:
	        matched: A regex match object; only group 1 is used.

	    Returns:
	        The converted text for the match.
	    """
	    token = matched.group(1)
	    head, tail = token[0], token[1:]

	    if re.match("[A-Za-z-─]", head):
	        if len(tail) < 3:
	            # MP3 or F-16: short codes are read as a quantity.
	            reading = num2chinese(tail)
	        else:
	            # AM104: longer codes are read digit by digit.
	            reading = num2chines_simple(tail)
	        return "{}{}".format(head, reading)

	    if re.match("[0-9]", head):
	        # No prefix was captured: the whole token is the number.
	        return "{}".format(num2chinese(token))

	    return "{}{}".format(head, num2chinese(tail))


	def parse_num(text: str) -> str:
	    """Replace the Arabic numerals in *text* with Chinese numerals.

	    The rewrites run in a fixed order, from the most specific pattern to
	    the most general one, so that a later rule only sees the digits the
	    earlier rules left behind:

	    1. four-digit years followed by "年" (including "A到B年" / "A至B年"),
	    2. percentages,
	    3. phone numbers shaped like ``0964-498042``,
	    4. a lone "2" that is rewritten as "二",
	    5. a lone "2" that is rewritten as "兩",
	    6. every remaining number.

	    Args:
	        text: The input text.

	    Returns:
	        The text with the matched numbers rewritten.
	    """
	    # year
	    text = re.sub("([0-9]{4})[到至]([0-9]{4})年", num4year, text)
	    text = re.sub("([0-9]{4})年", num4year, text)

	    # percentage
	    # ``[0-9]*`` accepts any number of decimals; with a single optional
	    # decimal, "12.34%" was split and only "34%" was read as a percentage.
	    text = re.sub(r"([0-9]+\.?[0-9]*%)", num4percent, text)

	    # cellphone
	    text = re.sub(r"([0-9]{4}\s?-\s?[0-9]{6})", num4cellphone, text)

	    # single 2 to 二
	    text = re.sub(r"([^\d]2[診樓月號])", num4er, text)
	    text = re.sub(r"([初]2[^\d])", num4er, text)

	    # single 2 to 兩
	    # The lookahead skips a 2 that starts a decimal such as "2.5"; without
	    # it the 2 was rewritten here and the general rule below never saw the
	    # number as a whole ("有2.5" became "有兩.五").
	    text = re.sub(r"([^\d]2(?!\.[0-9])[^\d])", num4liang, text)

	    # general number
	    # As for percentages, take the whole fractional part in one match so
	    # that "3.123" is not cut into "3.1" and "23".
	    text = re.sub(r"([^0-9]?[0-9]+\.?[0-9]*)", num4general, text)

	    return text


	def num2chinese(num, big=False, simp=False, o=False, twoalt=True) -> str:
	    """Convert a number to its Chinese reading.

	    Adapted from https://gist.github.com/gumblex/0d65cad2ba607fd14de7

	    The integer part is read with place-value units (十/百/千 within each
	    group of four digits, 萬/億/兆/… between groups); the fractional part,
	    if any, is read digit by digit after the decimal-point character.

	    Args:
	        num: The number to convert. It is converted with ``str()`` first,
	            so a numeric string (e.g. ``"12.5"``) is accepted as well.
	        big: Use financial characters (壹, 貳, … 拾, 佰, 仟).
	        simp: Use simplified characters instead of traditional ones.
	        o: Use 〇 for zero.
	        twoalt: Use 两/兩 for two when appropriate.

	    Note that ``o`` and ``twoalt`` are ignored when ``big`` is used, and
	    ``twoalt`` is ignored when ``o`` is used (formal representation).

	    Returns:
	        The Chinese reading of ``num``.

	    Raises:
	        ValueError: If ``num`` is not a plain decimal number, if its
	            magnitude is 1e48 or more, or if it is written in scientific
	            notation.
	    """
	    # check num first
	    nd = str(num)
	    if abs(float(nd)) >= 1e48:
	        raise ValueError("number out of range")
	    elif "e" in nd.lower():
	        # Lower-casing also catches "1E5", which otherwise only failed
	        # later with an unrelated int() error.
	        raise ValueError("scientific notation is not supported")
	    c_symbol = "正负点" if simp else "正負點"
	    if o:  # formal
	        twoalt = False
	    if big:
	        c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
	        c_unit1 = "拾佰仟"
	        c_twoalt = "贰" if simp else "貳"
	    else:
	        c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
	        c_unit1 = "十百千"
	        if twoalt:
	            c_twoalt = "两" if simp else "兩"
	        else:
	            c_twoalt = "二"
	    c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
	    zero = c_basic[0]

	    def revuniq(parts):
	        # Join the parts in reverse order, merging runs of identical parts.
	        return "".join(k for k, g in itertools.groupby(reversed(parts)))

	    result = []
	    if nd[0] == "+":
	        result.append(c_symbol[0])
	    elif nd[0] == "-":
	        result.append(c_symbol[1])
	    if "." in nd:
	        integer, remainder = nd.lstrip("+-").split(".")
	    else:
	        integer, remainder = nd.lstrip("+-"), None
	    # ".5" has an empty integer part; read it as zero instead of letting
	    # int("") raise.
	    integer = integer or "0"
	    if int(integer):
	        # Groups of four digits, least significant group first.
	        splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
	        intresult = []
	        for nu, unit in enumerate(splitted):
	            # special cases
	            if int(unit) == 0:  # 0000
	                intresult.append(zero)
	                continue
	            elif nu > 0 and int(unit) == 2:  # 0002
	                # Keep the placeholder zero that the regular path below
	                # produces for the leading zeros of a group, so that
	                # 100020000 reads 一億零兩萬 rather than 一億兩萬. A zero
	                # at either end of the final string is stripped.
	                intresult.append(zero + c_twoalt + c_unit2[nu - 1])
	                continue
	            ulist = []
	            unit = unit.zfill(4)
	            for nc, ch in enumerate(reversed(unit)):
	                if ch == "0":
	                    if ulist:  # ???0
	                        ulist.append(zero)
	                elif nc == 0:
	                    ulist.append(c_basic[int(ch)])
	                elif nc == 1 and ch == "1" and all(i == "0" for i in unit[: nc + 1]):
	                    # special case for tens
	                    # edit the 'elif' if you don't like
	                    # 十四, 三千零十四, 三千三百一十四
	                    ulist.append(c_unit1[0])
	                elif nc > 1 and ch == "2":
	                    ulist.append(c_twoalt + c_unit1[nc - 1])
	                else:
	                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
	            ustr = revuniq(ulist)
	            if nu == 0:
	                intresult.append(ustr)
	            else:
	                intresult.append(ustr + c_unit2[nu - 1])
	        joined = revuniq(intresult)
	        # revuniq only merges identical list items, so an all-zero group
	        # next to a group that starts with zero still leaves two zeros in
	        # a row (100000003 read 一億零零三). Collapse them.
	        while zero * 2 in joined:
	            joined = joined.replace(zero * 2, zero)
	        result.append(joined.strip(zero))
	    else:
	        result.append(zero)
	    if remainder:
	        result.append(c_symbol[2])
	        result.append("".join(c_basic[int(ch)] for ch in remainder))
	    return "".join(result)


	if __name__ == "__main__":
	    # Quick manual check: convert one sample sentence containing a phone
	    # number and print it next to the original.
	    text = "若手機仔幾多號？吾手機仔係0964-498042。"
	    converted = parse_num(text)

	    print(f"{text} -> {converted}")