Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,780 Bytes
5e8e534 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
# Copyright 2024 Hung-Shin Lee (hungshinlee@gmail.com)
# Apache 2.0
import itertools
import re
# Chinese numerals indexed by digit value: c_basic[0] is 零 ... c_basic[9] is 九.
c_basic = "零一二三四五六七八九"
# Per-character lookup used for digit-by-digit reading: the ten ASCII digits
# map to their numeral, and the decimal point maps to 點.
d2c = dict(zip("0123456789", c_basic))
d2c["."] = "點"
def num4year(matched):
    """Spell out each captured year digit by digit.

    `matched` is a regex match object whose groups are digit strings. Inside
    the full matched text, every group's text is replaced by its digits looked
    up one by one in ``c_basic`` (a captured "2024" becomes "二零二四").
    Characters outside the groups are returned unchanged.
    """
    spoken = matched.group(0)
    for year in matched.groups():
        digits = "".join(c_basic[int(ch)] for ch in year)
        spoken = spoken.replace(year, digits)
    return spoken
def num2chines_simple(matched):
    """Read `matched` one character at a time through the ``d2c`` table.

    `matched` is iterated directly, so it is expected to be a string of
    characters that are keys of ``d2c`` (not a regex match object); a
    character missing from ``d2c`` raises KeyError.
    """
    return "".join(d2c[ch] for ch in matched)
def num4percent(matched):
    """Turn a matched percentage into "百分之" followed by the spoken number.

    Group 1 of `matched` holds the number with one trailing character (the
    percent sign); that last character is dropped and the rest is converted
    with ``num2chinese``.
    """
    percent_text = matched.group(1)
    number_part = percent_text[:-1]  # everything except the trailing sign
    return "百分之" + num2chinese(number_part)
def num4cellphone(matched):
    """Read a matched phone number digit by digit.

    Group 1 of `matched` is the phone number as written in the text, possibly
    with a hyphen and whitespace between the digit runs. Each ASCII digit is
    replaced by the numeral at that index of ``c_basic``; every other
    character is dropped.

    Returns:
        The Chinese numerals of the phone number, without separators.
    """
    phone = matched.group(1)
    # Filter on digits instead of stripping " " and "-" only: a pattern that
    # allows `\s` around the hyphen also lets tabs or full-width spaces
    # through, and those would make int() raise ValueError.
    return "".join(c_basic[int(ch)] for ch in phone if ch in "0123456789")
def num4er(matched):
    """Return group 1 of `matched` with every "2" written as 二."""
    pieces = matched.group(1).split("2")
    return "二".join(pieces)
def num4liang(matched):
    """Return group 1 of `matched` with every "2" written as 兩."""
    pieces = matched.group(1).split("2")
    return "兩".join(pieces)
def num4general(matched):
    """Convert a generic number match, keeping any one-character prefix.

    Group 1 of `matched` is either a number, or one non-digit character
    followed by a number.

    * Starts with a digit: the whole text goes through ``num2chinese``.
    * Starts with an ASCII letter or a dash ("-" or "─"): the prefix is kept;
      a tail shorter than three characters is read as a quantity (MP3, F-16),
      a longer one is read character by character (AM104).
    * Any other prefix: the prefix is kept and the tail is read as a quantity.
    """
    token = matched.group(1)
    head, tail = token[0], token[1:]

    if re.match("[0-9]", head):
        return num2chinese(token)
    if re.match("[A-Za-z-─]", head) and len(tail) >= 3:
        # AM104
        return head + num2chines_simple(tail)
    # MP3 or F-16, and every non-letter prefix
    return head + num2chinese(tail)
def parse_num(text: str) -> str:
    """Rewrite the ASCII numbers in `text` as Chinese numerals.

    The rules run in a fixed order, each on the output of the previous one,
    so the specific readings claim their digits before the general fallback:

    1. Four-digit years followed by 年 (also ranges joined by 到/至) are
       read digit by digit.
    2. Percentages such as "12.5%" become "百分之...".
    3. Phone numbers shaped like "0964-498042" are read digit by digit.
    4. A "2" between two non-digit characters becomes 二 before 診/樓/月/號
       or after 初, and 兩 otherwise. A 2 that belongs to a decimal number
       ("2.5", "0.2") is left for the general rule.
    5. Every remaining number is converted by ``num4general``.

    Args:
        text: input text that may contain ASCII digits.

    Returns:
        The text with the recognised numbers replaced.
    """
    # year
    text = re.sub(r"([0-9]{4})[到至]([0-9]{4})年", num4year, text)
    text = re.sub(r"([0-9]{4})年", num4year, text)
    # percentage; the fraction may have any number of digits, otherwise
    # "12.55%" would be cut in the middle of the number
    text = re.sub(r"([0-9]+\.?[0-9]*%)", num4percent, text)
    # cellphone
    text = re.sub(r"([0-9]{4}\s?-\s?[0-9]{6})", num4cellphone, text)
    # single 2 to 二
    text = re.sub(r"([^\d]2[診樓月號])", num4er, text)
    text = re.sub(r"([初]2[^\d])", num4er, text)
    # single 2 to 兩; the character before the 2 must not be a decimal point
    # (a "." right after a digit) and the 2 must not be followed by ".<digit>",
    # so decimals like "0.2" and "2.5" are not broken apart
    text = re.sub(r"((?:[^\d.]|(?<!\d)\.)2(?!\.\d)[^\d])", num4liang, text)
    # general number; a "." is consumed only when digits follow it, so the
    # whole fraction is read at once and a sentence-final "." is preserved
    text = re.sub(r"([^0-9]?[0-9]+(?:\.[0-9]+)?)", num4general, text)
    return text
def num2chinese(num, big=False, simp=False, o=False, twoalt=True) -> str:
    """
    Converts numbers to Chinese representations.

    Adapted from https://gist.github.com/gumblex/0d65cad2ba607fd14de7

    Args:
        num: an int, a float, or a numeric string such as "-12.5". It is
            converted with ``str(num)`` and read from that text, so the
            digits of a string are used exactly as written.
        big: use financial characters.
        simp: use simplified characters instead of traditional characters.
        o: use 〇 for zero.
        twoalt: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns:
        The Chinese reading: an optional sign character, the integer part
        with units, and, when a fraction is present, 點/点 followed by the
        fractional digits read one by one.

    Raises:
        ValueError: if ``abs(num) >= 1e48``, if the text uses scientific
            notation, or if it is not a number at all.
    """
    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError("number out of range")
    elif "e" in nd.lower():
        # lower() so that "1E5" gets this message too instead of failing
        # later inside int() with an unrelated one
        raise ValueError("scientific notation is not supported")

    c_symbol = "正负点" if simp else "正負點"
    if o:  # formal
        twoalt = False
    if big:
        c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
        c_unit1 = "拾佰仟"
        c_twoalt = "贰" if simp else "貳"
    else:
        c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
        c_unit1 = "十百千"
        if twoalt:
            c_twoalt = "两" if simp else "兩"
        else:
            c_twoalt = "二"
    c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
    zero = c_basic[0]

    def revuniq(l):
        # Reverse `l`, merge equal neighbouring items, and join the rest.
        return "".join(k for k, g in itertools.groupby(reversed(l)))

    def squeeze_zero(s):
        # Merge runs of the zero character into a single one.
        while zero * 2 in s:
            s = s.replace(zero * 2, zero)
        return s

    result = []
    if nd[0] == "+":
        result.append(c_symbol[0])
    elif nd[0] == "-":
        result.append(c_symbol[1])
    if "." in nd:
        integer, remainder = nd.lstrip("+-").split(".")
    else:
        integer, remainder = nd.lstrip("+-"), None

    # `integer` is empty for inputs such as ".5"; read that as zero.
    if int(integer or "0"):
        # 4-digit groups, least significant group first
        splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(zero)
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                # Keep the leading zero placeholder like every other group
                # does (100020000 -> 一億零兩萬); a zero at the very front of
                # the number is stripped at the end.
                intresult.append(zero + c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == "0":
                    if ulist:  # ???0
                        ulist.append(zero)
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == "1" and unit[:2] == "00":
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == "2":
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        # Placeholder zeros coming from neighbouring groups can end up side
        # by side (100000003 -> 一億 + 零 + 零三); merge them before trimming
        # the zeros at both ends.
        result.append(squeeze_zero(revuniq(intresult)).strip(zero))
    else:
        result.append(zero)

    if remainder:
        result.append(c_symbol[2])
        result.append("".join(c_basic[int(ch)] for ch in remainder))
    return "".join(result)
if __name__ == "__main__":
    # Demo: print a sample sentence next to its converted form.
    for text in ("若手機仔幾多號?吾手機仔係0964-498042。",):
        print(f"{text} -> {parse_num(text)}")
|