File size: 4,454 Bytes
9b2107c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed under WTFPL or the Unlicense or CC0.
# This uses Python 3, but it's easy to port to Python 2 by changing
# strings to u'xx'.
import itertools
import re
def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
"""Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九)
Args:
num (str): arabic number to convert
big (bool, optional): use financial characters. Defaults to False.
simp (bool, optional): use simplified characters instead of tradictional characters. Defaults to True.
o (bool, optional): use 〇 for 'zero'. Defaults to False.
twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.
Raises:
ValueError: if number is more than 1e48
ValueError: if 'e' exposent in number
Returns:
str: converted number as hanzi characters
"""
# check num first
nd = str(num)
if abs(float(nd)) >= 1e48:
raise ValueError("number out of range")
if "e" in nd:
raise ValueError("scientific notation is not supported")
c_symbol = "正负点" if simp else "正負點"
if o: # formal
twoalt = False
if big:
c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
c_unit1 = "拾佰仟"
c_twoalt = "贰" if simp else "貳"
else:
c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
c_unit1 = "十百千"
if twoalt:
c_twoalt = "两" if simp else "兩"
else:
c_twoalt = "二"
c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
revuniq = lambda l: "".join(k for k, g in itertools.groupby(reversed(l)))
nd = str(num)
result = []
if nd[0] == "+":
result.append(c_symbol[0])
elif nd[0] == "-":
result.append(c_symbol[1])
if "." in nd:
integer, remainder = nd.lstrip("+-").split(".")
else:
integer, remainder = nd.lstrip("+-"), None
if int(integer):
splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
intresult = []
for nu, unit in enumerate(splitted):
# special cases
if int(unit) == 0: # 0000
intresult.append(c_basic[0])
continue
if nu > 0 and int(unit) == 2: # 0002
intresult.append(c_twoalt + c_unit2[nu - 1])
continue
ulist = []
unit = unit.zfill(4)
for nc, ch in enumerate(reversed(unit)):
if ch == "0":
if ulist: # ???0
ulist.append(c_basic[0])
elif nc == 0:
ulist.append(c_basic[int(ch)])
elif nc == 1 and ch == "1" and unit[1] == "0":
# special case for tens
# edit the 'elif' if you don't like
# 十四, 三千零十四, 三千三百一十四
ulist.append(c_unit1[0])
elif nc > 1 and ch == "2":
ulist.append(c_twoalt + c_unit1[nc - 1])
else:
ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
ustr = revuniq(ulist)
if nu == 0:
intresult.append(ustr)
else:
intresult.append(ustr + c_unit2[nu - 1])
result.append(revuniq(intresult).strip(c_basic[0]))
else:
result.append(c_basic[0])
if remainder:
result.append(c_symbol[2])
result.append("".join(c_basic[int(ch)] for ch in remainder))
return "".join(result)
def _number_replace(match) -> str:
    """Substitution callback that renders a matched number as Chinese characters.

    Args:
        match (re.Match): regex match whose whole matched text is the number
    Returns:
        str: the matched text converted by ``_num2chinese`` with its default options
    """
    # group(0) is the entire matched text, i.e. the digits to convert.
    return _num2chinese(match.group(0))
def replace_numbers_to_characters_in_text(text: str) -> str:
    """Rewrite every run of ASCII digits in a text as Chinese characters.

    Each maximal run of digits 0-9 is converted on its own through
    ``_number_replace``; all other characters are left where they are.

    Args:
        text (str): input text to transform
    Returns:
        str: the text with its digit runs replaced
    """
    digit_run = re.compile(r"[0-9]+")
    return digit_run.sub(_number_replace, text)
|