|
import re |
|
from typing import Union |
|
|
|
from proces import preprocess |
|
|
|
from .an2cn import An2Cn |
|
from .conf import NUMBER_CN2AN, UNIT_CN2AN, STRICT_CN_NUMBER, NORMAL_CN_NUMBER, NUMBER_LOW_AN2CN, UNIT_LOW_AN2CN |
|
|
|
|
|
class Cn2An(object):
    """Convert Chinese numerals (optionally mixed with Arabic digits) to numbers.

    Supported modes (see ``mode_list``):

    * ``strict`` -- only characters listed in ``STRICT_CN_NUMBER`` (plus the
      decimal mark "点" and the sign "负") are accepted, and the integer part
      must match the strict grammar built by ``__get_pattern``.
    * ``normal`` -- uses ``NORMAL_CN_NUMBER`` instead, and additionally accepts
      digit-by-digit integers and a three character spoken short form.
    * ``smart`` -- like ``normal`` but Arabic digits, "." and "-" may appear
      in the input; digits are rewritten to Chinese numerals first.
    """

    def __init__(self) -> None:
        # Character classes (as plain strings) used when building regexes.
        self.all_num = "".join(NUMBER_CN2AN.keys())
        self.all_unit = "".join(UNIT_CN2AN.keys())
        self.strict_cn_number = STRICT_CN_NUMBER
        self.normal_cn_number = NORMAL_CN_NUMBER

        # Per-mode whitelist of characters an input may contain.
        normal_chars = "".join(self.normal_cn_number.values()) + "点负"
        self.check_key_dict = {
            "strict": "".join(self.strict_cn_number.values()) + "点负",
            "normal": normal_chars,
            "smart": normal_chars + "01234567890.-",
        }

        # Must run after strict_cn_number / normal_cn_number are assigned.
        self.pattern_dict = self.__get_pattern()
        self.ac = An2Cn()
        self.mode_list = ["strict", "normal", "smart"]

    def cn2an(self, inputs: Union[str, int, float] = None, mode: str = "strict") -> Union[float, int]:
        """Convert a Chinese numeral to an Arabic number.

        :param inputs: Chinese numerals, Arabic numerals, or a mix of both;
            non-string values are converted with ``str()`` first
        :param mode: one of ``strict``, ``normal`` or ``smart``
        :return: the numeric value (``int`` or ``float``)
        :raises ValueError: if ``inputs`` is ``None`` or empty, ``mode`` is
            unknown, or the data does not match the grammar of ``mode``
        """
        # Reject missing input up front. The previous condition
        # (`is not None or == ""`) let the empty string through, which then
        # failed later with an IndexError instead of this ValueError.
        if inputs is None or inputs == "":
            raise ValueError("输入数据为空!")

        if mode not in self.mode_list:
            raise ValueError(f"mode 仅支持 {str(self.mode_list)} !")

        # Numbers are handled through their string representation.
        if not isinstance(inputs, str):
            inputs = str(inputs)

        # Normalisation delegated to `proces.preprocess`; the pipeline names
        # indicate traditional -> simplified and full-width -> half-width.
        inputs = preprocess(inputs, pipelines=[
            "traditional_to_simplified",
            "full_angle_to_half_angle"
        ])

        # Special case: "廿" is shorthand for twenty.
        inputs = inputs.replace("廿", "二十")

        sign, integer_data, decimal_data, is_all_num = self.__check_input_data_is_valid(inputs, mode)

        # sign == 0 flags the smart-mode shortcut: integer_data already holds
        # the final numeric result.
        if sign == 0:
            return integer_data

        # Digit-by-digit strings are read positionally; everything else goes
        # through the unit-aware converter.
        if is_all_num:
            output = self.__direct_convert(integer_data)
        else:
            output = self.__integer_convert(integer_data)

        if decimal_data is not None:
            output = output + self.__decimal_convert(decimal_data)
            # Remove float noise such as 1 + 0.57 == 1.5699999999999998.
            output = round(output, len(decimal_data))

        return sign * output

    def __get_pattern(self) -> dict:
        """Build the validation regexes for the integer and decimal parts.

        The grammar is written with the canonical characters (一, 二, 十, ...)
        and every canonical character is then replaced by the full set of
        accepted variants taken from ``strict_cn_number`` / ``normal_cn_number``.

        :return: ``{"strict": {"int": ..., "dec": ...}, "normal": {...}}``
        """
        # Integer grammar, composed from small ranges up to large ones.
        _0 = "[零]"
        _1_9 = "[一二三四五六七八九]"
        _10_99 = f"{_1_9}?[十]{_1_9}?"
        _1_99 = f"({_10_99}|{_1_9})"
        _100_999 = f"({_1_9}[百]([零]{_1_9})?|{_1_9}[百]{_10_99})"
        _1_999 = f"({_100_999}|{_1_99})"
        _1000_9999 = f"({_1_9}[千]([零]{_1_99})?|{_1_9}[千]{_100_999})"
        _1_9999 = f"({_1000_9999}|{_1_999})"
        _10000_99999999 = f"({_1_9999}[万]([零]{_1_999})?|{_1_9999}[万]{_1000_9999})"
        _1_99999999 = f"({_10000_99999999}|{_1_9999})"
        _100000000_9999999999999999 = f"({_1_99999999}[亿]([零]{_1_99999999})?|{_1_99999999}[亿]{_10000_99999999})"
        _1_9999999999999999 = f"({_100000000_9999999999999999}|{_1_99999999})"

        # Both modes start from the same template; they only differ in the
        # character variants substituted below.
        str_int_pattern = f"^({_0}|{_1_9999999999999999})$"
        nor_int_pattern = f"^({_0}|{_1_9999999999999999})$"

        # Strict decimals: 1-16 digits, last one non-zero.
        str_dec_pattern = "^[零一二三四五六七八九]{0,15}[一二三四五六七八九]$"
        # Normal decimals: 0-16 digits, trailing zeros allowed.
        nor_dec_pattern = "^[零一二三四五六七八九]{0,16}$"

        for canonical, variants in self.strict_cn_number.items():
            str_int_pattern = str_int_pattern.replace(canonical, variants)
            str_dec_pattern = str_dec_pattern.replace(canonical, variants)

        for canonical, variants in self.normal_cn_number.items():
            nor_int_pattern = nor_int_pattern.replace(canonical, variants)
            nor_dec_pattern = nor_dec_pattern.replace(canonical, variants)

        return {
            "strict": {
                "int": str_int_pattern,
                "dec": str_dec_pattern
            },
            "normal": {
                "int": nor_int_pattern,
                "dec": nor_dec_pattern
            }
        }

    def __copy_num(self, num):
        """Map each Arabic digit of ``num`` to its entry in ``NUMBER_LOW_AN2CN``."""
        return "".join(NUMBER_LOW_AN2CN[int(digit)] for digit in num)

    @staticmethod
    def __is_exact_match(pattern: str, text: str) -> bool:
        """Return True when ``pattern`` matches and the match covers all of ``text``."""
        found = re.search(pattern, text)
        return found is not None and found.group() == text

    def __decimal_is_valid(self, decimal_data, mode: str) -> bool:
        """Return True when there is no decimal part or it matches the ``mode`` grammar."""
        if decimal_data is None:
            return True
        return self.__is_exact_match(self.pattern_dict[mode]["dec"], decimal_data)

    def __check_input_data_is_valid(self, check_data: str, mode: str) -> (int, str, str, bool):
        """Validate ``check_data`` and split it into sign, integer and decimal parts.

        :param check_data: pre-processed input string
        :param mode: ``strict``, ``normal`` or ``smart``
        :return: ``(sign, integer_data, decimal_data, is_all_num)`` where
            ``sign`` is ``1`` or ``-1``, ``decimal_data`` is ``None`` when there
            is no decimal part and ``is_all_num`` marks digit-by-digit integers.
            In the smart-mode shortcut the tuple is ``(0, value, None, None)``
            and ``value`` is already the final signed number.
        :raises ValueError: if the data contains characters outside the
            whitelist of ``mode`` or does not match its grammar
        """
        # Drop currency suffixes. `endswith` is safe on short/empty strings.
        for word in ("元整", "圆整", "元正", "圆正"):
            if check_data.endswith(word):
                check_data = check_data[:-2]

        # Outside strict mode a bare trailing currency character is dropped too.
        if mode != "strict":
            for word in ("圆", "元"):
                if check_data.endswith(word):
                    check_data = check_data[:-1]

        # Yuan / jiao / fen amounts: turn the currency mark into the decimal
        # mark and drop the jiao / fen markers.
        yjf_pattern = re.compile(fr"^.*?[元圆][{self.all_num}]角([{self.all_num}]分)?$")
        if yjf_pattern.search(check_data):
            # The pattern accepts both 元 and 圆, so both must become the
            # decimal mark (previously 圆 was left in place and rejected below).
            check_data = (
                check_data.replace("元", "点").replace("圆", "点")
                .replace("角", "").replace("分", "")
            )

        # Make the implicit "one" explicit after a zero.
        if "零十" in check_data:
            check_data = check_data.replace("零十", "零一十")
        if "零百" in check_data:
            check_data = check_data.replace("零百", "零一百")

        # Every character must be in the whitelist of the requested mode.
        allowed_chars = self.check_key_dict[mode]
        for data in check_data:
            if data not in allowed_chars:
                raise ValueError(f"当前为{mode}模式,输入的数据不在转化范围内:{data}!")

        # Sign handling. `startswith` keeps an empty string from raising
        # IndexError; it is rejected by the grammar checks below instead.
        if check_data.startswith("负"):
            check_data = check_data[1:]
            sign = -1
        else:
            sign = 1

        if "点" in check_data:
            split_data = check_data.split("点")
            if len(split_data) != 2:
                raise ValueError("数据中包含不止一个点!")
            integer_data, decimal_data = split_data

            if mode == "smart":
                # Rewrite Arabic digits so the normal-mode grammar applies:
                # the integer part via An2Cn, the decimal part digit by digit.
                integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
                decimal_data = re.sub(r"\d+", lambda x: self.__copy_num(x.group()), decimal_data)
                mode = "normal"
        else:
            integer_data = check_data
            decimal_data = None

            if mode == "smart":
                # Shortcut for plain Arabic numbers with an optional unit,
                # e.g. "10.1万" or "10.1".
                pattern1 = fr"^-?\d+(\.\d+)?[{self.all_unit}]?$"
                if self.__is_exact_match(pattern1, integer_data):
                    if integer_data[-1] in UNIT_CN2AN:
                        output = int(float(integer_data[:-1]) * UNIT_CN2AN[integer_data[-1]])
                    else:
                        output = float(integer_data)
                    # sign == 0 tells the caller `output` is final, so a
                    # leading "负" stripped above has to be applied here.
                    return 0, sign * output, None, None

                integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
                mode = "normal"

        # From here on `mode` is either "strict" or "normal".
        if self.__is_exact_match(self.pattern_dict[mode]["int"], integer_data):
            if self.__decimal_is_valid(decimal_data, mode):
                return sign, integer_data, decimal_data, False
        elif mode == "strict":
            raise ValueError(f"不符合格式的数据:{integer_data}")
        elif mode == "normal":
            # Digit-by-digit integers (number characters only, no units).
            if self.__is_exact_match(f"^[{self.all_num}]+$", integer_data):
                if self.__decimal_is_valid(decimal_data, mode):
                    return sign, integer_data, decimal_data, True

            # Spoken short form: digit, unit, digit. The unit one step below
            # the stated one is appended so the regular converter can read it.
            ptn_speaking_mode = f"^[{self.all_num}][{self.all_unit}][{self.all_num}]$"
            if self.__is_exact_match(ptn_speaking_mode, integer_data):
                try:
                    _unit = UNIT_LOW_AN2CN[UNIT_CN2AN[integer_data[1]] // 10]
                except (KeyError, IndexError):
                    # NOTE(review): presumably UNIT_LOW_AN2CN has no entry for
                    # every `unit // 10` (e.g. below 十) -- confirm against
                    # conf. Such input is reported as a format error below
                    # instead of leaking a lookup error.
                    _unit = None
                if _unit is not None:
                    integer_data = integer_data + _unit
                    if self.__decimal_is_valid(decimal_data, mode):
                        return sign, integer_data, decimal_data, False

        raise ValueError(f"不符合格式的数据:{check_data}")

    def __integer_convert(self, integer_data: str) -> int:
        """Convert a unit-based Chinese integer string to ``int``.

        The string is scanned from its last character to its first while the
        multiplier of the current position is tracked.

        :raises ValueError: if a character is neither a number nor a unit
        """
        output_integer = 0
        # Multiplier applied to the next number character.
        unit = 1
        # Largest "big" unit (multiple of 10000) seen so far.
        ten_thousand_unit = 1
        last_index = len(integer_data) - 1

        for index, cn_num in enumerate(reversed(integer_data)):
            if cn_num in NUMBER_CN2AN:
                # Number character: weight it by the current multiplier.
                output_integer += NUMBER_CN2AN[cn_num] * unit
            elif cn_num in UNIT_CN2AN:
                unit = UNIT_CN2AN[cn_num]

                # Big units: multiples of 10000.
                if unit % 10000 == 0:
                    if unit > ten_thousand_unit:
                        # A larger big unit replaces the current one.
                        ten_thousand_unit = unit
                    else:
                        # A big unit not larger than the current one compounds
                        # with it (the two multiply together).
                        ten_thousand_unit = unit * ten_thousand_unit
                        unit = ten_thousand_unit

                # Small units are scaled by the big unit they sit under.
                if unit < ten_thousand_unit:
                    unit = unit * ten_thousand_unit

                # A unit as the very first character has an implicit "one"
                # in front of it.
                if index == last_index:
                    output_integer += unit
            else:
                raise ValueError(f"{cn_num} 不在转化范围内")

        return int(output_integer)

    def __decimal_convert(self, decimal_data: str) -> float:
        """Convert the digits after the decimal mark to their fractional value.

        Only the first 16 digits are used; a notice is printed when the input
        is longer.
        """
        len_decimal_data = len(decimal_data)

        if len_decimal_data > 16:
            print(f"注意:小数部分长度为 {len_decimal_data} ,将自动截取前 16 位有效精度!")
            decimal_data = decimal_data[:16]
            len_decimal_data = 16

        output_decimal = 0
        # Accumulate from the least significant digit upwards; the order is
        # kept on purpose because float addition is order-sensitive.
        for position in range(len(decimal_data), 0, -1):
            digit = NUMBER_CN2AN[decimal_data[position - 1]]
            output_decimal += digit * 10 ** -position

        # Trim float noise, e.g. 0.1 + 0.02 == 0.12000000000000001.
        output_decimal = round(output_decimal, len_decimal_data)

        return output_decimal

    def __direct_convert(self, data: str) -> int:
        """Read ``data`` as a positional, digit-by-digit number."""
        output_data = 0
        for cn_digit in data:
            # Shift what has been read so far one decimal place to the left.
            output_data = output_data * 10 + NUMBER_CN2AN[cn_digit]

        return output_data
|
|