# import numpy as np import re import string import json from datetime import datetime from typing import Text, Dict # delete tone and lower anphabet = ['a', 'ă', 'â', 'b', 'c', 'd', 'đ', 'e', 'ê', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'ô', 'ơ', 'p', 'q', 'r', 's', 't', 't', 'u', 'ư', 'v', 'x', 'y', ] tone = { 'á, à, ã, ạ, ả, ấ, ầ, ẫ, ậ, ẩ, ắ, ằ, ẵ, ặ, ẳ, â, ă': 'a', 'ó, ò, õ, ọ, ỏ, ố, ồ, ỗ, ộ, ổ, ớ, ờ, ỡ, ợ, ở, ơ, ô': 'o', 'é, è, ẽ, ẹ, ẻ, ế, ề, ễ, ệ, ể, ê': 'e', 'í, ì, ĩ, ị, ỉ': 'i', 'ú, ù, ũ, ụ, ủ, ứ, ừ, ự, ử, ữ, ư': 'u', 'đ': 'd', 'ý, ỳ, ỹ, ỵ, ỷ': 'y' } RT = {} for i in tone.items(): for j in i[0]: if j == ',' or j == ' ': continue RT[j] = i[1] def remove_accent(text): res = '' for char in text: res += RT[char] if char in RT else char return res # remove functuation def remove_punctuation(text): whitespace = ' ' for i in text: if i in string.punctuation: text = text.replace(i, whitespace) return ' '.join(text.split()) def clean_text(text): text = text.encode("ascii", errors="ignore").decode( "ascii" ) # remove non-ascii, Chinese characters text = re.sub(r"http\S+", "", text) text = re.sub(r"\n", " ", text) text = re.sub(r"\n\n", " ", text) text = re.sub(r"\t", " ", text) text = text.strip(" ") text = re.sub( " +", " ", text ).strip() # get rid of multiple spaces and replace with a single return text def remove_prefix(address): if address != remove_accent(address): return re.sub('(tỉnh |thành phố |huyện |thị trấn |thị xã |phường |xã |quận |đường |phố |tp )', '', address, flags=re.IGNORECASE).strip() return re.sub('(tinh |thanh pho |huyen |thi tran |thi xa |phuong |xa |quan |duong |pho |tp )', '', address, flags=re.IGNORECASE).strip() def clean_detail_address(detail_address): detail_address = remove_prefix(detail_address) try: if detail_address[-1] in string.punctuation: detail_address = detail_address[:-1] except: pass return detail_address def get_detail_address(address, std_address): address = address.lower() split_token = list(std_address.values())[0].split()[0] if address == remove_accent(address): split_token = remove_accent(split_token) detail_address = address.split(split_token)[0] if detail_address == address: return '' detail_address = clean_detail_address(detail_address) return detail_address def get_full_result(raw_address, std_address, score): full_result = dict() full_result['detail_address'] = get_detail_address(raw_address, std_address) full_result['main_address'] = std_address full_result['similarity_score'] = score return full_result def save_result(file_path: Text, result: Dict) -> None: log_sample = dict() log_sample['result'] = result log_sample['created_at'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") logs = json.load(open(file_path, "r", encoding="utf8")) logs.append(log_sample) json.dump( logs, open(file_path, "w", encoding="utf8"), ensure_ascii=False, indent=4 )