# Spaces: Sleeping  (status text from the hosting page captured with this file; not part of the module)
# import numpy as np | |
import re | |
import string | |
import json | |
from datetime import datetime | |
from typing import Text, Dict | |
# Remove tone marks (diacritics); the tables below are written in lowercase
# The 29 lowercase letters of the Vietnamese alphabet, in alphabetical order.
# NOTE: the identifier is a misspelling of "alphabet"; it is kept as-is because
# other modules may import it under this name.
anphabet = [
    'a', 'ă', 'â', 'b', 'c', 'd',
    'đ', 'e', 'ê', 'g', 'h', 'i',
    'k', 'l', 'm', 'n', 'o', 'ô',
    'ơ', 'p', 'q', 'r', 's', 't',
    'u', 'ư', 'v', 'x', 'y',
]
# Accented Vietnamese letters grouped by the plain ASCII letter that replaces
# them. Each key is one group, written as a ", "-separated string of lowercase
# letters; the value is the base letter for every member of the group.
tone = {
    'á, à, ã, ạ, ả, ấ, ầ, ẫ, ậ, ẩ, ắ, ằ, ẵ, ặ, ẳ, â, ă': 'a',
    'ó, ò, õ, ọ, ỏ, ố, ồ, ỗ, ộ, ổ, ớ, ờ, ỡ, ợ, ở, ơ, ô': 'o',
    'é, è, ẽ, ẹ, ẻ, ế, ề, ễ, ệ, ể, ê': 'e',
    'í, ì, ĩ, ị, ỉ': 'i',
    'ú, ù, ũ, ụ, ủ, ứ, ừ, ự, ử, ữ, ư': 'u',
    'đ': 'd',
    'ý, ỳ, ỹ, ỵ, ỷ': 'y'
}

# Flat per-character lookup derived from `tone`: accented letter -> base letter.
# Uppercase variants are added as well so that text such as "HÀ NỘI" is
# handled the same way as "hà nội".
RT = {}
for accented_group, base_letter in tone.items():
    # Drop the ", " separators so only the accented letters themselves remain.
    for accented in accented_group.replace(',', '').replace(' ', ''):
        RT[accented] = base_letter
        RT[accented.upper()] = base_letter.upper()
def remove_accent(text):
    """Return ``text`` with accented letters replaced by their base letters.

    Every character that is a key of the module-level ``RT`` table is replaced
    by the value stored for it; all other characters are copied unchanged.

    Args:
        text: The string to convert.

    Returns:
        The converted string.
    """
    # A single join avoids rebuilding the result string once per character.
    return ''.join(RT.get(char, char) for char in text)
# remove punctuation
# Translation table mapping every ASCII punctuation character to a space.
# Built once at import time so each call is a single pass over the text.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation)
)


def remove_punctuation(text):
    """Replace ASCII punctuation with spaces and normalise whitespace.

    Every character listed in ``string.punctuation`` is turned into a space.
    Runs of whitespace are then collapsed to a single space and leading and
    trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; ``''`` when nothing but punctuation/whitespace
        was passed in.
    """
    without_punctuation = text.translate(_PUNCTUATION_TO_SPACE)
    # split() with no argument splits on any whitespace run and drops empties.
    return ' '.join(without_punctuation.split())
def clean_text(text):
    """Strip non-ASCII characters, URLs and redundant whitespace from ``text``.

    Steps, in order:
      1. Drop every character that cannot be encoded as ASCII.
      2. Remove anything that starts with ``http`` up to the next whitespace.
      3. Turn newlines and tabs into spaces.
      4. Collapse runs of spaces and trim surrounding whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # NOTE: this removes *all* non-ASCII characters, so accented Vietnamese
    # letters are dropped as well, not only e.g. Chinese characters.
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = re.sub(r"http\S+", "", text)
    # One pass for both characters; a separate "\n\n" substitution could never
    # match once single newlines have been replaced.
    text = re.sub(r"[\n\t]", " ", text)
    return re.sub(r" +", " ", text).strip()
def remove_prefix(address):
    """Remove Vietnamese administrative-unit words from ``address``.

    Words such as "tỉnh", "thành phố", "huyện", "phường", "đường" or "tp"
    (each followed by a space) are deleted wherever they occur in the string,
    not only at its start. Matching is case-insensitive.

    If ``remove_accent`` changes the address, it is treated as accented and
    only the accented spellings are removed; otherwise only the unaccented
    spellings are removed.

    Args:
        address: The address string to process.

    Returns:
        The address without the unit words, stripped of surrounding
        whitespace.
    """
    # The leading \b makes a unit word match only at the start of a word.
    # Without it, "huyen " also matched inside "khuyen ", turning
    # "nguyen khuyen ha noi" into "nguyen kha noi".
    if address != remove_accent(address):
        pattern = r'\b(?:tỉnh |thành phố |huyện |thị trấn |thị xã |phường |xã |quận |đường |phố |tp )'
    else:
        pattern = r'\b(?:tinh |thanh pho |huyen |thi tran |thi xa |phuong |xa |quan |duong |pho |tp )'
    return re.sub(pattern, '', address, flags=re.IGNORECASE).strip()
def clean_detail_address(detail_address):
    """Clean a detail-address fragment.

    The fragment is first passed through ``remove_prefix``. If the result ends
    with an ASCII punctuation character, that single character is removed.

    Args:
        detail_address: The address fragment to clean.

    Returns:
        The cleaned fragment; may be an empty string.
    """
    detail_address = remove_prefix(detail_address)
    # endswith() is simply False for an empty string, so no exception guard
    # (previously a bare `except: pass`) is needed.
    if detail_address.endswith(tuple(string.punctuation)):
        detail_address = detail_address[:-1]
    return detail_address
def get_detail_address(address, std_address):
    """Return the part of ``address`` that precedes the standardized address.

    The first word of the first value in ``std_address`` is used as a marker.
    ``address`` is lowercased, and everything before the first occurrence of
    the marker is cleaned with ``clean_detail_address`` and returned. When the
    lowercased address contains no accented letters, the marker is compared
    without accents as well.

    Args:
        address: The raw address string.
        std_address: Mapping whose values are strings.
            NOTE(review): presumably the matched administrative units
            (ward, district, ...) in order; verify against the caller.

    Returns:
        The cleaned leading fragment, or ``''`` when ``std_address`` is empty,
        its first value has no words, or the marker does not occur in the
        address.
    """
    address = address.lower()

    # Only the first value is needed; avoid building a list of all values.
    first_value = next(iter(std_address.values()), '')
    words = first_value.split()
    if not words:
        return ''

    # The address was lowercased above, so the marker must be lowercased too
    # or a capitalised standard name would never be found.
    split_token = words[0].lower()
    if address == remove_accent(address):
        split_token = remove_accent(split_token)

    detail_address, found, _ = address.partition(split_token)
    if not found:
        return ''
    return clean_detail_address(detail_address)
def get_full_result(raw_address, std_address, score):
    """Bundle the matching outcome for one address into a dictionary.

    Args:
        raw_address: The address as originally supplied.
        std_address: The standardized address it was matched to.
        score: The similarity score of the match.

    Returns:
        A dict with the keys ``detail_address`` (computed by
        ``get_detail_address``), ``main_address`` (``std_address`` as given)
        and ``similarity_score`` (``score`` as given).
    """
    return {
        'detail_address': get_detail_address(raw_address, std_address),
        'main_address': std_address,
        'similarity_score': score,
    }
def save_result(file_path: Text, result: Dict) -> None:
    """Append ``result`` with a timestamp to the JSON log at ``file_path``.

    The file is expected to hold a JSON list. A new entry of the form
    ``{"result": result, "created_at": "YYYY-MM-DD HH:MM:SS"}`` (local time)
    is appended and the whole list is written back as UTF-8 with a
    4-space indent and non-ASCII characters left unescaped.

    Args:
        file_path: Path of the JSON log file. If it does not exist yet, a new
            log containing only this entry is created.
        result: JSON-serializable data to record.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
        TypeError: If ``result`` cannot be serialized to JSON. The existing
            file is left untouched in that case.
    """
    log_sample = {
        'result': result,
        'created_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    try:
        with open(file_path, "r", encoding="utf8") as log_file:
            logs = json.load(log_file)
    except FileNotFoundError:
        # First write: start a fresh log instead of failing.
        logs = []
    logs.append(log_sample)

    # Serialize before opening for writing: mode "w" truncates the file
    # immediately, so a serialization error after that point would wipe
    # all previously logged entries.
    payload = json.dumps(logs, ensure_ascii=False, indent=4)
    with open(file_path, "w", encoding="utf8") as log_file:
        log_file.write(payload)