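# Residue of the web file viewer this module was copied from (not part of
# the code): uploader CaoHaiNam, commit 3ca6892 ("update code"), raw /
# history / blame / contribute / delete links, "No virus", 3.49 kB.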
# import numpy as np
import re
import string
import json
from datetime import datetime
from typing import Text, Dict
# Lookup tables for stripping Vietnamese tone marks (lowercase letters only)
# Lowercase letters in Vietnamese alphabet order, flattened from rows of six.
# NOTE(review): 't' appears twice (last item of the fourth row and first item
# of the fifth), so the list has 30 entries. Kept as-is because element
# positions may matter to callers -- confirm whether the duplicate is intended.
anphabet = [
    letter
    for row in (
        ('a', 'ă', 'â', 'b', 'c', 'd'),
        ('đ', 'e', 'ê', 'g', 'h', 'i'),
        ('k', 'l', 'm', 'n', 'o', 'ô'),
        ('ơ', 'p', 'q', 'r', 's', 't'),
        ('t', 'u', 'ư', 'v', 'x', 'y'),
    )
    for letter in row
]
# Each key lists, separated by ", ", the accented lowercase characters that
# collapse to the plain letter stored as the value.
tone = {
    'á, à, ã, ạ, ả, ấ, ầ, ẫ, ậ, ẩ, ắ, ằ, ẵ, ặ, ẳ, â, ă': 'a',
    'ó, ò, õ, ọ, ỏ, ố, ồ, ỗ, ộ, ổ, ớ, ờ, ỡ, ợ, ở, ơ, ô': 'o',
    'é, è, ẽ, ẹ, ẻ, ế, ề, ễ, ệ, ể, ê': 'e',
    'í, ì, ĩ, ị, ỉ': 'i',
    'ú, ù, ũ, ụ, ủ, ứ, ừ, ự, ử, ữ, ư': 'u',
    'đ': 'd',
    'ý, ỳ, ỹ, ỵ, ỷ': 'y'
}

# Per-character lookup derived from ``tone``: accented character -> base
# letter. The ',' and ' ' separators inside the keys are skipped.
RT = {
    char: base
    for group, base in tone.items()
    for char in group
    if char not in (',', ' ')
}
def remove_accent(text):
    """Return *text* with each character listed in ``RT`` swapped for its base letter.

    Characters that have no entry in ``RT`` are copied through unchanged.
    """
    return ''.join(RT.get(symbol, symbol) for symbol in text)
# remove punctuation
def remove_punctuation(text):
    """Replace ASCII punctuation in *text* with spaces and normalise whitespace.

    Every character found in ``string.punctuation`` becomes a single space;
    the result is then split on whitespace and re-joined with one space, so
    leading/trailing whitespace disappears and inner runs collapse.
    """
    spaced = ''.join(
        ' ' if symbol in string.punctuation else symbol for symbol in text
    )
    return ' '.join(spaced.split())
def clean_text(text):
    """Reduce *text* to ASCII, drop URLs and normalise whitespace.

    Steps, in order: discard every non-ASCII character, delete anything
    matching ``http\\S+``, turn newlines and tabs into spaces, collapse runs
    of spaces into one and strip the ends.
    """
    # Round-tripping through ASCII with errors="ignore" silently drops every
    # non-ASCII character (accented letters included, not only CJK).
    cleaned = text.encode("ascii", errors="ignore").decode("ascii")
    cleaned = re.sub(r"http\S+", "", cleaned)
    # Same substitutions, same order as before. By the time r"\n\n" runs no
    # newline is left, so that pass never matches.
    for pattern in (r"\n", r"\n\n", r"\t"):
        cleaned = re.sub(pattern, " ", cleaned)
    cleaned = cleaned.strip(" ")
    collapsed = re.sub(" +", " ", cleaned)
    return collapsed.strip()
def remove_prefix(address):
    """Delete administrative-unit words from *address* and strip the result.

    The accented word list (``tỉnh``, ``thành phố``, ``huyện``, ...) is used
    when *address* contains accented characters, the accent-free list
    (``tinh``, ``thanh pho``, ``huyen``, ...) otherwise. Matching is
    case-insensitive, needs the trailing space that follows each word, and
    removes every occurrence in the string, not just a leading one.
    """
    # Detect accents on a lowercased copy: ``remove_accent`` only maps
    # lowercase characters, so an all-uppercase accented address used to be
    # mistaken for an accent-free one and kept its prefixes.
    lowered = address.lower()
    if lowered != remove_accent(lowered):
        pattern = '(tỉnh |thành phố |huyện |thị trấn |thị xã |phường |xã |quận |đường |phố |tp )'
    else:
        pattern = '(tinh |thanh pho |huyen |thi tran |thi xa |phuong |xa |quan |duong |pho |tp )'
    return re.sub(pattern, '', address, flags=re.IGNORECASE).strip()
def clean_detail_address(detail_address):
    """Strip administrative prefixes and one trailing punctuation mark.

    *detail_address* is first passed through ``remove_prefix``; if the
    result ends with a character from ``string.punctuation``, that single
    character is removed. An empty result is returned unchanged.
    """
    detail_address = remove_prefix(detail_address)
    # An empty string has no last character. Checking for it explicitly
    # replaces the former bare ``except: pass``, which hid every error and
    # not only the IndexError it was meant for.
    if detail_address and detail_address[-1] in string.punctuation:
        detail_address = detail_address[:-1]
    return detail_address
def get_detail_address(address, std_address):
    """Return the part of *address* that precedes the standardised address.

    The split point is the first word of the first value in *std_address*.
    Everything before its first occurrence in the lowercased *address* is
    cleaned with ``clean_detail_address`` and returned; ``''`` is returned
    when that word does not occur in *address*.

    Raises:
        IndexError: if *std_address* has no values or its first value
            contains no words.
    """
    address = address.lower()
    # Lowercase the token too: the address was just lowercased, so a
    # capitalised token could never be found in it.
    split_token = list(std_address.values())[0].split()[0].lower()
    if address == remove_accent(address):
        # Accent-free input: compare against an accent-free token.
        split_token = remove_accent(split_token)
    detail_address, found, _ = address.partition(split_token)
    if not found:
        return ''
    return clean_detail_address(detail_address)
def get_full_result(raw_address, std_address, score):
    """Bundle the matching outcome for one raw address into a dict.

    Keys: ``detail_address`` (computed by ``get_detail_address`` from
    *raw_address* and *std_address*), ``main_address`` (*std_address* as
    given) and ``similarity_score`` (*score* as given).
    """
    return {
        'detail_address': get_detail_address(raw_address, std_address),
        'main_address': std_address,
        'similarity_score': score,
    }
def save_result(file_path: Text, result: Dict) -> None:
    """Append *result*, stamped with the current local time, to a JSON log.

    The file at *file_path* must already exist and hold a JSON array. The
    array is loaded, extended with ``{'result': ..., 'created_at': ...}``
    and written back as UTF-8 with non-ASCII characters left unescaped and
    a four-space indent.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: if the file does not contain valid JSON.
        TypeError: if *result* is not JSON-serialisable; the existing log
            is left untouched in that case.
    """
    log_sample = {
        'result': result,
        'created_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    with open(file_path, "r", encoding="utf8") as log_file:
        logs = json.load(log_file)
    logs.append(log_sample)
    # Serialise before reopening for writing: mode "w" truncates the file,
    # so an encoding failure must happen while the old log is still intact.
    payload = json.dumps(logs, ensure_ascii=False, indent=4)
    with open(file_path, "w", encoding="utf8") as log_file:
        log_file.write(payload)