Spaces:

CaoHaiNam
/

address-standardization

Running

App Files Files Community

address-standardization / utils.py

CaoHaiNam

update code

3ca6892 about 2 months ago

raw

history blame contribute delete

No virus

3.49 kB

	# import numpy as np
	import re
	import string
	import json
	from datetime import datetime
	from typing import Text, Dict

	# Vietnamese alphabet and accent-folding tables (tone-mark removal; the tables only cover lowercase letters)
	# Lowercase letters of the Vietnamese alphabet, one list element per letter.
	# NOTE(review): 't' appears twice in a row (30 entries in total). It is kept
	# as-is so the list length and index positions do not change for any code
	# that depends on them -- confirm whether the duplicate is intentional.
	anphabet = (
	    'a ă â b c d đ e ê g '
	    'h i k l m n o ô ơ p '
	    'q r s t t u ư v x y'
	).split()

	# Each key is a comma-and-space separated list of accented lowercase
	# variants of one letter; the value is the plain letter they fold to.
	tone = {
	    'á, à, ã, ạ, ả, ấ, ầ, ẫ, ậ, ẩ, ắ, ằ, ẵ, ặ, ẳ, â, ă': 'a',
	    'ó, ò, õ, ọ, ỏ, ố, ồ, ỗ, ộ, ổ, ớ, ờ, ỡ, ợ, ở, ơ, ô': 'o',
	    'é, è, ẽ, ẹ, ẻ, ế, ề, ễ, ệ, ể, ê': 'e',
	    'í, ì, ĩ, ị, ỉ': 'i',
	    'ú, ù, ũ, ụ, ủ, ứ, ừ, ự, ử, ữ, ư': 'u',
	    'đ': 'd',
	    'ý, ỳ, ỹ, ỵ, ỷ': 'y',
	}

	# Per-character lookup derived from `tone`: accented character -> base letter.
	# The ',' and ' ' separators inside each key are not letters, so they are
	# filtered out instead of being mapped.
	RT = {
	    accented: base
	    for variants, base in tone.items()
	    for accented in variants
	    if accented not in (',', ' ')
	}


	def remove_accent(text):
	    """Return ``text`` with accented characters folded to their base letters.

	    Every character that has an entry in the module-level ``RT`` table is
	    replaced by the mapped letter; any other character is copied through
	    unchanged.
	    """
	    return ''.join(RT.get(symbol, symbol) for symbol in text)


	# remove punctuation
	def remove_punctuation(text):
	    """Replace ASCII punctuation with spaces and normalise whitespace.

	    Every character listed in ``string.punctuation`` becomes a single space;
	    the result is then split on whitespace and re-joined with single spaces,
	    which also trims the ends.
	    """
	    punctuation_to_space = str.maketrans(
	        {mark: ' ' for mark in string.punctuation}
	    )
	    words = text.translate(punctuation_to_space).split()
	    return ' '.join(words)


	def clean_text(text):
	    """Reduce ``text`` to a single line of ASCII with normalised spacing.

	    Steps, in order:
	      1. Drop every non-ASCII character. This removes accented letters
	         entirely rather than folding them to a base letter.
	      2. Delete anything that starts with ``http`` up to the next whitespace.
	      3. Turn newlines and tabs into spaces.
	      4. Trim, then collapse runs of spaces into one.
	    """
	    ascii_only = text.encode("ascii", errors="ignore").decode("ascii")
	    without_urls = re.sub(r"http\S+", "", ascii_only)
	    single_line = re.sub(r"[\n\t]", " ", without_urls)
	    trimmed = single_line.strip(" ")
	    return re.sub(" +", " ", trimmed).strip()


	def remove_prefix(address):
	    """Remove Vietnamese administrative-unit words from ``address``.

	    If ``address`` contains accented characters (it differs from its
	    ``remove_accent`` form) the accented spellings are removed; otherwise the
	    unaccented spellings are. Matching is case-insensitive, every occurrence
	    is removed together with the single space that follows it, and the result
	    is stripped of surrounding whitespace.
	    """
	    # Alternation must be a bare "|": an escaped "\|" matches a literal pipe
	    # character, which made the whole pattern match nothing in real input.
	    # The leading \b keeps a unit name from matching the tail of a longer
	    # word (e.g. "huyen " inside "khuyen ").
	    if address != remove_accent(address):
	        pattern = r'\b(?:tỉnh|thành phố|huyện|thị trấn|thị xã|phường|xã|quận|đường|phố|tp) '
	    else:
	        pattern = r'\b(?:tinh|thanh pho|huyen|thi tran|thi xa|phuong|xa|quan|duong|pho|tp) '
	    return re.sub(pattern, '', address, flags=re.IGNORECASE).strip()


	def clean_detail_address(detail_address):
	    """Tidy a detail-address fragment.

	    Passes ``detail_address`` through ``remove_prefix`` and then drops one
	    trailing character if it is ASCII punctuation (``string.punctuation``).
	    An empty result is returned unchanged.
	    """
	    detail_address = remove_prefix(detail_address)
	    # Guard the empty string explicitly instead of hiding the IndexError
	    # behind a bare ``except`` (which would also swallow unrelated errors).
	    if detail_address and detail_address[-1] in string.punctuation:
	        detail_address = detail_address[:-1]
	    return detail_address


	def get_detail_address(address, std_address):
	    """Return the part of ``address`` that precedes the standardized address.

	    The first whitespace-separated word of the first value in ``std_address``
	    is used as a marker. ``address`` is lowercased and everything before the
	    first occurrence of the marker is passed to ``clean_detail_address``.

	    Returns ``''`` when ``std_address`` has no values, when its first value
	    contains no word, or when the marker does not occur in ``address``.
	    """
	    std_values = list(std_address.values())
	    marker_words = std_values[0].split() if std_values else []
	    if not marker_words:
	        # Nothing to split on; previously this raised IndexError.
	        return ''

	    address = address.lower()
	    # The address has just been lowercased, so the marker has to be
	    # lowercased too or a capitalised standard value could never match.
	    split_token = marker_words[0].lower()
	    if address == remove_accent(address):
	        # Unaccented input: compare against the unaccented marker.
	        split_token = remove_accent(split_token)

	    detail_address, found, _ = address.partition(split_token)
	    if not found:
	        return ''
	    return clean_detail_address(detail_address)


	def get_full_result(raw_address, std_address, score):
	    """Bundle a standardization outcome into one dictionary.

	    Keys of the returned dict:
	      * ``detail_address``   -- ``get_detail_address(raw_address, std_address)``
	      * ``main_address``     -- ``std_address`` as given
	      * ``similarity_score`` -- ``score`` as given
	    """
	    return {
	        'detail_address': get_detail_address(raw_address, std_address),
	        'main_address': std_address,
	        'similarity_score': score,
	    }


	def save_result(file_path: Text, result: Dict) -> None:
	    """Append ``result`` and a timestamp to the JSON log stored at ``file_path``.

	    The file is read as UTF-8 JSON, a new entry
	    ``{'result': result, 'created_at': 'YYYY-mm-dd HH:MM:SS'}`` (local time)
	    is appended to the loaded value, and the whole log is written back with
	    ``ensure_ascii=False`` and a 4-space indent.

	    The file must already exist and hold a JSON array: errors from ``open``
	    and ``json.load`` (missing file, invalid JSON) are not caught here, and
	    a ``result`` that is not JSON-serializable raises ``TypeError``.
	    """
	    log_sample = {
	        'result': result,
	        'created_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	    }

	    # ``with`` closes the handle deterministically; it was previously leaked.
	    with open(file_path, "r", encoding="utf8") as log_file:
	        logs = json.load(log_file)
	    logs.append(log_sample)

	    # Serialize before reopening: mode "w" truncates the file immediately, so
	    # a serialization error after that point would destroy the existing log.
	    serialized = json.dumps(logs, ensure_ascii=False, indent=4)
	    with open(file_path, "w", encoding="utf8") as log_file:
	        log_file.write(serialized)