File size: 3,493 Bytes
3a379e2
 
 
3ca6892
 
 
3a379e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ca6892
3a379e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ca6892
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# import numpy as np
import re
import string
import json
from datetime import datetime
from typing import Text, Dict

# Tables used to strip Vietnamese tone marks/diacritics (lowercase letters only)
# Lowercase letters, presumably the Vietnamese alphabet in dictionary order.
# NOTE(review): 't' occurs twice (30 entries in total); the duplicate is kept
# on purpose here because other modules may rely on the current contents.
anphabet = list(
    'aăâbcd'
    'đeêghi'
    'klmnoô'
    'ơpqrst'
    'tuưvxy'
)

# Maps a comma-separated group of accented lowercase letters to the plain
# ASCII letter they should be reduced to.
tone = {
    'á, à, ã, ạ, ả, ấ, ầ, ẫ, ậ, ẩ, ắ, ằ, ẵ, ặ, ẳ, â, ă': 'a',
    'ó, ò, õ, ọ, ỏ, ố, ồ, ỗ, ộ, ổ, ớ, ờ, ỡ, ợ, ở, ơ, ô': 'o',
    'é, è, ẽ, ẹ, ẻ, ế, ề, ễ, ệ, ể, ê': 'e',
    'í, ì, ĩ, ị, ỉ': 'i',
    'ú, ù, ũ, ụ, ủ, ứ, ừ, ự, ử, ữ, ư': 'u',
    'đ': 'd',
    'ý, ỳ, ỹ, ỵ, ỷ': 'y',
}

# Flattened lookup table: one entry per accented character -> base letter.
# The ',' and ' ' separators inside each group key are skipped.  Built with a
# comprehension so no loop variables leak into the module namespace.
RT = {
    accented: base
    for group, base in tone.items()
    for accented in group
    if accented not in (',', ' ')
}


def remove_accent(text):
    """Return ``text`` with each character listed in ``RT`` replaced by its base letter.

    Characters that have no entry in ``RT`` are copied through unchanged.
    """
    return ''.join(RT.get(symbol, symbol) for symbol in text)


# remove punctuation
# Translation table turning every character of ``string.punctuation`` into a
# single space; built once so each call is a single pass over the text.
_PUNCTUATION_TO_SPACE = str.maketrans(dict.fromkeys(string.punctuation, ' '))


def remove_punctuation(text):
    """Replace ASCII punctuation with spaces and normalise whitespace.

    Every character of ``string.punctuation`` becomes a space; the result is
    then split on whitespace and re-joined with single spaces, which also
    drops leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (empty if nothing but punctuation/whitespace
        was given).
    """
    return ' '.join(text.translate(_PUNCTUATION_TO_SPACE).split())


def clean_text(text):
    """Reduce ``text`` to ASCII, drop URLs and normalise whitespace.

    Steps, in order:
      1. Discard every non-ASCII character. This removes CJK characters but
         also any accented letters still present in the text.
      2. Remove tokens starting with ``http`` up to the next whitespace.
      3. Turn newlines and tabs into spaces.
      4. Collapse runs of spaces into one and strip surrounding whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned ASCII-only string.
    """
    ascii_only = text.encode("ascii", errors="ignore").decode("ascii")
    without_urls = re.sub(r"http\S+", "", ascii_only)
    # A single pass handles both separators; a separate "\n\n" substitution
    # could never match once single newlines have been replaced.
    flattened = re.sub(r"[\n\t]", " ", without_urls)
    return re.sub(" +", " ", flattened).strip()


# Administrative-unit words to drop, in accented and unaccented spelling.
# The leading ``\b`` keeps a prefix from matching inside a longer word
# (e.g. "huyen " inside "khuyen ").
_ACCENTED_PREFIX_RE = re.compile(
    r'\b(tỉnh |thành phố |huyện |thị trấn |thị xã |phường |xã |quận |đường |phố |tp )',
    flags=re.IGNORECASE,
)
_UNACCENTED_PREFIX_RE = re.compile(
    r'\b(tinh |thanh pho |huyen |thi tran |thi xa |phuong |xa |quan |duong |pho |tp )',
    flags=re.IGNORECASE,
)


def remove_prefix(address):
    """Remove administrative-unit words (province, district, ward, ...) from ``address``.

    The accented pattern is used when the address contains accented letters,
    otherwise the unaccented one. Matching is case-insensitive and applies to
    every occurrence in the string, not only to the beginning. Each listed
    word must be followed by a space to be removed.

    Args:
        address: The address string to clean.

    Returns:
        The address without the listed words, stripped of surrounding
        whitespace.
    """
    # ``remove_accent`` only knows lowercase letters, so compare on the
    # lowercased text to detect accents in upper-case input as well.
    lowered = address.lower()
    has_accent = lowered != remove_accent(lowered)
    pattern = _ACCENTED_PREFIX_RE if has_accent else _UNACCENTED_PREFIX_RE
    return pattern.sub('', address).strip()
    

def clean_detail_address(detail_address):
    """Strip administrative prefixes and one trailing punctuation character.

    Args:
        detail_address: The raw detail part of an address.

    Returns:
        The result of ``remove_prefix`` with its last character removed when
        that character is in ``string.punctuation``. An empty result is
        returned unchanged.
    """
    detail_address = remove_prefix(detail_address)
    # Explicit emptiness check instead of swallowing the IndexError that
    # indexing an empty string would raise.
    if detail_address and detail_address[-1] in string.punctuation:
        detail_address = detail_address[:-1]
    return detail_address


def get_detail_address(address, std_address):
    """Extract the part of ``address`` that precedes the standardised address.

    The first whitespace-separated word of the first value in ``std_address``
    is used as a marker; everything in the lowercased ``address`` before the
    first occurrence of that marker is cleaned with ``clean_detail_address``
    and returned.

    Args:
        address: The raw address string.
        std_address: Mapping describing the standardised address. Assumes the
            first value is a string -- TODO confirm against callers.

    Returns:
        The cleaned, lowercased detail part, or ``''`` when the marker is not
        found or ``std_address`` provides no usable marker.
    """
    address = address.lower()

    # Guard against an empty mapping or a blank first value instead of
    # failing with IndexError.
    first_value = next(iter(std_address.values()), '')
    words = first_value.split()
    if not words:
        return ''

    # ``address`` was lowercased above, so the marker must be lowercased too
    # or a capitalised standard value could never match.
    split_token = words[0].lower()
    if address == remove_accent(address):
        # Unaccented input: compare against the unaccented marker.
        split_token = remove_accent(split_token)

    detail_address, separator, _ = address.partition(split_token)
    if not separator:
        # Marker not present in the address.
        return ''
    return clean_detail_address(detail_address)


def get_full_result(raw_address, std_address, score):
    """Bundle the matching outcome for one address into a dict.

    Returns a dict with three keys: ``detail_address`` (computed by
    ``get_detail_address`` from the raw and standardised address),
    ``main_address`` (``std_address`` as given) and ``similarity_score``
    (``score`` as given).
    """
    return {
        'detail_address': get_detail_address(raw_address, std_address),
        'main_address': std_address,
        'similarity_score': score,
    }


def save_result(file_path: Text, result: Dict) -> None:
    """Append ``result`` with a timestamp to the JSON log at ``file_path``.

    The file must already exist and contain a JSON array. The whole array is
    read, extended with ``{"result": ..., "created_at": "YYYY-MM-DD HH:MM:SS"}``
    (local time) and written back with ``ensure_ascii=False`` and a 4-space
    indent.

    Args:
        file_path: Path of the UTF-8 encoded JSON log file.
        result: JSON-serialisable data to record.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
        TypeError: If ``result`` is not JSON-serialisable; the existing file
            is left untouched in that case.
    """
    log_sample = {
        'result': result,
        'created_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    with open(file_path, "r", encoding="utf8") as log_file:
        logs = json.load(log_file)
    logs.append(log_sample)

    # Serialise before opening for writing: opening with "w" truncates the
    # file, so a serialisation error afterwards would wipe the existing log.
    serialized = json.dumps(logs, ensure_ascii=False, indent=4)
    with open(file_path, "w", encoding="utf8") as log_file:
        log_file.write(serialized)