Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,128 +1,150 @@
|
|
1 |
import json
|
2 |
from functools import lru_cache
|
3 |
import gradio as gr
|
|
|
4 |
|
5 |
@lru_cache(maxsize=1)
|
6 |
-
def
|
7 |
-
with open(json_file, 'r', encoding='utf-8') as file:
|
8 |
-
return json.load(file)
|
9 |
-
|
10 |
-
@lru_cache(maxsize=1)
|
11 |
-
def load_saved_results(json_file):
|
12 |
with open(json_file, 'r', encoding='utf-8') as file:
|
13 |
return json.load(file)
|
14 |
|
15 |
def preprocess_jyutping_data(jyutping_data):
|
16 |
-
|
17 |
-
|
18 |
-
for mapping in mappings
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
for text in text_batch:
|
26 |
-
jyutping_result = [char_to_jyutping.get(char, char) for char in text]
|
27 |
-
jyutping_split = list(set(jyutping_result))
|
28 |
-
results.append({
|
29 |
-
"chinese": list(text),
|
30 |
-
"jyutping": jyutping_split
|
31 |
-
})
|
32 |
-
return results
|
33 |
|
34 |
def get_similar_initials():
|
35 |
return {
|
36 |
-
'b': ['d', 'p'],
|
37 |
-
'
|
38 |
-
'
|
39 |
-
'
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
'
|
45 |
-
'
|
46 |
-
'
|
47 |
-
'ng': ['n'],
|
48 |
-
'p': ['b'],
|
49 |
-
's': ['c'],
|
50 |
-
't': ['d'],
|
51 |
-
'w': ['jw'],
|
52 |
-
'z': ['j']
|
53 |
}
|
54 |
|
55 |
-
|
|
|
56 |
initial1 = jyutping1[:2] if jyutping1[:2] in similar_initials else jyutping1[0]
|
57 |
initial2 = jyutping2[:2] if jyutping2[:2] in similar_initials else jyutping2[0]
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
def match_user_input(user_input):
|
61 |
-
|
62 |
similar_initials = get_similar_initials()
|
63 |
-
|
64 |
-
|
65 |
-
char_to_jyutping = preprocess_jyutping_data(jyutping_data)
|
66 |
-
jyutping_results = chinese_batch_to_jyutping([user_input], char_to_jyutping)
|
67 |
|
68 |
-
user_jyutping =
|
69 |
-
input_text = jyutping_results[0]["chinese"]
|
70 |
|
71 |
-
exact_match = next((result for result in saved_results
|
|
|
72 |
|
73 |
if exact_match:
|
74 |
-
return
|
75 |
-
"input_text":
|
76 |
"input_jyutping": user_jyutping,
|
77 |
-
"match": exact_match
|
78 |
-
|
79 |
-
|
80 |
-
closest_match = None
|
81 |
-
highest_similarity_score = 0
|
82 |
|
|
|
83 |
for result in saved_results:
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
-
return json.dumps({
|
98 |
-
"message": "No suitable match found. Please check the input or try different text."
|
99 |
-
}, ensure_ascii=False, indent=4)
|
100 |
|
101 |
sample_cases = [
|
102 |
-
"龍民大廈",
|
103 |
-
"
|
104 |
-
"賀民天街",
|
105 |
-
"荔枝支道",
|
106 |
-
"元周街",
|
107 |
-
"謝非道",
|
108 |
-
"金中道",
|
109 |
-
"得立街",
|
110 |
-
"地梨根得里"
|
111 |
]
|
112 |
|
113 |
-
|
|
|
114 |
user_input = sample_case if sample_case else custom_input
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
interface = gr.Interface(
|
118 |
fn=gradio_app,
|
119 |
inputs=[
|
120 |
-
gr.Textbox(placeholder="
|
121 |
gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")
|
122 |
],
|
123 |
outputs=gr.JSON(label="Matching Result"),
|
124 |
title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
|
125 |
-
description="
|
126 |
)
|
127 |
|
128 |
-
interface.launch()
|
|
|
1 |
import json
|
2 |
from functools import lru_cache
|
3 |
import gradio as gr
|
4 |
+
from difflib import SequenceMatcher
|
5 |
|
6 |
@lru_cache(maxsize=8)
def load_json_file(json_file):
    """Load and parse a UTF-8 encoded JSON file, memoizing the result per path.

    This loader is called with more than one path in this file (the lexicon
    and the saved results), so the cache must hold more than a single entry;
    with ``maxsize=1`` each load evicted the other file's parsed content.

    Args:
        json_file: Path of the JSON file to read. Must be hashable because it
            is used as the cache key.

    Returns:
        The decoded JSON value. The same object is returned on every cache
        hit, so callers should treat it as read-only.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)
|
10 |
|
11 |
def preprocess_jyutping_data(jyutping_data):
    """Invert a syllable-keyed lexicon into a character -> syllable lookup.

    Args:
        jyutping_data: Mapping whose keys are syllables and whose values are
            iterables of entries; each entry is indexed with ``"漢字"`` and the
            items of that value become keys of the result.

    Returns:
        dict mapping each item found under ``"漢字"`` to the syllable it was
        listed under. When an item appears under several syllables, the one
        encountered last in iteration order wins.
    """
    lookup = {}
    for syllable, entries in jyutping_data.items():
        for entry in entries:
            for char in entry["漢字"]:
                # Plain assignment: a later syllable overwrites an earlier one.
                lookup[char] = syllable
    return lookup
|
16 |
+
|
17 |
+
|
18 |
+
def chinese_to_jyutping(text, char_to_jyutping):
    """Translate ``text`` item by item through the ``char_to_jyutping`` table.

    Args:
        text: Iterable of characters (typically a string).
        char_to_jyutping: Mapping from a character to its syllable.

    Returns:
        A list with one element per item of ``text``: the mapped syllable, or
        the item itself when the table has no entry for it.
    """
    lookup = char_to_jyutping.get
    # Unknown characters fall through unchanged by using them as the default.
    return [lookup(symbol, symbol) for symbol in text]
|
20 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
def get_similar_initials():
    """Return the table of initials treated as similar to each other.

    Returns:
        A freshly built dict mapping an initial to the list of initials that
        are considered similar to it.
    """
    similar = {
        'b': ['d', 'p'],
        'c': ['s'],
        'd': ['b', 't'],
        'f': ['h'],
        'g': ['gw'],
        'gw': ['g'],
        'h': ['f'],
        'j': ['z'],
        'jw': ['w'],
        'l': ['n'],
        'n': ['l'],
        'ng': ['n'],
        'p': ['b'],
        's': ['c'],
        't': ['d'],
        'w': ['jw'],
        'z': ['j'],
    }
    return similar
|
29 |
+
|
30 |
+
def get_lazy_pronunciations():
    """Return the table of initials that are interchangeable in lazy speech.

    The previous literal listed the key ``'n'`` twice (``['l']`` and then
    ``['ng']``). A dict literal keeps only the last value for a repeated key,
    so the n -> l pair was silently dropped. Both alternatives are now held
    in a single entry.

    Returns:
        A freshly built dict mapping an initial to the list of initials it
        may be confused with.
    """
    return {
        'n': ['l', 'ng'],
        'l': ['n'],
        'gw': ['g'],
        'g': ['gw'],
        'k': ['t'],
        't': ['k'],
        'ng': ['n'],
    }
|
37 |
|
38 |
+
|
39 |
+
def are_jyutping_similar(jyutping1, jyutping2, similar_initials, lazy_pronunciations):
    """Tell whether two syllables start with the same or a confusable initial.

    Only the initial of each syllable is compared; the rest of the syllable is
    ignored.

    Args:
        jyutping1: First syllable string.
        jyutping2: Second syllable string.
        similar_initials: Mapping of an initial to the initials similar to it.
        lazy_pronunciations: Mapping of an initial to its lazy-speech variants.

    Returns:
        True when the initials are equal, or when the second initial is listed
        for the first one in either table. False otherwise, including when
        either syllable is empty.
    """
    # An empty syllable has no initial; indexing [0] would raise IndexError.
    if not jyutping1 or not jyutping2:
        return False

    def leading_initial(syllable):
        # Prefer a two-letter initial when either table knows it, so a digraph
        # listed only in lazy_pronunciations is not cut down to one letter.
        digraph = syllable[:2]
        if digraph in similar_initials or digraph in lazy_pronunciations:
            return digraph
        return syllable[0]

    initial1 = leading_initial(jyutping1)
    initial2 = leading_initial(jyutping2)

    return (initial1 == initial2 or
            initial2 in similar_initials.get(initial1, ()) or
            initial2 in lazy_pronunciations.get(initial1, ()))
|
46 |
+
|
47 |
+
|
48 |
+
@lru_cache(maxsize=1)
def get_char_to_jyutping():
    """Build the character -> syllable lookup from ``lexi-can_key.json``.

    The function takes no arguments, so the cache keeps the single computed
    table and later calls return that same object.

    Returns:
        The value produced by ``preprocess_jyutping_data`` for the loaded
        lexicon file.
    """
    return preprocess_jyutping_data(load_json_file('lexi-can_key.json'))
|
52 |
+
|
53 |
+
|
54 |
+
def calculate_phonetic_similarity(user_jyutping, result_jyutping, similar_initials, lazy_pronunciations):
    """Score how well the user's syllables are covered by a candidate's.

    Each user syllable counts at most once: it scores when at least one
    candidate syllable is similar to it according to ``are_jyutping_similar``.
    Counting every (user, candidate) pair instead, as before, let the score
    grow beyond 1.0 whenever several syllables shared an initial, which
    distorted the weighted total computed by the caller.

    Args:
        user_jyutping: Sequence of syllables derived from the user input.
        result_jyutping: Sequence of syllables of the candidate entry.
        similar_initials: Mapping of an initial to the initials similar to it.
        lazy_pronunciations: Mapping of an initial to its lazy-speech variants.

    Returns:
        A float in the range [0.0, 1.0]: matched user syllables divided by the
        length of the longer sequence. 0.0 when both sequences are empty.
    """
    longest = max(len(user_jyutping), len(result_jyutping))
    if longest == 0:
        # Nothing to compare; avoids ZeroDivisionError.
        return 0.0

    matched = sum(
        1 for uj in user_jyutping
        if any(
            are_jyutping_similar(uj, rj, similar_initials, lazy_pronunciations)
            for rj in result_jyutping
        )
    )
    return matched / longest
|
60 |
+
|
61 |
|
62 |
def match_user_input(user_input):
    """Find the saved entry (or the three closest entries) for ``user_input``.

    The input is converted to syllables. The first saved entry whose
    ``"jyutping"`` collection contains every user syllable is returned as an
    exact match. Otherwise every saved entry is scored and the three best are
    returned.

    The combined score is ``0.6 * phonetic + 0.3 * text + 0.1 * length``:
    phonetic is ``calculate_phonetic_similarity``, text is the
    ``SequenceMatcher`` ratio between the input and the entry's ``"text"``, and
    length is ``1 / (1 + |len difference|)``.

    Args:
        user_input: Text entered by the user.

    Returns:
        On an exact match, a dict with keys ``input_text``, ``input_jyutping``,
        ``match`` and ``match_type`` (``"exact"``). Otherwise a dict with keys
        ``input_text``, ``input_jyutping`` and ``matches``, where ``matches``
        is a list of up to three dicts with keys ``match``, ``score`` and
        ``match_type`` (``"phonetic_similarity"``), best first. Empty input
        yields an empty ``matches`` list.
    """
    # Function-scope import keeps this block independent of the file header.
    import heapq

    # An empty syllable set is a subset of every entry, so empty input would
    # otherwise be reported as an "exact" match of the first saved entry.
    if not user_input:
        return {
            "input_text": user_input,
            "input_jyutping": [],
            "matches": []
        }

    char_to_jyutping = get_char_to_jyutping()
    similar_initials = get_similar_initials()
    lazy_pronunciations = get_lazy_pronunciations()
    saved_results = load_json_file('jyutping_results_largec.json')

    user_jyutping = chinese_to_jyutping(user_input, char_to_jyutping)

    # Built once instead of once per candidate inside the generator.
    user_syllables = set(user_jyutping)
    exact_match = next((result for result in saved_results
                        if user_syllables.issubset(result["jyutping"])), None)

    if exact_match:
        return {
            "input_text": user_input,
            "input_jyutping": user_jyutping,
            "match": exact_match,
            "match_type": "exact"
        }

    matches = []
    for result in saved_results:
        phonetic_score = calculate_phonetic_similarity(user_jyutping, result["jyutping"], similar_initials,
                                                       lazy_pronunciations)
        text_similarity = SequenceMatcher(None, user_input, result["text"]).ratio()
        length_diff = abs(len(user_input) - len(result["text"]))
        length_penalty = 1 / (1 + length_diff)

        total_score = (phonetic_score * 0.6) + (text_similarity * 0.3) + (length_penalty * 0.1)
        matches.append((result, total_score))

    # Equivalent to sorting by score descending and slicing the first three,
    # without ordering the whole candidate list.
    top_matches = heapq.nlargest(3, matches, key=lambda pair: pair[1])

    return {
        "input_text": user_input,
        "input_jyutping": user_jyutping,
        "matches": [
            {
                "match": match[0],
                "score": match[1],
                "match_type": "phonetic_similarity"
            } for match in top_matches
        ]
    }
|
106 |
|
|
|
|
|
|
|
107 |
|
108 |
# Example inputs offered in the "Choose a Sample Case" dropdown.
sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "黎知覺道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里",
]
|
112 |
|
113 |
+
|
114 |
+
def gradio_app(custom_input, sample_case):
    """Gradio callback: match the chosen text and return the result as JSON.

    A selected sample case takes precedence over the free-text input.

    Args:
        custom_input: Text typed by the user; may be empty or None.
        sample_case: Sample selected in the dropdown; may be None.

    Returns:
        A JSON document as a string. It is one of: the exact-match result as
        returned by ``match_user_input``; an object with ``input_text``,
        ``input_jyutping`` and a ``matches`` list whose items have ``text``,
        ``jyutping``, ``score`` (rounded to 4 decimals) and ``match_type``; or
        ``{"message": ...}`` when no text was supplied.
    """
    # Whitespace-only input is treated the same as no input.
    user_input = (sample_case or custom_input or "").strip()
    if not user_input:
        # The output component is a JSON viewer, so the hint must itself be
        # valid JSON; a bare sentence cannot be parsed as a JSON document.
        return json.dumps({
            "message": "Please enter text or select a sample case."
        }, ensure_ascii=False, indent=4)

    result = match_user_input(user_input)

    if "match" in result:
        return json.dumps(result, ensure_ascii=False, indent=4)

    formatted_result = {
        "input_text": result["input_text"],
        "input_jyutping": result["input_jyutping"],
        "matches": [
            {
                "text": match["match"]["text"],
                "jyutping": match["match"]["jyutping"],
                "score": round(match["score"], 4),
                "match_type": match["match_type"]
            } for match in result["matches"]
        ]
    }
    return json.dumps(formatted_result, ensure_ascii=False, indent=4)
|
137 |
+
|
138 |
|
139 |
# Gradio front end: a free-text box plus a dropdown of sample cases feed
# gradio_app, whose output is shown in a JSON viewer.
interface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(
            label="Placename/Street/Building name",
            placeholder="Enter text",
        ),
        # The leading None entry lets the user leave the dropdown unselected.
        gr.Dropdown(
            label="Choose a Sample Case",
            choices=[None, *sample_cases],
        ),
    ],
    outputs=gr.JSON(label="Matching Result"),
    title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
    description=(
        "Enter Cantonese text or select a sample case, and the app will return a match or the closest matches based on phonetic similarity. "
        "輸入粵語文本或選擇一個範例案例,應用程式將傳回粵拼匹配或基於語音相似的最接近匹配。"
    ),
)

# Start the web server as soon as the module is executed.
interface.launch()
|