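# NOTE: the following three lines are page-header residue from the listing this
# file was copied from; they are kept here as comments and are not program code.
# OttoYu's picture
# Update app.py
# c441b12 verified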
import json
from functools import lru_cache
import gradio as gr
from difflib import SequenceMatcher
@lru_cache(maxsize=1)
def load_json_file(json_file):
    """Read the UTF-8 encoded JSON file at ``json_file`` and return its decoded content.

    The call is memoised with ``lru_cache(maxsize=1)``: repeating the call with
    the same path returns the cached object without touching the disk, while a
    call with a different path replaces the single cached entry.
    """
    with open(json_file, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def preprocess_jyutping_data(jyutping_data):
    """Invert a syllable-keyed table into a per-character lookup.

    ``jyutping_data`` maps each syllable to a list of mappings. Every item
    found when iterating a mapping's ``"漢字"`` entry becomes a key of the
    returned dict, with the syllable as its value. When the same character is
    listed under several syllables, the syllable visited last wins.
    """
    lookup = {}
    for syllable, mappings in jyutping_data.items():
        for mapping in mappings:
            for char in mapping["漢字"]:
                lookup[char] = syllable
    return lookup
def chinese_to_jyutping(text, char_to_jyutping):
    """Translate ``text`` character by character using ``char_to_jyutping``.

    Returns a list with one entry per character of ``text``: the mapped value
    when the character is a key of ``char_to_jyutping``, otherwise the
    character itself, unchanged.
    """
    converted = []
    for char in text:
        converted.append(char_to_jyutping.get(char, char))
    return converted
def get_similar_initials():
    """Return a new dict mapping an initial to the initials treated as similar to it.

    A fresh dict (with fresh lists) is built on every call, so callers may
    mutate the result without affecting later calls.
    """
    pairs = (
        ('b', ['d', 'p']),
        ('c', ['s']),
        ('d', ['b', 't']),
        ('f', ['h']),
        ('g', ['gw']),
        ('gw', ['g']),
        ('h', ['f']),
        ('j', ['z']),
        ('jw', ['w']),
        ('l', ['n']),
        ('n', ['l']),
        ('ng', ['n']),
        ('p', ['b']),
        ('s', ['c']),
        ('t', ['d']),
        ('w', ['jw']),
        ('z', ['j']),
    )
    return dict(pairs)
def get_lazy_pronunciations():
    """Return a new dict mapping an initial to its "lazy pronunciation" substitutes.

    Each key is an initial and each value is the list of initials it may be
    replaced with. A fresh dict is built on every call.

    Note: the key ``'n'`` used to appear twice in the literal
    (``'n': ['l']`` and ``'n': ['ng']``); Python keeps only the last duplicate
    key, so the ``'n' -> 'l'`` substitution was silently dropped. Both
    substitutes are now listed under the single ``'n'`` entry.
    """
    return {
        'n': ['l', 'ng'],
        'l': ['n'],
        'gw': ['g'],
        'g': ['gw'],
        'k': ['t'],
        't': ['k'],
        'ng': ['n'],
    }
def are_jyutping_similar(jyutping1, jyutping2, similar_initials, lazy_pronunciations):
    """Tell whether two syllables start with the same or an interchangeable initial.

    Args:
        jyutping1: First syllable (a string).
        jyutping2: Second syllable (a string).
        similar_initials: Dict mapping an initial to a list of similar initials.
        lazy_pronunciations: Dict mapping an initial to a list of substitutes.

    Returns:
        ``True`` when the initials are equal, or when the second initial is
        listed for the first one in either table; ``False`` otherwise. An empty
        syllable has no initial and is never similar to anything.
    """
    # An empty string has no initial; the original indexing ([0]) raised
    # IndexError here.
    if not jyutping1 or not jyutping2:
        return False

    def _initial(syllable):
        # Prefer a two-letter initial when either table knows it, so that a
        # digraph defined only in lazy_pronunciations is not truncated to its
        # first letter (which would make its table entry unreachable).
        digraph = syllable[:2]
        if digraph in similar_initials or digraph in lazy_pronunciations:
            return digraph
        return syllable[:1]

    initial1 = _initial(jyutping1)
    initial2 = _initial(jyutping2)
    return (
        initial1 == initial2
        or initial2 in similar_initials.get(initial1, [])
        or initial2 in lazy_pronunciations.get(initial1, [])
    )
@lru_cache(maxsize=1)
def get_char_to_jyutping():
    """Build the character lookup from ``lexi-can_key.json`` (memoised).

    The file is loaded with ``load_json_file`` and converted with
    ``preprocess_jyutping_data``; because of ``lru_cache`` the work is done on
    the first call only and the same dict is returned afterwards.
    """
    return preprocess_jyutping_data(load_json_file('lexi-can_key.json'))
def calculate_phonetic_similarity(user_jyutping, result_jyutping, similar_initials, lazy_pronunciations):
    """Score how phonetically close two syllable sequences are.

    Every syllable of ``user_jyutping`` is compared with every syllable of
    ``result_jyutping`` via ``are_jyutping_similar``; the number of similar
    pairs is divided by the length of the longer sequence. Because all pairs
    are counted (not only aligned positions), the score can exceed 1.0.

    Args:
        user_jyutping: Sequence of syllables for the user's input.
        result_jyutping: Sequence of syllables for a candidate.
        similar_initials: Table passed through to ``are_jyutping_similar``.
        lazy_pronunciations: Table passed through to ``are_jyutping_similar``.

    Returns:
        The score as a float; ``0.0`` when both sequences are empty.
    """
    longest = max(len(user_jyutping), len(result_jyutping))
    if longest == 0:
        # Nothing to compare; the original divided by zero here.
        return 0.0
    similar_count = sum(
        1
        for uj in user_jyutping
        for rj in result_jyutping
        if are_jyutping_similar(uj, rj, similar_initials, lazy_pronunciations)
    )
    return similar_count / longest
def match_user_input(user_input):
    """Find the stored entry (or entries) that best match ``user_input``.

    The input is converted to a syllable list, then compared against the
    entries loaded from ``jyutping_results_largec.json``.

    Returns:
        * If some entry's ``"jyutping"`` contains every syllable of the input
          (a set-subset test, so order and repetition are ignored), a dict with
          keys ``input_text``, ``input_jyutping``, ``match`` (the first such
          entry) and ``match_type`` set to ``"exact"``.
        * Otherwise a dict with keys ``input_text``, ``input_jyutping`` and
          ``matches``: the three highest-scoring entries, each as
          ``{"match", "score", "match_type": "phonetic_similarity"}``.
    """
    # Keep this call order: the lookup table is loaded before the results file.
    char_to_jyutping = get_char_to_jyutping()
    similar_initials = get_similar_initials()
    lazy_pronunciations = get_lazy_pronunciations()
    saved_results = load_json_file('jyutping_results_largec.json')
    user_jyutping = chinese_to_jyutping(user_input, char_to_jyutping)

    # First pass: look for an entry covering every input syllable.
    wanted = set(user_jyutping)
    exact_match = None
    for candidate in saved_results:
        if wanted.issubset(candidate["jyutping"]):
            exact_match = candidate
            break

    if exact_match:
        return {
            "input_text": user_input,
            "input_jyutping": user_jyutping,
            "match": exact_match,
            "match_type": "exact"
        }

    def _score(entry):
        # Weighted blend: 60% phonetic, 30% literal text similarity,
        # 10% closeness in length.
        phonetic_score = calculate_phonetic_similarity(
            user_jyutping, entry["jyutping"], similar_initials, lazy_pronunciations
        )
        text_similarity = SequenceMatcher(None, user_input, entry["text"]).ratio()
        length_diff = abs(len(user_input) - len(entry["text"]))
        length_penalty = 1 / (1 + length_diff)
        return (phonetic_score * 0.6) + (text_similarity * 0.3) + (length_penalty * 0.1)

    # Second pass: rank every entry and keep the best three.
    scored = [(entry, _score(entry)) for entry in saved_results]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    best = []
    for entry, score in ranked[:3]:
        best.append({
            "match": entry,
            "score": score,
            "match_type": "phonetic_similarity"
        })

    return {
        "input_text": user_input,
        "input_jyutping": user_jyutping,
        "matches": best
    }
# Preset inputs offered to the user in the "Choose a Sample Case" dropdown of
# the Gradio interface defined below.
sample_cases = [
"龍民大廈", "得輔導西", "賀民天街", "荔枝支道", "黎知覺道", "元周街",
"謝非道", "金中道", "得立街", "地梨根得里"
]
def gradio_app(custom_input, sample_case):
    """Gradio callback: match the chosen text and return the result as a JSON string.

    Args:
        custom_input: Free text typed by the user (may be empty or ``None``).
        sample_case: Value picked from the sample dropdown (may be ``None``).
            When truthy it takes precedence over ``custom_input``.

    Returns:
        A JSON-encoded string. For an exact match it is the dict returned by
        ``match_user_input``; for fuzzy matches it is a trimmed view with
        ``text``, ``jyutping``, rounded ``score`` and ``match_type`` per
        candidate; when no input was given it is ``{"error": <message>}``.
    """
    user_input = sample_case if sample_case else custom_input
    if not user_input:
        # The interface shows this function's output in a gr.JSON component,
        # which decodes string outputs as JSON; a bare prose sentence is not
        # valid JSON, so the message is wrapped in an object.
        return json.dumps(
            {"error": "Please enter text or select a sample case."},
            ensure_ascii=False,
            indent=4,
        )

    result = match_user_input(user_input)
    if "match" in result:
        # Exact match: the result is already in its final shape.
        return json.dumps(result, ensure_ascii=False, indent=4)

    # Fuzzy matches: flatten each candidate and round the score for display.
    formatted_result = {
        "input_text": result["input_text"],
        "input_jyutping": result["input_jyutping"],
        "matches": [
            {
                "text": match["match"]["text"],
                "jyutping": match["match"]["jyutping"],
                "score": round(match["score"], 4),
                "match_type": match["match_type"],
            }
            for match in result["matches"]
        ],
    }
    return json.dumps(formatted_result, ensure_ascii=False, indent=4)
# Web UI: two inputs (free text and a sample dropdown) are passed to gradio_app,
# and its return value is shown in a JSON output component.
interface = gr.Interface(
fn=gradio_app,
inputs=[
gr.Textbox(placeholder="Enter text", label="Placename/Street/Building name"),
# The leading None choice lets the user clear the dropdown; gradio_app then
# falls back to the text box because a falsy sample_case is ignored.
gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")
],
outputs=gr.JSON(label="Matching Result"),
title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
description="Enter Cantonese text or select a sample case, and the app will return a match or the closest matches based on phonetic similarity. 輸入粵語文本或選擇一個範例案例,應用程式將傳回粵拼匹配或基於語音相似的最接近匹配。"
)
# Launched unconditionally at module level (there is no __main__ guard), so
# importing this module also starts the app.
interface.launch()