# Spaces: Runtime error  (page-header residue captured with the source; not part of the program)
import heapq
import json
from difflib import SequenceMatcher
from functools import lru_cache

import gradio as gr
def load_json_file(json_file):
    """Read a UTF-8 encoded JSON document from disk and return the parsed data.

    Args:
        json_file: Path of the JSON file to open.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(json_file, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
def preprocess_jyutping_data(jyutping_data):
    """Invert a syllable-keyed lexicon into a character-keyed lookup.

    Args:
        jyutping_data: Mapping of syllable -> list of entries. Each entry is
            indexed with the key "漢字" and the value found there is iterated
            to obtain individual characters.

    Returns:
        A dict of character -> syllable. When a character is listed under
        several syllables, the syllable encountered last wins.
    """
    char_lookup = {}
    for syllable, entries in jyutping_data.items():
        for entry in entries:
            for hanzi in entry["漢字"]:
                # Plain assignment: a later syllable overwrites an earlier one.
                char_lookup[hanzi] = syllable
    return char_lookup
def chinese_to_jyutping(text, char_to_jyutping):
    """Translate a string character by character using a lookup table.

    Args:
        text: The string to convert.
        char_to_jyutping: Mapping of character -> romanised syllable.

    Returns:
        A list with one item per character of ``text``: the mapped syllable,
        or the character itself when the table has no entry for it.
    """
    syllables = []
    for symbol in text:
        syllables.append(char_to_jyutping.get(symbol, symbol))
    return syllables
def get_similar_initials():
    """Return the table of initials the matcher treats as confusable.

    Returns:
        A freshly built dict of initial -> list of initials considered
        similar to it. The table is directional: an initial listed as an
        alternative is not necessarily mapped back.
    """
    confusable = (
        ('b', ('d', 'p')),
        ('c', ('s',)),
        ('d', ('b', 't')),
        ('f', ('h',)),
        ('g', ('gw',)),
        ('gw', ('g',)),
        ('h', ('f',)),
        ('j', ('z',)),
        ('jw', ('w',)),
        ('l', ('n',)),
        ('n', ('l',)),
        ('ng', ('n',)),
        ('p', ('b',)),
        ('s', ('c',)),
        ('t', ('d',)),
        ('w', ('jw',)),
        ('z', ('j',)),
    )
    # Lists are created per call so callers never share mutable state.
    return {initial: list(neighbours) for initial, neighbours in confusable}
def get_lazy_pronunciations():
    """Return the table of "lazy pronunciation" initial substitutions.

    Returns:
        A freshly built dict of initial -> list of initials that the matcher
        accepts in its place.
    """
    return {
        # 'n' must list both alternatives in ONE entry: a dict literal with
        # two 'n' keys keeps only the last one, which would drop 'n' -> 'l'.
        'n': ['l', 'ng'],
        'l': ['n'],
        'gw': ['g'],
        'g': ['gw'],
        'k': ['t'],
        't': ['k'],
        'ng': ['n'],
    }
def are_jyutping_similar(jyutping1, jyutping2, similar_initials, lazy_pronunciations):
    """Tell whether two syllables begin with the same or a confusable initial.

    Only the initial is compared; the remainder of each syllable is ignored.
    A syllable's initial is its first two characters when that pair is a key
    of ``similar_initials``, otherwise its first character.

    The table lookups are directional: the initial of ``jyutping2`` is searched
    among the alternatives listed for the initial of ``jyutping1``.

    Args:
        jyutping1: First syllable.
        jyutping2: Second syllable.
        similar_initials: Mapping of initial -> list of similar initials.
        lazy_pronunciations: Mapping of initial -> list of accepted
            substitutions.

    Returns:
        True when the initials are equal or related through either table;
        False otherwise, including when either syllable is empty.
    """
    # An empty syllable has no initial; indexing [0] on it would raise.
    if not jyutping1 or not jyutping2:
        return False

    def _initial(syllable):
        digraph = syllable[:2]
        return digraph if digraph in similar_initials else syllable[0]

    initial1 = _initial(jyutping1)
    initial2 = _initial(jyutping2)
    return (initial1 == initial2 or
            initial2 in similar_initials.get(initial1, []) or
            initial2 in lazy_pronunciations.get(initial1, []))
@lru_cache(maxsize=1)
def get_char_to_jyutping():
    """Load 'lexi-can_key.json' and build the character -> syllable lookup.

    The result is memoised so the file is read and inverted once per process
    instead of on every call. A failed load is not cached (exceptions
    propagate and the next call retries).

    Returns:
        The dict produced by ``preprocess_jyutping_data``. Every caller gets
        the same dict object and must treat it as read-only.
    """
    jyutping_data = load_json_file('lexi-can_key.json')
    return preprocess_jyutping_data(jyutping_data)
def calculate_phonetic_similarity(user_jyutping, result_jyutping, similar_initials, lazy_pronunciations):
    """Score how alike two syllable sequences sound.

    Every syllable of ``user_jyutping`` is compared with every syllable of
    ``result_jyutping`` (all pairs, not position by position). The number of
    pairs judged similar by ``are_jyutping_similar`` is divided by the length
    of the longer sequence.

    NOTE(review): because all pairs are counted, the value can exceed 1.0 when
    a similar initial occurs several times in both sequences — confirm whether
    a positional comparison was intended.

    Args:
        user_jyutping: Sequence of syllables for the user's input.
        result_jyutping: Sequence of syllables for a candidate.
        similar_initials: Mapping passed through to ``are_jyutping_similar``.
        lazy_pronunciations: Mapping passed through to
            ``are_jyutping_similar``.

    Returns:
        The ratio described above as a float; 0.0 when both sequences are
        empty.
    """
    longest = max(len(user_jyutping), len(result_jyutping))
    if longest == 0:
        # Nothing to compare; avoids dividing by zero.
        return 0.0
    similar_count = sum(
        1 for uj in user_jyutping for rj in result_jyutping
        if are_jyutping_similar(uj, rj, similar_initials, lazy_pronunciations)
    )
    return similar_count / longest
def _score_candidate(user_input, user_jyutping, result, similar_initials, lazy_pronunciations):
    """Blend phonetic, textual and length similarity into one ranking score."""
    phonetic_score = calculate_phonetic_similarity(
        user_jyutping, result["jyutping"], similar_initials, lazy_pronunciations)
    text_similarity = SequenceMatcher(None, user_input, result["text"]).ratio()
    length_diff = abs(len(user_input) - len(result["text"]))
    length_penalty = 1 / (1 + length_diff)
    # Weights: 60% phonetic, 30% literal text overlap, 10% length closeness.
    return (phonetic_score * 0.6) + (text_similarity * 0.3) + (length_penalty * 0.1)


def match_user_input(user_input):
    """Find the stored record(s) that best match ``user_input`` by sound.

    The saved records are read from 'jyutping_results_largec.json'; each
    record is indexed with the keys "jyutping" and "text".
    (presumably "jyutping" holds a list of syllables — verify against the
    data file.)

    Matching runs in two stages:

    1. "exact": the first record whose "jyutping" contains every syllable of
       the input (a subset test, so order and repetition are ignored).
    2. Otherwise every record is scored and the three best are returned.

    Args:
        user_input: The text typed or selected by the user.

    Returns:
        A dict with "input_text" and "input_jyutping", plus either
        "match"/"match_type" (stage 1) or a "matches" list of up to three
        ``{"match", "score", "match_type"}`` dicts, best first (stage 2).
    """
    char_to_jyutping = get_char_to_jyutping()
    similar_initials = get_similar_initials()
    lazy_pronunciations = get_lazy_pronunciations()
    saved_results = load_json_file('jyutping_results_largec.json')
    user_jyutping = chinese_to_jyutping(user_input, char_to_jyutping)

    exact_match = None
    # An empty set is a subset of everything, so skip stage 1 for empty input
    # instead of reporting the first record as an "exact" match.
    if user_jyutping:
        wanted = set(user_jyutping)  # built once, not once per record
        exact_match = next(
            (result for result in saved_results
             if wanted.issubset(result["jyutping"])),
            None,
        )
    if exact_match is not None:
        return {
            "input_text": user_input,
            "input_jyutping": user_jyutping,
            "match": exact_match,
            "match_type": "exact"
        }

    scored = [
        (result, _score_candidate(user_input, user_jyutping, result,
                                  similar_initials, lazy_pronunciations))
        for result in saved_results
    ]
    # Same result as sorting by score descending and slicing [:3], without
    # ordering the whole list.
    top_matches = heapq.nlargest(3, scored, key=lambda pair: pair[1])
    return {
        "input_text": user_input,
        "input_jyutping": user_jyutping,
        "matches": [
            {
                "match": record,
                "score": score,
                "match_type": "phonetic_similarity"
            } for record, score in top_matches
        ]
    }
# Preset inputs offered in the "Choose a Sample Case" dropdown.
sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "黎知覺道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里",
]
def gradio_app(custom_input, sample_case):
    """Gradio callback: match the chosen text and return the result as JSON.

    A selected sample case takes precedence over the free-text box.

    Args:
        custom_input: Text typed by the user (may be empty or None).
        sample_case: Entry picked from the sample dropdown (may be empty or
            None).

    Returns:
        A JSON-encoded string in every case: the match result, the list of
        closest matches with scores rounded to 4 decimals, or an object with
        an "error" message when no usable input was given.
    """
    # Tolerate None from either widget and ignore surrounding whitespace.
    user_input = (sample_case or custom_input or "").strip()
    if not user_input:
        # This value is shown in a gr.JSON output, which expects valid JSON
        # text, so the prompt is wrapped in an object rather than returned as
        # a bare sentence.
        return json.dumps(
            {"error": "Please enter text or select a sample case."},
            ensure_ascii=False, indent=4)

    result = match_user_input(user_input)
    if "match" in result:
        return json.dumps(result, ensure_ascii=False, indent=4)

    formatted_result = {
        "input_text": result["input_text"],
        "input_jyutping": result["input_jyutping"],
        "matches": [
            {
                "text": match["match"]["text"],
                "jyutping": match["match"]["jyutping"],
                "score": round(match["score"], 4),
                "match_type": match["match_type"]
            } for match in result["matches"]
        ]
    }
    return json.dumps(formatted_result, ensure_ascii=False, indent=4)
# Web UI: a free-text box plus a sample dropdown feed `gradio_app`; its
# return value is shown in a JSON viewer. The server starts when this runs.
_APP_TITLE = "Cantonese Homophone and Phonetic Matching 粵語同音異字處理"
_APP_DESCRIPTION = (
    "Enter Cantonese text or select a sample case, and the app will return a match or the closest matches based on phonetic similarity. "
    "輸入粵語文本或選擇一個範例案例,應用程式將傳回粵拼匹配或基於語音相似的最接近匹配。"
)
_text_input = gr.Textbox(placeholder="Enter text", label="Placename/Street/Building name")
# A leading None entry lets the user leave the dropdown unselected.
_sample_picker = gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")

interface = gr.Interface(
    fn=gradio_app,
    inputs=[_text_input, _sample_picker],
    outputs=gr.JSON(label="Matching Result"),
    title=_APP_TITLE,
    description=_APP_DESCRIPTION,
)
interface.launch()