Spaces:

OttoYu
/

Cantonese-Phonetics

Runtime error

App Files Files Community

Cantonese-Phonetics / app.py

OttoYu

Update app.py

c441b12 verified 11 months ago

raw

history blame contribute delete

5.29 kB

	import json
	from functools import lru_cache
	import gradio as gr
	from difflib import SequenceMatcher

	@lru_cache(maxsize=8)
	def load_json_file(json_file):
	    """Read and parse a UTF-8 encoded JSON file, caching the result per path.

	    The cache holds several entries because this function is called with
	    more than one path; with a single slot, alternating between two files
	    would evict each other and re-read the file on every call.

	    Args:
	        json_file: Path of the JSON file to load (must be hashable).

	    Returns:
	        The decoded JSON document. The same object is handed back on every
	        cache hit, so callers should treat it as read-only.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file does not contain valid JSON.
	    """
	    with open(json_file, 'r', encoding='utf-8') as file:
	        return json.load(file)

	def preprocess_jyutping_data(jyutping_data):
	    """Invert a syllable-keyed table into a character -> syllable lookup.

	    Args:
	        jyutping_data: Mapping of syllable to a list of entries, each entry
	            exposing an iterable of characters under the key "漢字".

	    Returns:
	        A dict mapping every character to a syllable. When a character is
	        listed under several syllables, the one encountered last wins.
	    """
	    lookup = {}
	    for syllable, entries in jyutping_data.items():
	        for entry in entries:
	            for hanzi in entry["漢字"]:
	                lookup[hanzi] = syllable
	    return lookup


	def chinese_to_jyutping(text, char_to_jyutping):
	    """Translate each character of ``text`` through the lookup table.

	    Args:
	        text: Iterable of characters (typically a string).
	        char_to_jyutping: Mapping of character to syllable.

	    Returns:
	        A list with one item per input character: the mapped syllable, or
	        the character itself when the table has no entry for it.
	    """
	    syllables = []
	    for symbol in text:
	        syllables.append(char_to_jyutping.get(symbol, symbol))
	    return syllables


	def get_similar_initials():
	    """Return a fresh table of initials regarded as similar to each other.

	    Each key is an initial and its value lists the initials accepted as
	    similar to it. The relation is not symmetric for every entry (for
	    example 'ng' lists 'n', but 'n' only lists 'l').

	    Returns:
	        A newly built dict of initial -> list of similar initials.
	    """
	    pairs = (
	        ('b', ['d', 'p']),
	        ('c', ['s']),
	        ('d', ['b', 't']),
	        ('f', ['h']),
	        ('g', ['gw']),
	        ('gw', ['g']),
	        ('h', ['f']),
	        ('j', ['z']),
	        ('jw', ['w']),
	        ('l', ['n']),
	        ('n', ['l']),
	        ('ng', ['n']),
	        ('p', ['b']),
	        ('s', ['c']),
	        ('t', ['d']),
	        ('w', ['jw']),
	        ('z', ['j']),
	    )
	    return dict(pairs)

	def get_lazy_pronunciations():
	    """Return a fresh table of "lazy pronunciation" initial substitutions.

	    Each key is an initial and its value lists the initials it may be
	    swapped with. The key 'n' previously appeared twice in the literal, so
	    its first value (['l']) was silently discarded in favour of ['ng'];
	    both alternatives are now kept in a single entry.

	    Returns:
	        A newly built dict of initial -> list of substitute initials.
	    """
	    return {
	        'n': ['l', 'ng'], 'l': ['n'],
	        'gw': ['g'], 'g': ['gw'],
	        'k': ['t'], 't': ['k'],
	        'ng': ['n'],
	    }


	def _leading_initial(jyutping, similar_initials):
	    """Return the two-character prefix if it is a known initial, else the first character."""
	    prefix = jyutping[:2]
	    return prefix if prefix in similar_initials else jyutping[0]


	def are_jyutping_similar(jyutping1, jyutping2, similar_initials, lazy_pronunciations):
	    """Tell whether two syllables start with the same or a confusable initial.

	    Only the initial is compared; the remainder of each syllable is ignored.
	    A two-character initial is recognised only when it is a key of
	    ``similar_initials``; otherwise the first character is used.

	    Args:
	        jyutping1: First syllable (string).
	        jyutping2: Second syllable (string).
	        similar_initials: Mapping of initial -> list of similar initials.
	        lazy_pronunciations: Mapping of initial -> list of substitutes.

	    Returns:
	        True when the initials are equal, or when the second initial is
	        listed for the first one in either table. False otherwise, including
	        when either syllable is empty (there is no initial to compare, and
	        indexing an empty string would raise IndexError).
	    """
	    if not jyutping1 or not jyutping2:
	        return False

	    initial1 = _leading_initial(jyutping1, similar_initials)
	    initial2 = _leading_initial(jyutping2, similar_initials)

	    return (initial1 == initial2 or
	            initial2 in similar_initials.get(initial1, []) or
	            initial2 in lazy_pronunciations.get(initial1, []))


	@lru_cache(maxsize=1)
	def get_char_to_jyutping():
	    """Build the character -> syllable lookup from 'lexi-can_key.json'.

	    The result is cached, so the file is loaded and inverted only on the
	    first call.

	    Returns:
	        The dict produced by ``preprocess_jyutping_data``.
	    """
	    return preprocess_jyutping_data(load_json_file('lexi-can_key.json'))


	def calculate_phonetic_similarity(user_jyutping, result_jyutping, similar_initials, lazy_pronunciations):
	    """Score how many syllable pairs of two sequences have similar initials.

	    Every syllable of ``user_jyutping`` is compared against every syllable
	    of ``result_jyutping`` (all pairs, not position by position), and the
	    number of similar pairs is divided by the length of the longer
	    sequence. Because all pairs are counted, the score can exceed 1.0.

	    Args:
	        user_jyutping: Sized sequence of syllables for the user's text.
	        result_jyutping: Sized sequence of syllables for a candidate.
	        similar_initials: Table passed through to ``are_jyutping_similar``.
	        lazy_pronunciations: Table passed through to ``are_jyutping_similar``.

	    Returns:
	        The similarity score as a float; 0.0 when both sequences are empty
	        (previously this case raised ZeroDivisionError).
	    """
	    longest = max(len(user_jyutping), len(result_jyutping))
	    if longest == 0:
	        # Nothing to compare; avoid dividing by zero.
	        return 0.0

	    similar_count = sum(
	        1 for uj in user_jyutping for rj in result_jyutping
	        if are_jyutping_similar(uj, rj, similar_initials, lazy_pronunciations)
	    )
	    return similar_count / longest


	def match_user_input(user_input):
	    """Find the stored entry (or entries) that best match ``user_input``.

	    The input is converted to syllables, then the records loaded from
	    'jyutping_results_largec.json' are searched in order. The first record
	    whose "jyutping" contains every distinct input syllable (a subset
	    test, so order and repetition are ignored) is reported as an "exact"
	    match. If there is none, every record is scored as

	        0.6 * phonetic score + 0.3 * text ratio + 0.1 * length penalty

	    and the three highest-scoring records are returned.

	    Args:
	        user_input: The text to look up.

	    Returns:
	        A dict with "input_text" and "input_jyutping", plus either
	        "match"/"match_type" for an exact hit, or "matches" (a list of up
	        to three dicts with "match", "score" and "match_type").
	    """
	    char_table = get_char_to_jyutping()
	    initials_table = get_similar_initials()
	    lazy_table = get_lazy_pronunciations()
	    candidates = load_json_file('jyutping_results_largec.json')

	    user_jyutping = chinese_to_jyutping(user_input, char_table)

	    # First pass: stop at the first record containing all input syllables.
	    wanted = set(user_jyutping)
	    for candidate in candidates:
	        if wanted.issubset(candidate["jyutping"]):
	            return {
	                "input_text": user_input,
	                "input_jyutping": user_jyutping,
	                "match": candidate,
	                "match_type": "exact"
	            }

	    def overall_score(candidate):
	        # Weighted blend of phonetic, textual and length-based similarity.
	        phonetic = calculate_phonetic_similarity(
	            user_jyutping, candidate["jyutping"], initials_table, lazy_table
	        )
	        textual = SequenceMatcher(None, user_input, candidate["text"]).ratio()
	        penalty = 1 / (1 + abs(len(user_input) - len(candidate["text"])))
	        return (phonetic * 0.6) + (textual * 0.3) + (penalty * 0.1)

	    # Second pass: rank every record and keep the best three.
	    scored = [(candidate, overall_score(candidate)) for candidate in candidates]
	    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

	    return {
	        "input_text": user_input,
	        "input_jyutping": user_jyutping,
	        "matches": [
	            {
	                "match": candidate,
	                "score": score,
	                "match_type": "phonetic_similarity"
	            }
	            for candidate, score in ranked[:3]
	        ]
	    }


	# Example inputs offered in the UI dropdown.
	sample_cases = [
	    "龍民大廈",
	    "得輔導西",
	    "賀民天街",
	    "荔枝支道",
	    "黎知覺道",
	    "元周街",
	    "謝非道",
	    "金中道",
	    "得立街",
	    "地梨根得里",
	]


	def gradio_app(custom_input, sample_case):
	    """Gradio callback: look up the chosen text and return the result as JSON.

	    A selected sample case takes precedence over the free-text input.

	    Args:
	        custom_input: Text typed by the user (may be empty or None).
	        sample_case: Entry picked from the sample dropdown (may be None).

	    Returns:
	        A JSON-encoded string. With no usable input it is an object holding
	        an "error" message; for an exact hit it is the raw result of
	        ``match_user_input``; otherwise it is a flattened list of the
	        closest matches with scores rounded to four decimals.
	    """
	    user_input = sample_case if sample_case else custom_input
	    if not user_input:
	        # The output component is gr.JSON, which expects a valid JSON string;
	        # a bare sentence would not parse, so wrap the message in an object.
	        return json.dumps(
	            {"error": "Please enter text or select a sample case."},
	            ensure_ascii=False,
	            indent=4,
	        )

	    result = match_user_input(user_input)

	    if "match" in result:
	        return json.dumps(result, ensure_ascii=False, indent=4)

	    formatted_result = {
	        "input_text": result["input_text"],
	        "input_jyutping": result["input_jyutping"],
	        "matches": [
	            {
	                "text": match["match"]["text"],
	                "jyutping": match["match"]["jyutping"],
	                "score": round(match["score"], 4),
	                "match_type": match["match_type"]
	            } for match in result["matches"]
	        ]
	    }
	    return json.dumps(formatted_result, ensure_ascii=False, indent=4)


	# Build the Gradio UI: a text box and a sample-case dropdown are passed to
	# gradio_app, and its return value is shown in a JSON output component.
	interface = gr.Interface(
	fn=gradio_app,
	inputs=[
	gr.Textbox(placeholder="Enter text", label="Placename/Street/Building name"),
	# The leading None gives the dropdown an empty choice; gradio_app falls
	# back to the text box whenever the selected sample is falsy.
	gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")
	],
	outputs=gr.JSON(label="Matching Result"),
	title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
	description="Enter Cantonese text or select a sample case, and the app will return a match or the closest matches based on phonetic similarity. 輸入粵語文本或選擇一個範例案例，應用程式將傳回粵拼匹配或基於語音相似的最接近匹配。"
	)

	# Launch the app as soon as this module is executed.
	interface.launch()