OttoYu committed on
Commit
c441b12
·
verified ·
1 Parent(s): 30cac2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -86
app.py CHANGED
@@ -1,128 +1,150 @@
1
  import json
2
  from functools import lru_cache
3
  import gradio as gr
 
4
 
5
  @lru_cache(maxsize=1)
6
- def load_lexi_can_key(json_file):
7
- with open(json_file, 'r', encoding='utf-8') as file:
8
- return json.load(file)
9
-
10
- @lru_cache(maxsize=1)
11
- def load_saved_results(json_file):
12
  with open(json_file, 'r', encoding='utf-8') as file:
13
  return json.load(file)
14
 
15
  def preprocess_jyutping_data(jyutping_data):
16
- char_to_jyutping = {}
17
- for syllable, mappings in jyutping_data.items():
18
- for mapping in mappings:
19
- for char in mapping["漢字"]:
20
- char_to_jyutping.setdefault(char, syllable)
21
- return char_to_jyutping
22
-
23
- def chinese_batch_to_jyutping(text_batch, char_to_jyutping):
24
- results = []
25
- for text in text_batch:
26
- jyutping_result = [char_to_jyutping.get(char, char) for char in text]
27
- jyutping_split = list(set(jyutping_result))
28
- results.append({
29
- "chinese": list(text),
30
- "jyutping": jyutping_split
31
- })
32
- return results
33
 
34
  def get_similar_initials():
35
  return {
36
- 'b': ['d', 'p'],
37
- 'c': ['s'],
38
- 'd': ['b', 't'],
39
- 'f': ['h'],
40
- 'g': ['gw'],
41
- 'gw': ['g'],
42
- 'h': ['f'],
43
- 'j': ['z'],
44
- 'jw': ['w'],
45
- 'l': ['n'],
46
- 'n': ['l'],
47
- 'ng': ['n'],
48
- 'p': ['b'],
49
- 's': ['c'],
50
- 't': ['d'],
51
- 'w': ['jw'],
52
- 'z': ['j']
53
  }
54
 
55
- def are_jyutping_similar(jyutping1, jyutping2, similar_initials):
 
56
  initial1 = jyutping1[:2] if jyutping1[:2] in similar_initials else jyutping1[0]
57
  initial2 = jyutping2[:2] if jyutping2[:2] in similar_initials else jyutping2[0]
58
- return initial1 == initial2 or initial2 in similar_initials.get(initial1, [])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  def match_user_input(user_input):
61
- jyutping_data = load_lexi_can_key('lexi-can_key.json')
62
  similar_initials = get_similar_initials()
63
- saved_results = load_saved_results('jyutping_results_largec.json')
64
-
65
- char_to_jyutping = preprocess_jyutping_data(jyutping_data)
66
- jyutping_results = chinese_batch_to_jyutping([user_input], char_to_jyutping)
67
 
68
- user_jyutping = jyutping_results[0]["jyutping"]
69
- input_text = jyutping_results[0]["chinese"]
70
 
71
- exact_match = next((result for result in saved_results if set(user_jyutping).issubset(result["jyutping"])), None)
 
72
 
73
  if exact_match:
74
- return json.dumps({
75
- "input_text": input_text,
76
  "input_jyutping": user_jyutping,
77
- "match": exact_match
78
- }, ensure_ascii=False, indent=4)
79
-
80
- closest_match = None
81
- highest_similarity_score = 0
82
 
 
83
  for result in saved_results:
84
- score = sum(1 for jyutping in result["jyutping"] for uj in user_jyutping if
85
- are_jyutping_similar(uj, jyutping, similar_initials))
86
- if score > highest_similarity_score:
87
- highest_similarity_score = score
88
- closest_match = result
89
-
90
- if closest_match:
91
- return json.dumps({
92
- "input_text": input_text,
93
- "input_jyutping": user_jyutping,
94
- "closest_match": closest_match
95
- }, ensure_ascii=False, indent=4)
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- return json.dumps({
98
- "message": "No suitable match found. Please check the input or try different text."
99
- }, ensure_ascii=False, indent=4)
100
 
101
  sample_cases = [
102
- "龍民大廈",
103
- "得輔導西",
104
- "賀民天街",
105
- "荔枝支道",
106
- "元周街",
107
- "謝非道",
108
- "金中道",
109
- "得立街",
110
- "地梨根得里"
111
  ]
112
 
113
- def gradio_app(sample_case, custom_input):
 
114
  user_input = sample_case if sample_case else custom_input
115
- return match_user_input(user_input)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  interface = gr.Interface(
118
  fn=gradio_app,
119
  inputs=[
120
- gr.Textbox(placeholder="Or enter text", label="Placename/Street/Building name"),
121
  gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case")
122
  ],
123
  outputs=gr.JSON(label="Matching Result"),
124
  title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
125
- description="Select a sample case or enter Cantonese text, and the app will return a match or the closest match based on phonetic similarity. 選擇一個範例案例或輸入粵語文本,應用程式將傳回粵拼匹配或基於語音相似性的最接近匹配。"
126
  )
127
 
128
- interface.launch()
 
1
  import json
2
  from functools import lru_cache
3
  import gradio as gr
4
+ from difflib import SequenceMatcher
5
 
6
@lru_cache(maxsize=8)
def load_json_file(json_file):
    """Load and parse a UTF-8 encoded JSON file, memoising the result per path.

    This helper is called with more than one path in this file (the character
    table and the saved results). A single-slot cache would let alternating
    paths evict each other and force a re-read from disk, so the cache keeps a
    small number of entries instead.

    Args:
        json_file: Path of the JSON file to read. It is used as the cache key
            and therefore must be hashable.

    Returns:
        The decoded JSON document. The very same object is handed back on every
        cache hit, so callers must treat it as read-only.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)
10
 
11
def preprocess_jyutping_data(jyutping_data):
    """Invert a syllable table into a character -> syllable lookup.

    Args:
        jyutping_data: Mapping of syllable -> list of entries, where each entry
            exposes an iterable of characters under the "漢字" key
            (presumably a string or list of single characters; verify against
            the data file).

    Returns:
        A dict mapping each character to one syllable. When a character is
        listed under several syllables, the syllable encountered last wins.
    """
    lookup = {}
    for syllable, entries in jyutping_data.items():
        for entry in entries:
            for hanzi in entry["漢字"]:
                # Plain assignment: a later syllable overwrites an earlier one.
                lookup[hanzi] = syllable
    return lookup
16
+
17
+
18
def chinese_to_jyutping(text, char_to_jyutping):
    """Transliterate ``text`` one character at a time.

    Args:
        text: The string to convert.
        char_to_jyutping: Mapping of character -> syllable.

    Returns:
        A list with one item per character of ``text``: the mapped syllable,
        or the character itself when the mapping has no entry for it.
    """
    syllables = []
    for char in text:
        syllables.append(char_to_jyutping.get(char, char))
    return syllables
20
+
 
 
 
 
 
 
 
 
21
 
22
def get_similar_initials():
    """Return the table of initials treated as similar to each other.

    Returns:
        A freshly built dict mapping an initial to the list of initials that
        count as similar to it. The relation is stored per direction; an entry
        for ``a -> b`` does not imply one for ``b -> a``.
    """
    pairs = (
        ('b', ('d', 'p')),
        ('c', ('s',)),
        ('d', ('b', 't')),
        ('f', ('h',)),
        ('g', ('gw',)),
        ('gw', ('g',)),
        ('h', ('f',)),
        ('j', ('z',)),
        ('jw', ('w',)),
        ('l', ('n',)),
        ('n', ('l',)),
        ('ng', ('n',)),
        ('p', ('b',)),
        ('s', ('c',)),
        ('t', ('d',)),
        ('w', ('jw',)),
        ('z', ('j',)),
    )
    # Build new list objects on every call so callers never share state.
    return {initial: list(similar) for initial, similar in pairs}
29
+
30
def get_lazy_pronunciations():
    """Return the table of initials that are interchangeable as "lazy" variants.

    Each key appears exactly once. A dict literal silently keeps only the last
    value for a repeated key, so both alternatives for ``'n'`` are listed in a
    single entry rather than in two entries that would overwrite each other.

    Returns:
        A freshly built dict mapping an initial to the list of initials it may
        be swapped with.
    """
    return {
        'n': ['l', 'ng'],
        'l': ['n'],
        'gw': ['g'],
        'g': ['gw'],
        'k': ['t'],
        't': ['k'],
        'ng': ['n'],
    }
37
 
38
+
39
def are_jyutping_similar(jyutping1, jyutping2, similar_initials, lazy_pronunciations):
    """Tell whether two syllables start with the same or an interchangeable initial.

    Only the initial of each syllable is inspected. The first two characters
    are taken as the initial when they form a key of either table; otherwise
    the first character is used.

    Args:
        jyutping1: First syllable (the lookup side of both tables).
        jyutping2: Second syllable.
        similar_initials: Mapping of initial -> list of similar initials.
        lazy_pronunciations: Mapping of initial -> list of interchangeable
            initials.

    Returns:
        True when the initials are equal, or when the second initial is listed
        for the first one in either table. The lookup is directional: only the
        entries stored under the first syllable's initial are consulted.
        False when either syllable is empty, since it has no initial.
    """
    # An empty syllable has no initial; indexing it would raise IndexError.
    if not jyutping1 or not jyutping2:
        return False

    def _initial(syllable):
        # Prefer a two-letter initial when either table knows it as a key.
        digraph = syllable[:2]
        if digraph in similar_initials or digraph in lazy_pronunciations:
            return digraph
        return syllable[0]

    initial1 = _initial(jyutping1)
    initial2 = _initial(jyutping2)

    return (
        initial1 == initial2
        or initial2 in similar_initials.get(initial1, ())
        or initial2 in lazy_pronunciations.get(initial1, ())
    )
46
+
47
+
48
@lru_cache(maxsize=1)
def get_char_to_jyutping():
    """Build the character -> syllable lookup from 'lexi-can_key.json'.

    The function takes no arguments and is memoised, so the file is loaded and
    preprocessed on the first call only; later calls return the same dict.

    Returns:
        The mapping produced by ``preprocess_jyutping_data`` for the loaded file.
    """
    return preprocess_jyutping_data(load_json_file('lexi-can_key.json'))
52
+
53
+
54
def calculate_phonetic_similarity(user_jyutping, result_jyutping, similar_initials, lazy_pronunciations):
    """Score how well two syllable sequences agree phonetically, in [0.0, 1.0].

    A syllable counts as matched when at least one syllable of the other
    sequence is similar to it according to ``are_jyutping_similar``. The score
    is the smaller of the two matched counts divided by the longer sequence
    length. Counting every similar *pair* instead would let one syllable be
    counted many times and push the score above 1.0, which would distort any
    weighted blend that expects a 0..1 term.

    Args:
        user_jyutping: Syllables of the user's input.
        result_jyutping: Syllables of the candidate record.
        similar_initials: Table forwarded to ``are_jyutping_similar``.
        lazy_pronunciations: Table forwarded to ``are_jyutping_similar``.

    Returns:
        A float between 0.0 and 1.0. 0.0 is returned when either sequence is
        empty (this also avoids dividing by zero when both are empty).
    """
    if not user_jyutping or not result_jyutping:
        return 0.0

    def _similar(user_syllable, result_syllable):
        # Keep the argument order (user first): the table lookup is directional.
        return are_jyutping_similar(
            user_syllable, result_syllable, similar_initials, lazy_pronunciations
        )

    matched_user = sum(
        1 for uj in user_jyutping
        if any(_similar(uj, rj) for rj in result_jyutping)
    )
    matched_result = sum(
        1 for rj in result_jyutping
        if any(_similar(uj, rj) for uj in user_jyutping)
    )

    # min(...) can never exceed the shorter length, so the ratio stays <= 1.0.
    return min(matched_user, matched_result) / max(len(user_jyutping), len(result_jyutping))
60
+
61
 
62
def match_user_input(user_input):
    """Look up ``user_input`` among the saved records by pronunciation.

    The input is transliterated character by character, then compared with the
    records loaded from 'jyutping_results_largec.json'. Each record is read
    through its "jyutping" and "text" keys.

    Args:
        user_input: The text to match.

    Returns:
        A dict, in one of two shapes:

        * ``{"input_text", "input_jyutping", "match", "match_type": "exact"}``
          for the first record whose "jyutping" contains every syllable of the
          input;
        * ``{"input_text", "input_jyutping", "matches": [...]}`` otherwise,
          where ``matches`` holds up to three
          ``{"match", "score", "match_type": "phonetic_similarity"}`` items in
          descending score order. ``matches`` is empty for empty input.
    """
    # An empty syllable set is a subset of everything, so empty input would be
    # reported as an "exact" match of the first record. Answer it explicitly.
    if not user_input:
        return {
            "input_text": user_input,
            "input_jyutping": [],
            "matches": []
        }

    char_to_jyutping = get_char_to_jyutping()
    similar_initials = get_similar_initials()
    lazy_pronunciations = get_lazy_pronunciations()
    saved_results = load_json_file('jyutping_results_largec.json')

    user_jyutping = chinese_to_jyutping(user_input, char_to_jyutping)
    # Build the set once instead of once per record.
    user_syllables = set(user_jyutping)

    # NOTE(review): "exact" means subset containment, not equality, so a short
    # input also matches longer records that merely contain its syllables.
    exact_match = next(
        (result for result in saved_results
         if user_syllables.issubset(result["jyutping"])),
        None
    )

    if exact_match:
        return {
            "input_text": user_input,
            "input_jyutping": user_jyutping,
            "match": exact_match,
            "match_type": "exact"
        }

    matches = []
    for result in saved_results:
        phonetic_score = calculate_phonetic_similarity(
            user_jyutping, result["jyutping"], similar_initials, lazy_pronunciations
        )
        text_similarity = SequenceMatcher(None, user_input, result["text"]).ratio()
        # 1.0 for equal lengths, shrinking as the lengths diverge.
        length_diff = abs(len(user_input) - len(result["text"]))
        length_penalty = 1 / (1 + length_diff)

        total_score = (phonetic_score * 0.6) + (text_similarity * 0.3) + (length_penalty * 0.1)
        matches.append((result, total_score))

    # Stable sort: records with equal scores keep their file order.
    matches.sort(key=lambda x: x[1], reverse=True)
    top_matches = matches[:3]

    return {
        "input_text": user_input,
        "input_jyutping": user_jyutping,
        "matches": [
            {
                "match": match[0],
                "score": match[1],
                "match_type": "phonetic_similarity"
            } for match in top_matches
        ]
    }
106
 
 
 
 
107
 
108
# Example inputs offered in the dropdown of the web interface.
sample_cases = [
    "龍民大廈",
    "得輔導西",
    "賀民天街",
    "荔枝支道",
    "黎知覺道",
    "元周街",
    "謝非道",
    "金中道",
    "得立街",
    "地梨根得里",
]
112
 
113
+
114
def gradio_app(custom_input, sample_case):
    """Gradio callback: match the chosen text and return the result as JSON text.

    Args:
        custom_input: Free text typed by the user (may be empty or None).
        sample_case: Selected sample case (may be empty or None). When set, it
            takes precedence over ``custom_input``.

    Returns:
        A JSON-encoded string in every case, because the value is displayed by
        a JSON output component:

        * ``{"message": ...}`` when no usable input was given;
        * the exact-match result as returned by ``match_user_input``;
        * otherwise a flattened list of the closest matches with scores
          rounded to four decimals.
    """
    # Whitespace-only text is not a usable query; treat it like no input.
    if isinstance(custom_input, str):
        custom_input = custom_input.strip()

    user_input = sample_case if sample_case else custom_input
    if not user_input:
        # A bare sentence is not valid JSON and cannot be parsed by the JSON
        # output component, so the hint is wrapped in a JSON object.
        return json.dumps(
            {"message": "Please enter text or select a sample case."},
            ensure_ascii=False,
            indent=4
        )

    result = match_user_input(user_input)

    if "match" in result:
        return json.dumps(result, ensure_ascii=False, indent=4)

    formatted_result = {
        "input_text": result["input_text"],
        "input_jyutping": result["input_jyutping"],
        "matches": [
            {
                "text": match["match"]["text"],
                "jyutping": match["match"]["jyutping"],
                "score": round(match["score"], 4),
                "match_type": match["match_type"]
            } for match in result["matches"]
        ]
    }
    return json.dumps(formatted_result, ensure_ascii=False, indent=4)
137
+
138
 
139
# Web UI definition. The order of `inputs` must match the positional
# parameters of `gradio_app` (free text first, sample case second).
interface = gr.Interface(
    title="Cantonese Homophone and Phonetic Matching 粵語同音異字處理",
    description="Enter Cantonese text or select a sample case, and the app will return a match or the closest matches based on phonetic similarity. 輸入粵語文本或選擇一個範例案例,應用程式將傳回粵拼匹配或基於語音相似的最接近匹配。",
    fn=gradio_app,
    inputs=[
        gr.Textbox(placeholder="Enter text", label="Placename/Street/Building name"),
        gr.Dropdown(choices=[None] + sample_cases, label="Choose a Sample Case"),
    ],
    outputs=gr.JSON(label="Matching Result"),
)

# Start serving the interface as soon as the module is executed.
interface.launch()