OttoYu committed on
Commit
ab38ebf
·
verified ·
1 Parent(s): c441b12

Upload CantonesePhonetics.js

Browse files
Files changed (1) hide show
  1. CantonesePhonetics.js +186 -0
CantonesePhonetics.js ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Phonetic matcher for Cantonese text based on Jyutping romanisation.
 *
 * The instance keeps two lookup structures that are empty after construction
 * and are populated by {@link CantonesePhonetics#initialize}:
 *  - `charToJyutping`: plain object mapping one character to one syllable;
 *  - `savedResults`: array of candidate entries; the matcher reads their
 *    `text` (string) and `jyutping` (array of syllables) properties.
 */
class CantonesePhonetics {
  constructor() {
    this.charToJyutping = {};
    this.savedResults = [];
    // Initials (onsets) that are treated as interchangeable when comparing.
    this.similarInitials = {
      b: ["p", "m"],
      c: ["z", "s"],
      d: ["t", "n"],
      f: ["h", "w"],
      g: ["k", "ng"],
      gw: ["kw"],
      h: ["f", "w"],
      j: ["z", "c"],
      k: ["g", "h"],
      kw: ["gw"],
      l: ["n"],
      m: ["n", "b"],
      n: ["l", "m", "ng"],
      ng: ["g", "n"],
      p: ["b", "m"],
      s: ["c", "z"],
      t: ["d", "n"],
      w: ["f", "h"],
      z: ["c", "j"]
    };
    // Finals (rimes, written without tone) that are treated as interchangeable.
    this.similarFinals = {
      aa: ["a", "aai", "aau"],
      aai: ["aa", "ai"],
      aau: ["aa", "au"],
      ai: ["ei", "aai"],
      au: ["ou", "aau"],
      e: ["i", "ei"],
      ei: ["ai", "i"],
      i: ["e", "ei", "yu"],
      o: ["u", "ou"],
      oi: ["ui"],
      ou: ["u", "au"],
      u: ["o", "ou", "yu"],
      ui: ["oi"],
      yu: ["i", "u"]
    };
  }

  /**
   * Downloads the character table and the saved results in parallel and
   * stores them on the instance. Nothing is assigned unless both downloads
   * succeed.
   *
   * @returns {Promise<void>}
   * @throws {Error} when a request fails or answers with a non-2xx status.
   * @throws {TypeError} when the saved results payload is not an array.
   */
  async initialize() {
    // fetch() only rejects on network failure; an HTTP error page would
    // otherwise surface later as an unrelated JSON parse error.
    const fetchJson = async url => {
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error(`Failed to load ${url}: HTTP ${response.status}`);
      }
      return response.json();
    };

    const [jyutpingData, results] = await Promise.all([
      fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/lexi-can_key.json"),
      fetchJson("https://huggingface.co/spaces/OttoYu/Cantonese-Phonetics/raw/main/jyutping_results_largec.json")
    ]);

    if (!Array.isArray(results)) {
      throw new TypeError("Saved results payload must be an array");
    }

    this.charToJyutping = this.preprocessJyutpingData(jyutpingData);
    this.savedResults = results;
  }

  /**
   * Inverts a `{ syllable: [{ "漢字": characters }, ...] }` table into a
   * `{ character: syllable }` lookup. When a character is listed under
   * several syllables, the one visited last wins.
   *
   * @param {Object} jyutpingData - table keyed by syllable.
   * @returns {Object} lookup keyed by single character (code point).
   */
  preprocessJyutpingData(jyutpingData) {
    const result = {};
    for (const [syllable, mappings] of Object.entries(jyutpingData || {})) {
      if (!Array.isArray(mappings)) continue;
      for (const mapping of mappings) {
        const characters = mapping ? mapping["漢字"] : undefined;
        // Skip malformed rows instead of aborting the whole table.
        if (typeof characters !== "string" && !Array.isArray(characters)) continue;
        for (const char of characters) {
          result[char] = syllable;
        }
      }
    }
    return result;
  }

  /**
   * Converts text to one entry per character: the mapped syllable when the
   * character is known, otherwise the character itself.
   *
   * @param {string} text
   * @returns {string[]}
   */
  chineseToJyutping(text) {
    // Array.from iterates by code point, so characters outside the BMP are
    // kept whole and match the keys built by preprocessJyutpingData.
    return Array.from(text, char => {
      const known = Object.prototype.hasOwnProperty.call(this.charToJyutping, char);
      return (known && this.charToJyutping[char]) || char;
    });
  }

  /**
   * Splits a syllable into initial, final and tone.
   *
   * The initial is only recognised when a vowel letter follows it, so
   * zero-initial syllables ("uk1", "aa3") and syllabic nasals ("m4", "ng5")
   * get an empty initial. Text that is not Jyutping ends up entirely in
   * `final`.
   *
   * @param {string} jyutping
   * @returns {{initial: string, final: string, tone: string}}
   */
  splitJyutping(jyutping) {
    const toneMatch = /[1-6]$/.exec(jyutping);
    const tone = toneMatch ? toneMatch[0] : "";
    const body = tone ? jyutping.slice(0, -1) : jyutping;
    // Two-letter initials must be tried before their one-letter prefixes.
    const onset = /^(?:ng|gw|kw|[bpmfdtnlgkhwzcsj])(?=[aeiouy])/.exec(body);
    const initial = onset ? onset[0] : "";
    return { initial, final: body.slice(initial.length), tone };
  }

  /**
   * Two syllables are similar when their initials are equal or listed as
   * similar, their finals are equal or listed as similar, and their tone
   * marks are identical (both may be absent).
   *
   * @param {string} jyutping1
   * @param {string} jyutping2
   * @returns {boolean}
   */
  areJyutpingSimilar(jyutping1, jyutping2) {
    const first = this.splitJyutping(jyutping1);
    const second = this.splitJyutping(jyutping2);

    // Own-property check keeps arbitrary text from hitting Object.prototype.
    const listed = (table, key, candidate) =>
      Object.prototype.hasOwnProperty.call(table, key) && table[key].includes(candidate);
    const related = (table, a, b) => a === b || listed(table, a, b) || listed(table, b, a);

    return (
      first.tone === second.tone &&
      related(this.similarInitials, first.initial, second.initial) &&
      related(this.similarFinals, first.final, second.final)
    );
  }

  /**
   * Fraction of syllables that can be paired between the two sequences.
   * Each result syllable is paired with at most one user syllable (greedy,
   * in order), so the score always lies in [0, 1].
   *
   * @param {string[]} userJyutping
   * @param {string[]} resultJyutping
   * @returns {number} 0 when both sequences are empty.
   */
  calculatePhoneticSimilarity(userJyutping, resultJyutping) {
    const longest = Math.max(userJyutping.length, resultJyutping.length);
    if (longest === 0) {
      return 0; // avoid 0 / 0 = NaN, which would corrupt sorting by score
    }

    const unpaired = resultJyutping.slice();
    let pairs = 0;
    for (const syllable of userJyutping) {
      const index = unpaired.findIndex(candidate => this.areJyutpingSimilar(syllable, candidate));
      if (index !== -1) {
        unpaired.splice(index, 1);
        pairs += 1;
      }
    }
    return pairs / longest;
  }

  /**
   * Case-insensitive similarity in [0, 1] derived from the edit distance,
   * measured in characters (code points). Two empty strings score 1.
   *
   * @param {string} s1
   * @param {string} s2
   * @returns {number}
   */
  similarity(s1, s2) {
    const longerLength = Math.max(
      Array.from(s1.toLowerCase()).length,
      Array.from(s2.toLowerCase()).length
    );
    if (longerLength === 0) {
      return 1.0;
    }
    return (longerLength - this.editDistance(s1, s2)) / longerLength;
  }

  /**
   * Case-insensitive Levenshtein distance counted in code points.
   *
   * @param {string} s1
   * @param {string} s2
   * @returns {number}
   */
  editDistance(s1, s2) {
    const source = Array.from(s1.toLowerCase());
    const target = Array.from(s2.toLowerCase());

    // previous[j] = distance between the processed prefix of source and target[0..j)
    let previous = Array.from({ length: target.length + 1 }, (_, j) => j);
    for (let i = 1; i <= source.length; i++) {
      const current = [i];
      for (let j = 1; j <= target.length; j++) {
        const substitution = previous[j - 1] + (source[i - 1] === target[j - 1] ? 0 : 1);
        current[j] = Math.min(substitution, previous[j] + 1, current[j - 1] + 1);
      }
      previous = current;
    }
    return previous[target.length];
  }

  /**
   * Looks up the saved entry that best matches the input.
   *
   * An entry is an "exact" match when every input syllable occurs in its
   * `jyutping` array; the first such entry is returned. Otherwise all entries
   * with both `text` and `jyutping` are scored
   * (70% phonetic, 20% text similarity, 10% length closeness) and the best
   * ones are returned.
   *
   * @param {string} userInput
   * @param {number} [maxMatches=3] - how many fuzzy matches to return.
   * @returns {{input_text: string, input_jyutping: string[], match: Object, match_type: string}
   *   | {input_text: string, input_jyutping: string[], matches: Object[]}}
   */
  matchUserInput(userInput, maxMatches = 3) {
    const userJyutping = this.chineseToJyutping(userInput);

    // every() is vacuously true on an empty list; without this guard empty
    // input would report the first saved entry as an exact match.
    if (userJyutping.length === 0) {
      return {
        input_text: userInput,
        input_jyutping: userJyutping,
        matches: []
      };
    }

    const exactMatch = this.savedResults.find(
      entry =>
        Boolean(entry) &&
        Array.isArray(entry.jyutping) &&
        userJyutping.every(syllable => entry.jyutping.includes(syllable))
    );

    if (exactMatch) {
      return {
        input_text: userInput,
        input_jyutping: userJyutping,
        match: exactMatch,
        match_type: "exact"
      };
    }

    const inputLength = Array.from(userInput).length;
    const scored = [];
    for (const result of this.savedResults) {
      if (!result || typeof result.text !== "string" || !result.text || !Array.isArray(result.jyutping)) {
        continue;
      }
      const phoneticScore = this.calculatePhoneticSimilarity(userJyutping, result.jyutping);
      const textSimilarity = this.similarity(userInput, result.text);
      const lengthDiff = Math.abs(inputLength - Array.from(result.text).length);
      const lengthPenalty = 1 / (1 + lengthDiff);

      scored.push({
        result,
        score: phoneticScore * 0.7 + textSimilarity * 0.2 + lengthPenalty * 0.1
      });
    }

    scored.sort((a, b) => b.score - a.score);

    return {
      input_text: userInput,
      input_jyutping: userJyutping,
      matches: scored.slice(0, maxMatches).map(entry => ({
        match: entry.result,
        score: entry.score,
        match_type: "phonetic_similarity"
      }))
    };
  }
}
185
+
186
// Module-level CantonesePhonetics instance. The constructor leaves
// charToJyutping and savedResults empty; they are filled by initialize().
+ const phonetics = new CantonesePhonetics();