Sam Passaglia committed on
Commit
9d2f9e8
1 Parent(s): 65d65b7
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. yomikata/dictionary.py +116 -3
requirements.txt CHANGED
@@ -16,4 +16,5 @@ datasets>=2.7.1
16
  pynvml==11.4.1
17
  sentencepiece>=0.1.97
18
  streamlit==1.17.0
19
- rich
 
16
  pynvml==11.4.1
17
  sentencepiece>=0.1.97
18
  streamlit==1.17.0
19
+ rich
20
+ altair<5
yomikata/dictionary.py CHANGED
@@ -3,11 +3,16 @@ dictionary.py
3
  Provides the Dictionary class which implements Reader using dictionary lookup.
4
  """
5
 
 
 
 
 
6
  from speach import ttlig
7
- from config.config import ASCII_SPACE_TOKEN
 
8
  from yomikata import utils
 
9
  from yomikata.reader import Reader
10
- import jaconv
11
 
12
 
13
  class Dictionary(Reader):
@@ -93,6 +98,114 @@ class Dictionary(Reader):
93
  if (surface == kana) or pos in ["記号", "補助記号", "特殊"]:
94
  output += surface
95
  else:
96
- output += ttlig.RubyToken.from_furi(surface, kana).to_code()
97
  output = output.replace(ASCII_SPACE_TOKEN, " ")
98
  return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  Provides the Dictionary class which implements Reader using dictionary lookup.
4
  """
5
 
6
+ from difflib import ndiff
7
+
8
+ import jaconv
9
+ from chirptext import deko
10
  from speach import ttlig
11
+ from speach.ttlig import RubyFrag, RubyToken
12
+
13
  from yomikata import utils
14
+ from config.config import ASCII_SPACE_TOKEN
15
  from yomikata.reader import Reader
 
16
 
17
 
18
  class Dictionary(Reader):
98
  if (surface == kana) or pos in ["記号", "補助記号", "特殊"]:
99
  output += surface
100
  else:
101
+ output += Dictionary.furi_to_ruby(surface, kana).to_code()
102
  output = output.replace(ASCII_SPACE_TOKEN, " ")
103
  return output
104
+
105
@staticmethod
def furi_to_ruby(surface: str, kana: str) -> "RubyToken":
    """Combine a surface string and a kana string to a RubyToken object with furigana.

    The trailing characters that ``surface`` and ``kana`` share are split off
    first and re-attached as plain text at the end. The remainder is aligned
    character by character with ``difflib.ndiff``: characters only present in
    ``surface`` are collected as base text, characters only present in ``kana``
    are collected as the furigana for that base text, and characters common to
    both are emitted as plain text.

    Args:
        surface (str): Surface string
        kana (str): Kana string

    Returns:
        RubyToken: RubyToken object with furigana

    Raises:
        AssertionError: If the reading reassembled from the token's groups
            does not equal ``kana``.

    This code is modified from the version in the speach library:
    https://github.com/neocl/speach/
    https://github.com/neocl/speach/blob/main/speach/ttlig.py
    :copyright: (c) 2018 Le Tuan Anh <tuananh.ke@gmail.com>
    :license: MIT
    """

    def common_substring_from_right(string1, string2):
        """Return the longest suffix shared by both strings ("" if none)."""
        i = -1  # start from the end of strings
        while -i <= min(len(string1), len(string2)):
            if string1[i] != string2[i]:  # if characters don't match, break
                break
            i -= 1  # decrement i to move towards start
        # i == -1 means not even the last characters matched; string1[0:]
        # would wrongly return the whole string in that case.
        return string1[i + 1 :] if i != -1 else ""

    def assert_rubytoken_kana_match(ruby: RubyToken, kana: str) -> None:
        """Check that the readings stored in ``ruby`` concatenate to ``kana``."""
        reading = "".join(
            [token.furi if isinstance(token, RubyFrag) else token for token in ruby.groups]
        )
        assert reading == kana, f"ruby reading {reading!r} does not match kana {kana!r}"

    original_kana = kana

    final_text = common_substring_from_right(surface, kana)

    if final_text:
        # Strip the shared suffix so that it is not part of the alignment below.
        surface = surface[: -len(final_text)]
        kana = kana[: -len(final_text)]

    # NOTE(review): the token is created from the surface *after* the shared
    # suffix was stripped - confirm RubyToken.surface is not expected to hold
    # the full original string.
    ruby = RubyToken(surface=surface)
    if deko.is_kana(surface):
        # Nothing needs furigana: emit the remaining surface (and the suffix) as is.
        ruby.append(surface)
        if final_text:
            ruby.append(final_text)
        assert_rubytoken_kana_match(ruby, original_kana)
        return ruby

    # ndiff over two strings yields one item per character, prefixed with
    # "- " (only in surface), "+ " (only in kana) or two spaces (in both).
    edit_seq = ndiff(surface, kana)
    kanji = ""  # pending base text that still needs a reading
    text = ""  # pending plain text shared by surface and kana
    furi = ""  # pending reading for `kanji`
    before = ""  # ndiff code of the previous item
    expected = ""  # shared character that was consumed as furigana ("shifting")
    for item in edit_seq:
        if item.startswith("- "):
            # flush text if needed
            if expected and kanji and furi:
                ruby.append(RubyFrag(text=kanji, furi=furi))
                kanji = ""
                furi = ""
            if text:
                ruby.append(text)
                text = ""
            kanji += item[2:]
        elif item.startswith("+ "):
            if expected and item[2:] == expected:
                # The shifted character shows up again on the kana side:
                # close the pending fragment and emit the character as text.
                if expected and kanji and furi:
                    ruby.append(RubyFrag(text=kanji, furi=furi))
                    kanji = ""
                    furi = ""
                ruby.append(item[2:])
                expected = ""
            else:
                furi += item[2:]
        elif item.startswith(" "):
            if before == "-" and not furi:
                # shifting happened
                expected = item[2:]
                furi += item[2:]
            else:
                text += item[2:]
                # flush if possible
                if kanji and furi:
                    ruby.append(RubyFrag(text=kanji, furi=furi))
                    kanji = ""
                    furi = ""
                else:
                    # possible error?
                    pass
        before = item[0]  # end for
    if kanji:
        if furi:
            ruby.append(RubyFrag(text=kanji, furi=furi))
        else:
            ruby.append(kanji)
    elif text:
        ruby.append(text)

    if final_text:
        ruby.append(final_text)

    assert_rubytoken_kana_match(ruby, original_kana)
    return ruby