freemt commited on
Commit
3d38118
1 Parent(s): dc641b6

Update df_aligned file_csv file_xlsx

Browse files
radiobee/__main__.py CHANGED
@@ -1,6 +1,8 @@
1
  """Run interactively."""
2
  from typing import Tuple # , Optional
3
 
 
 
4
  import joblib
5
  from random import randint
6
  from textwrap import dedent
@@ -25,6 +27,9 @@ from radiobee.process_upload import process_upload
25
  from radiobee.files2df import files2df
26
  from radiobee.file2text import file2text
27
  from radiobee.lists2cmat import lists2cmat
 
 
 
28
 
29
  # from radiobee.plot_df import plot_df
30
  from radiobee.cmat2tset import cmat2tset
@@ -124,32 +129,30 @@ if __name__ == "__main__":
124
  gr.inputs.Slider(
125
  minimum=1,
126
  maximum=20,
127
- step=1,
128
- default=6,
129
- # label="suggested min_samples value: 4-8",
130
  ),
131
  gr.inputs.Slider(
132
  minimum=1,
133
  maximum=20,
134
- step=0.1,
135
- default=10,
136
- # label="suggested esp value: 1.7-3",
137
  ),
138
  ]
139
 
140
  # modi
141
  examples = [
142
- ["data/test_zh.txt", "data/test_en.txt", "linear", "None", "None", "None", 6, 10, ],
143
- ["data/test_en.txt", "data/test_zh.txt", "linear", "None", "None", "None", 6, 10, ],
144
- ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", "linear", "None", "None", "None", 6, 10, ],
145
- ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", "linear", "None", "None", "None", 6, 10, ],
146
- ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", "linear", "None", "None", "None", 6, 10, ],
147
- ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", "linear", "None", "None", "None", 6, 10, ],
148
  ]
149
  outputs = ["dataframe", "plot"]
150
  outputs = ["plot"]
151
  outputs = ["dataframe", "plot"]
152
- out1 = gr.outputs.Dataframe(
153
  headers=None,
154
  max_rows=12, # 20
155
  max_cols=None,
@@ -157,9 +160,28 @@ if __name__ == "__main__":
157
  type="auto",
158
  label="To be aligned",
159
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  outputs = [
161
- out1,
162
  "plot",
 
 
 
163
  ]
164
  # outputs = ["dataframe", "plot", "plot"] # wont work
165
  # outputs = ["dataframe"]
@@ -174,14 +196,14 @@ if __name__ == "__main__":
174
  idf_type,
175
  dl_type,
176
  norm,
 
177
  min_samples,
178
- eps
179
  ):
180
  # modi fn
181
- """Process inputs."""
182
  logger.debug(" *debug* ")
183
 
184
- # cnover "None" to None
185
  for _ in [idf_type, dl_type, norm]:
186
  if _ in "None":
187
  _ = None
@@ -190,159 +212,86 @@ if __name__ == "__main__":
190
  logger.info("file1.name: *%s*, file2.name: *%s*", file1.name, file2.name)
191
 
192
  # bypass if file1 or file2 is str input
193
- if not (isinstance(file1, str) or isinstance(file2, str)):
194
- text1 = file2text(file1)
195
- text2 = file2text(file2)
196
- lang1, _ = fastlid(text1)
197
- lang2, _ = fastlid(text2)
198
-
199
- df1 = files2df(file1, file2)
200
-
201
- lst1 = [elm for elm in df1.text1 if elm]
202
- lst2 = [elm for elm in df1.text2 if elm]
203
- len1 = len(lst1)
204
- len2 = len(lst2)
205
-
206
- # this wont work
207
- # for obj in [text1, text2, df1, lst1, lst2, ]:
208
- # savelzma(text1) wont work
209
-
210
- # for debugging
211
- # joblib.dump(text1, f"data/{nameof(text1)}.lzma")
212
- # joblib.dump(text2, f"data/{nameof(text2)}.lzma")
213
- # joblib.dump(df1, f"data/{nameof(df1)}.lzma")
214
- # joblib.dump(lst1, f"data/{nameof(lst1)}.lzma")
215
- # joblib.dump(lst2, f"data/{nameof(lst2)}.lzma")
216
-
217
- # modi typing https://textacy.readthedocs.io/en/stable/api_reference/representations.html
218
- # tf_type: Literal[linear, sqrt, log, binary] = 'linear'
219
- # idf_tyep: Optional[Literal[standard, smooth, bm25]] = None
220
- # dl_type: Optional[Literal[linear, sqrt, log]] = None
221
- # norm: norm: Optional[Literal[l1, l2]] = None
222
- # min_df: int | float = 1
223
- # max_df: int | float = 1.0
224
-
225
- # cmat = lists2cmat(lst1, lst2)
226
- cmat = lists2cmat(
227
- lst1,
228
- lst2,
229
- tf_type=tf_type,
230
- idf_type=idf_type,
231
- dl_type=dl_type,
232
- norm=norm,
233
- )
234
-
235
- tset = pd.DataFrame(cmat2tset(cmat))
236
- tset.columns = ["x", "y", "cos"]
237
-
238
- # for debugging, logger.debug logger.info dont show up
239
- # print("lst1: %s" % lst1)
240
- # print("lst2: %s" % lst2)
241
- # print("cmat: %s" % cmat)
242
- # print("tset: %s" % tset)
243
-
244
- logger.debug("lst1: %s", lst1)
245
- logger.debug("lst2: %s", lst2)
246
- logger.debug("cmat: %s", cmat)
247
- logger.debug("tset: %s", tset)
248
-
249
- # plt0 = plot_df(pd.DataFrame(cmat))
250
- df_ = tset
251
-
252
- # moved to inputs
253
- # min_samples: int = 6
254
- # eps: float = 10
255
-
256
- # ylim: Optional[int] = None
257
- xlabel: str = lang1
258
- ylabel: str = lang2
259
-
260
- sns.set()
261
- sns.set_style("darkgrid")
262
-
263
- # fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11.69, 8.27))
264
- # fig, ([ax2, ax0], [ax1, ax3]) = plt.subplots(2, 2, figsize=(11.69, 8.27))
265
- # fig, (ax2, ax0, ax1) = plt.subplots(3)
266
- # fig, (ax2, ax0, ax1) = plt.subplots(3, figsize=(11.69, 8.27))
267
- # fig, (ax2, ax0, ax1) = plt.subplots(1, 3, figsize=(36.69, 8.27))
268
- # fig, (ax2, ax0, ax1) = plt.subplots(1, 3, figsize=(66.69, 22.27))
269
- # fig, (ax2, ax0, ax1) = plt.subplots(1, 3)
270
- # fig.subplots_adjust(hspace=.4)
271
-
272
- fig = plt.figure()
273
- gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
274
- ax2 = fig.add_subplot(gs[0, 0])
275
- ax0 = fig.add_subplot(gs[0, 1])
276
- ax1 = fig.add_subplot(gs[1, 0])
277
-
278
- cmap = "viridis_r"
279
- sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
280
- ax2.set_xlabel(xlabel)
281
- ax2.set_ylabel(ylabel)
282
- ax2.set_title("cos similarity heatmap")
283
-
284
- fig.suptitle("alignment projection")
285
-
286
- _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
287
- _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
288
-
289
- df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
290
-
291
- # clustered
292
- df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
293
-
294
- # outliers
295
- df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
296
-
297
- # ax0.set_xlabel("")
298
- # ax0.set_ylabel("zh")
299
- ax0.set_xlabel(xlabel)
300
- ax0.set_ylabel(ylabel)
301
-
302
- ax0.set_xlim(0, len1)
303
- ax0.set_ylim(0, len2)
304
- ax0.set_title("max along columns ('x': outliers)")
305
-
306
- # ax1.set_xlabel("en")
307
- # ax1.set_ylabel("zh")
308
- ax1.set_xlabel(xlabel)
309
- ax1.set_ylabel(ylabel)
310
-
311
- ax1.set_xlim(0, len1)
312
- ax1.set_ylim(0, len2)
313
- ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
314
-
315
- # return df, plot_df(pd.DataFrame(cmat))
316
- # tset.plot.scatter("x", "y", c="cos", cmap="viridis_r")
317
- else:
318
- fig, ax1 = plt.subplots()
319
- df1 = pd.DataFrame(
320
- [
321
- [5.1, 3.5, 0],
322
- [4.9, 3.0, 0],
323
- [7.0, 3.2, 1],
324
- [6.4, 3.2, 1],
325
- [5.9, 3.0, 2],
326
- ],
327
- columns=["length", "width", "species"],
328
- )
329
- df1.plot.scatter(x="length", y="width", c="DarkBlue", ax=ax1)
330
- # plt_heatmap = plt
331
-
332
- # plt.scatter(df.length, df.width) # gradio eturn plt.gcf() or plt
333
 
334
- # return df, plt
335
- # return plt
336
- # return df, df
337
- # return df1.iloc[:10, :], plt
338
 
339
- # pd.concat([df0, pd.DataFrame([[".", ".", "..."]], columns=df0.columns)], ignore_index=1)
340
- # pd.concat([df0.iloc[:2, :], pd.DataFrame([[".", ".", "..."]], columns=df0.columns), df0.iloc[-1:, :]], ignore_index=1)
341
 
342
- # _ = pd.concat([df1.iloc[:4, :], pd.DataFrame([["...", "...", "...", ]], columns=df1.columns), df1.iloc[-2:, :]], ignore_index=True)
343
- # _ = pd.concat([df.iloc[:2, :], pd.DataFrame([[".", ".", "..."]], columns=df.columns), df.iloc[-1:, :]], ignore_index=1)
344
 
345
- _ = pd.concat(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  [
347
  df1.iloc[:4, :],
348
  pd.DataFrame(
@@ -359,10 +308,30 @@ if __name__ == "__main__":
359
  ignore_index=1,
360
  )
361
 
362
- return _, plt
363
- # return _, plt
 
 
 
 
 
 
 
 
364
 
365
- # """
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
  server_port = 7860
368
  with socket(AF_INET, SOCK_STREAM) as sock:
@@ -382,8 +351,8 @@ if __name__ == "__main__":
382
  ## NB
383
  * Click "Clear" first for subsequent submits when uploading files.
384
  * `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
385
- * Suggested `min_samples` and `esp` values -- `min_samples`: 4-8, `esp` (minimum epsilon): 8-12.
386
- - Smaller `min_samples` or larger `esp` will result in more aligned pairs but also more **false positives** (pairs falsely identified as candidates). On the other hand, larger `min_samples` or smaller `esp` values tend to miss 'good' pairs.
387
  * If you need to have a better look at the image, you can right-click on the image and select copy-image-address and open a new tab in the browser with the copied image address.
388
  * `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
389
  """
@@ -408,7 +377,7 @@ if __name__ == "__main__":
408
  inputs=inputs,
409
  outputs=outputs,
410
  title="radiobee-aligner🔠",
411
- description="showcasing a blazing fast dualtext aligner, currrently supported language pairs: en-zh/zh-en",
412
  article=article,
413
  examples=examples,
414
  # theme="darkgrass",
 
1
  """Run interactively."""
2
  from typing import Tuple # , Optional
3
 
4
+
5
+ from pathlib import Path
6
  import joblib
7
  from random import randint
8
  from textwrap import dedent
 
27
  from radiobee.files2df import files2df
28
  from radiobee.file2text import file2text
29
  from radiobee.lists2cmat import lists2cmat
30
+ from radiobee.gen_pset import gen_pset
31
+ from radiobee.gen_aset import gen_aset
32
+ from radiobee.align_texts import align_texts
33
 
34
  # from radiobee.plot_df import plot_df
35
  from radiobee.cmat2tset import cmat2tset
 
129
  gr.inputs.Slider(
130
  minimum=1,
131
  maximum=20,
132
+ step=0.1,
133
+ default=10,
 
134
  ),
135
  gr.inputs.Slider(
136
  minimum=1,
137
  maximum=20,
138
+ step=1,
139
+ default=6,
 
140
  ),
141
  ]
142
 
143
  # modi
144
  examples = [
145
+ ["data/test_zh.txt", "data/test_en.txt", "linear", "None", "None", "None", 10, 6, ],
146
+ ["data/test_en.txt", "data/test_zh.txt", "linear", "None", "None", "None", 10, 6, ],
147
+ ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", "linear", "None", "None", "None", 10, 6, ],
148
+ ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", "linear", "None", "None", "None", 10, 6, ],
149
+ ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", "linear", "None", "None", "None", 10, 6, ],
150
+ ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", "linear", "None", "None", "None", 10, 6, ],
151
  ]
152
  outputs = ["dataframe", "plot"]
153
  outputs = ["plot"]
154
  outputs = ["dataframe", "plot"]
155
+ out_df = gr.outputs.Dataframe(
156
  headers=None,
157
  max_rows=12, # 20
158
  max_cols=None,
 
160
  type="auto",
161
  label="To be aligned",
162
  )
163
+ out_df_aligned = gr.outputs.Dataframe(
164
+ headers=None,
165
+ # max_rows=12, # 20
166
+ max_cols=3,
167
+ overflow_row_behaviour="paginate",
168
+ type="auto",
169
+ label="aligned pairs",
170
+ )
171
+ out_file_dl = gr.outputs.File(
172
+ label="Click to download csv",
173
+ )
174
+ out_file_dl_excel = gr.outputs.File(
175
+ label="Click to download xlsx",
176
+ )
177
+
178
+ # modi outputs
179
  outputs = [
180
+ out_df,
181
  "plot",
182
+ out_file_dl,
183
+ out_file_dl_excel,
184
+ out_df_aligned,
185
  ]
186
  # outputs = ["dataframe", "plot", "plot"] # wont work
187
  # outputs = ["dataframe"]
 
196
  idf_type,
197
  dl_type,
198
  norm,
199
+ eps,
200
  min_samples,
 
201
  ):
202
  # modi fn
203
+ """Process inputs and return outputs."""
204
  logger.debug(" *debug* ")
205
 
206
+ # conver "None" to None for those Radio types
207
  for _ in [idf_type, dl_type, norm]:
208
  if _ in "None":
209
  _ = None
 
212
  logger.info("file1.name: *%s*, file2.name: *%s*", file1.name, file2.name)
213
 
214
  # bypass if file1 or file2 is str input
215
+ # if not (isinstance(file1, str) or isinstance(file2, str)):
216
+ text1 = file2text(file1)
217
+ text2 = file2text(file2)
218
+ lang1, _ = fastlid(text1)
219
+ lang2, _ = fastlid(text2)
220
+
221
+ df1 = files2df(file1, file2)
222
+
223
+ lst1 = [elm for elm in df1.text1 if elm]
224
+ lst2 = [elm for elm in df1.text2 if elm]
225
+ len1 = len(lst1)
226
+ len2 = len(lst2)
227
+
228
+ cmat = lists2cmat(
229
+ lst1,
230
+ lst2,
231
+ tf_type=tf_type,
232
+ idf_type=idf_type,
233
+ dl_type=dl_type,
234
+ norm=norm,
235
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
+ tset = pd.DataFrame(cmat2tset(cmat))
238
+ tset.columns = ["x", "y", "cos"]
 
 
239
 
240
+ df_ = tset
 
241
 
242
+ xlabel: str = lang1
243
+ ylabel: str = lang2
244
 
245
+ sns.set()
246
+ sns.set_style("darkgrid")
247
+
248
+ fig = plt.figure()
249
+ gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
250
+ ax2 = fig.add_subplot(gs[0, 0])
251
+ ax0 = fig.add_subplot(gs[0, 1])
252
+ ax1 = fig.add_subplot(gs[1, 0])
253
+
254
+ cmap = "viridis_r"
255
+ sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
256
+ ax2.set_xlabel(xlabel)
257
+ ax2.set_ylabel(ylabel)
258
+ ax2.set_title("cos similarity heatmap")
259
+
260
+ fig.suptitle("alignment projection")
261
+
262
+ _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
263
+ _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
264
+
265
+ df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
266
+
267
+ # clustered
268
+ df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
269
+
270
+ # outliers
271
+ df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
272
+
273
+ # ax0.set_xlabel("")
274
+ # ax0.set_ylabel("zh")
275
+ ax0.set_xlabel(xlabel)
276
+ ax0.set_ylabel(ylabel)
277
+
278
+ ax0.set_xlim(0, len1)
279
+ ax0.set_ylim(0, len2)
280
+ ax0.set_title("max along columns ('x': outliers)")
281
+
282
+ # ax1.set_xlabel("en")
283
+ # ax1.set_ylabel("zh")
284
+ ax1.set_xlabel(xlabel)
285
+ ax1.set_ylabel(ylabel)
286
+
287
+ ax1.set_xlim(0, len1)
288
+ ax1.set_ylim(0, len2)
289
+ ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
290
+
291
+ # return df, plot_df(pd.DataFrame(cmat))
292
+ # tset.plot.scatter("x", "y", c="cos", cmap="viridis_r")
293
+
294
+ df_trimmed = pd.concat(
295
  [
296
  df1.iloc[:4, :],
297
  pd.DataFrame(
 
308
  ignore_index=1,
309
  )
310
 
311
+ # process lst1, lst2 to obtained df_aligned
312
+ pset = gen_pset(
313
+ cmat,
314
+ eps=eps,
315
+ min_samples=min_samples,
316
+ delta=7,
317
+ )
318
+ src_len, tgt_len = cmat.shape
319
+ aset = gen_aset(pset, src_len, tgt_len)
320
+ final_list = align_texts(aset, lst2, lst1) # note the order
321
 
322
+ # df_aligned = df_trimmed
323
+ df_aligned = pd.DataFrame(final_list, columns=["text1", "text2", "likelihood"])
324
+
325
+ _ = df_aligned.to_csv(index=False)
326
+ file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
327
+ file_dl.write_text(_, encoding="utf8")
328
+
329
+ file_dl_xlsx = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.xlsx")
330
+ df_aligned.to_excel(file_dl_xlsx)
331
+
332
+ # return df_trimmed, plt
333
+ return df_trimmed, plt, file_dl, file_dl_xlsx, df_aligned
334
+ # modi outputs
335
 
336
  server_port = 7860
337
  with socket(AF_INET, SOCK_STREAM) as sock:
 
351
  ## NB
352
  * Click "Clear" first for subsequent submits when uploading files.
353
  * `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
354
+ * Suggested `esp` and `min_samples` values -- `esp` (minimum epsilon): 8-12, `min_samples`: 4-8.
355
+ - Larger `esp` or smaller `min_samples` will result in more aligned pairs but also more **false positives** (pairs falsely identified as candidates). On the other hand, smaller `esp` or larger `min_samples` values tend to miss 'good' pairs.
356
  * If you need to have a better look at the image, you can right-click on the image and select copy-image-address and open a new tab in the browser with the copied image address.
357
  * `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
358
  """
 
377
  inputs=inputs,
378
  outputs=outputs,
379
  title="radiobee-aligner🔠",
380
+ description="WIP showcasing a blazing fast dualtext aligner, currently supported language pairs: en-zh/zh-en",
381
  article=article,
382
  examples=examples,
383
  # theme="darkgrass",
radiobee/align_texts.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Align texts based on aset, src_text, tgt_text."""
2
+ from typing import List, Tuple, Union
3
+ from logzero import logger
4
+
5
+
6
+ # fmt: off
7
+ def align_texts(
8
+ aset: List[Tuple[Union[str, float], Union[str, float], Union[str, float]]],
9
+ src_text: List[str],
10
+ tgt_text: List[str],
11
+ ) -> List[Tuple[Union[str], Union[str], Union[str, float]]]:
12
+ # fmt: on
13
+ """Align texts (paras/sents) based on aset, src_text, tgt_text.
14
+
15
+ Args:
16
+ aset: align set
17
+ src_text: source text
18
+ tgt_text: target text
19
+
20
+ Returns:
21
+ aligned texts with possible mertics
22
+ """
23
+ xset, yset, metrics = zip(*aset) # unzip aset
24
+ xset = [elm for elm in xset if elm != ""]
25
+ yset = [elm for elm in yset if elm != ""]
26
+
27
+ if (len(xset), len(yset)) != (len(tgt_text), len(src_text)):
28
+ logger.warning(
29
+ " (%s, %s) != (%s, %s) ", len(xset), len(yset), len(tgt_text), len(src_text)
30
+ )
31
+ # raise Exception(" See previous message")
32
+
33
+ texts = []
34
+ for elm in aset:
35
+ elm0, elm1, elm2 = elm
36
+ _ = []
37
+
38
+ # src_text first
39
+ if isinstance(elm1, str):
40
+ _.append("")
41
+ else:
42
+ _.append(src_text[int(elm1)])
43
+
44
+ if isinstance(elm0, str):
45
+ _.append("")
46
+ else:
47
+ _.append(tgt_text[int(elm0)])
48
+
49
+ if isinstance(elm2, str):
50
+ _.append("")
51
+ else:
52
+ _.append(round(elm2, 2))
53
+
54
+ texts.append(tuple(_))
55
+
56
+ # return [("", "", 0.)]
57
+ return texts
radiobee/cmat2tset.py CHANGED
@@ -46,11 +46,11 @@ def cmat2tset(
46
  """
47
  low_ = cmat.min() - 1
48
  argmax_max = []
49
- src_len, tgt_len = cmat.shape
50
  for _ in range(min(src_len, tgt_len)):
51
  argmax = int(cmat.argmax())
52
  row, col = divmod(argmax, tgt_len)
53
- argmax_max.append([col, row, cmat.max()])
54
 
55
  # erase row-th row and col-th col of cmat
56
  cmat[row, :] = low_
 
46
  """
47
  low_ = cmat.min() - 1
48
  argmax_max = []
49
+ src_len, tgt_len = cmat.shape # ylim, xlim
50
  for _ in range(min(src_len, tgt_len)):
51
  argmax = int(cmat.argmax())
52
  row, col = divmod(argmax, tgt_len)
53
+ argmax_max.append([col, row, cmat.max()]) # x-axis, y-axis
54
 
55
  # erase row-th row and col-th col of cmat
56
  cmat[row, :] = low_
radiobee/gen_aset.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Genereat align set (aset) based on pset (pair set), src_lang and tgt_len."""
2
+ from typing import List, Tuple, Union
3
+ from itertools import zip_longest
4
+
5
+ # from logzero import logger
6
+
7
+
8
+ # fmt: off
9
+ def gen_aset(
10
+ pset: List[Tuple[int, int, float]],
11
+ src_len: int, # n_rows
12
+ tgt_len: int, # n_cols
13
+ ) -> List[Tuple[Union[str, float], Union[str, float], Union[str, float]]]:
14
+ # fmt: on
15
+ """Genereat align set (aset) based on pset, src_lang and tgt_len.
16
+
17
+ src_len, tgt_len = cmat.shape
18
+ zip_longest(..., fillvalue="")
19
+
20
+ Args:
21
+ pset: [x(lang2 zh), y(lang1 en), cos]
22
+ src_len: lang1 (en)
23
+ tgt_len: lang2 (zh)
24
+
25
+ Returns:
26
+ aset:
27
+ [0...tgt_len, 0...src_len]
28
+ [0, 0, .]
29
+ ...
30
+ [tgt_len-1, src_len-1, .]
31
+ """
32
+ # empty pset []
33
+ if not pset:
34
+ return [*zip_longest(range(tgt_len), range(src_len), fillvalue="")]
35
+ # empty [[]]
36
+ if len(pset) == 1:
37
+ if not pset[0]:
38
+ return [*zip_longest(range(tgt_len), range(src_len), fillvalue="")]
39
+
40
+ buff = []
41
+ pos0, pos1 = -1, -1
42
+ for elm in pset:
43
+ # elm0, elm1, elm2 = elm
44
+ elm0, elm1, *elm2 = elm
45
+ elm0 = int(elm0)
46
+ elm1 = int(elm1)
47
+ interval = max(elm0 - pos0 - 1, elm1 - pos1 - 1)
48
+ _ = zip_longest(range(pos0 + 1, elm0), range(pos1 + 1, elm1), [""] * interval, fillvalue="")
49
+ buff.extend(_)
50
+ buff.append(elm)
51
+ pos0, pos1 = elm0, elm1
52
+
53
+ # last batch if any
54
+ elm0, elm1 = tgt_len, src_len
55
+ interval = max(elm0 - pos0 - 1, elm1 - pos1 - 1)
56
+ _ = zip_longest(range(pos0 + 1, elm0), range(pos1 + 1, elm1), [""] * interval, fillvalue="")
57
+ buff.extend(_)
58
+
59
+ return buff
radiobee/gen_eps_minsamples.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gen suggested eps min_samples."""
2
+
3
+
4
+ def gen_eps_minsamples(src_len: int, tgt_len: int) -> dict:
5
+ """Gen suggested eps min_samples."""
6
+ eps = src_len * 0.01
7
+ if eps < 3:
8
+ eps = 3
9
+
10
+ min_samples = tgt_len / 100 * 0.5
11
+ if min_samples < 3:
12
+ min_samples = 3
13
+ return {"eps": eps, "min_samples": min_samples}
radiobee/gen_pset.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gne pset from cmat. Find pairs for a given cmat.
2
+
3
+ tinybee.find_pairs.py with fixed estimator='dbscan' eps=eps, min_samples=min_samples
4
+ """
5
+ from typing import List, Tuple, Union
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.cluster import DBSCAN
10
+ import logzero
11
+ from logzero import logger
12
+ from radiobee.cmat2tset import cmat2tset
13
+ from radiobee.interpolate_pset import interpolate_pset
14
+
15
+
16
+ def gen_pset(
17
+ cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
18
+ eps: float = 10,
19
+ min_samples: int = 6,
20
+ delta: float = 7,
21
+ verbose: Union[bool, int] = False,
22
+ ) -> List[Tuple[int, int, Union[float, str]]]:
23
+ """Gen pset from cmat.
24
+ Find pairs for a given cmat.
25
+
26
+ Args:
27
+ cmat: correlation/similarity matrix
28
+ eps: min epsilon for DBSCAN (10)
29
+ min_samples: minimum # of samples for DBSCAN (6)
30
+ delta: tolerance (7)
31
+
32
+ Returns:
33
+ pairs + "" or metric (float)
34
+
35
+ dbscan_pairs' setup
36
+ if eps is None:
37
+ eps = src_len * .01
38
+ if eps < 3:
39
+ eps = 3
40
+ if min_samples is None:
41
+ min_samples = tgt_len / 100 * 0.5
42
+ if min_samples < 3:
43
+ min_samples = 3
44
+
45
+ def gen_eps_minsamples(src_len, tgt_len):
46
+ eps = src_len * .01
47
+ if eps < 3:
48
+ eps = 3
49
+
50
+ min_samples = tgt_len / 100 * 0.5
51
+ if min_samples < 3:
52
+ min_samples = 3
53
+ return {"eps": eps, "min_samples": min_samples}
54
+
55
+ """
56
+ if isinstance(verbose, bool):
57
+ if verbose:
58
+ verbose = 10
59
+ else:
60
+ verbose = 20
61
+ logzero.loglevel(verbose)
62
+
63
+ # if isinstance(cmat, list):
64
+ cmat = np.array(cmat1)
65
+
66
+ src_len, tgt_len = cmat.shape
67
+
68
+ # tset = cmat2tset(cmat)
69
+ tset = cmat2tset(cmat).tolist()
70
+
71
+ logger.debug("tset: %s", tset)
72
+
73
+ # iset = gen_iset(cmat, verbose=verbose, estimator=estimator)
74
+ labels = DBSCAN(eps=eps, min_samples=min_samples).fit(tset).labels_
75
+
76
+ df_tset = pd.DataFrame(tset, columns=["x", "y", "cos"])
77
+ cset = df_tset[labels > -1].to_numpy()
78
+
79
+ # sort cset
80
+ _ = sorted(cset.tolist(), key=lambda x: x[0])
81
+ iset = interpolate_pset(_, tgt_len)
82
+
83
+ # *_, ymax = zip(*tset)
84
+ # ymax = list(ymax)
85
+ # low_ = np.min(ymax) - 1 # reset to minimum_value - 1
86
+
87
+ buff = [(-1, -1, ""), (tgt_len, src_len, "")]
88
+ # for _ in range(tgt_len):
89
+ for idx, tset_elm in enumerate(tset):
90
+ logger.debug("buff: %s", buff)
91
+ # postion max in ymax and insert in buff
92
+ # if with range given by iset+-delta and
93
+ # it's valid (do not exceed constraint
94
+ # by neighboring points
95
+
96
+ # argmax = int(np.argmax(ymax))
97
+
98
+ # logger.debug("=== %s,%s === %s", _, argmax, tset[_])
99
+ logger.debug("=== %s === %s", _, tset_elm)
100
+
101
+ # ymax[_] = low_
102
+ # elm = tset[argmax]
103
+ # elm0, *_ = elm
104
+
105
+ elm0, *_ = tset_elm
106
+
107
+ # position elm in buff
108
+ idx = -1 # for making pyright happy
109
+ for idx, loc in enumerate(buff):
110
+ if loc[0] > elm0:
111
+ break
112
+ else:
113
+ idx += 1 # last
114
+
115
+ # insert elm in for valid elm
116
+ # (within range inside two neighboring points)
117
+
118
+ # pos = int(tset[argmax][0])
119
+ pos = int(tset_elm[0])
120
+ logger.debug(" %s <=> %s ", tset_elm, iset[pos])
121
+
122
+ # if abs(tset[argmax][1] - iset[pos][1]) <= delta:
123
+ if abs(tset_elm[1] - iset[pos][1]) <= delta:
124
+ if tset_elm[1] > buff[idx - 1][1] and tset_elm[1] < buff[idx][1]:
125
+ buff.insert(idx, tset_elm)
126
+ logger.debug("idx: %s, tset_elm: %s", idx, tset_elm)
127
+ else:
128
+ logger.debug("\t***\t idx: %s, tset_elm: %s", idx, tset_elm)
129
+ _ = """
130
+ if abs(tset[loc][1] - iset[loc][1]) <= delta:
131
+ if tset[loc][1] > buff[idx][1] and tset[loc][1] < buff[idx + 1][1]:
132
+ buff.insert(idx + 1, tset[loc])
133
+ # """
134
+
135
+ # remove first and last entry in buff
136
+ buff.pop(0)
137
+ buff.pop()
138
+
139
+ # return [(1, 1, "")]
140
+ return [(int(elm0), int(elm1), elm2) for elm0, elm1, elm2 in buff]
radiobee/gen_row_alignment.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gen proper alignment for a given triple_set.
2
+
3
+ cmat = fetch_sent_corr(src, tgt)
4
+ src_len, tgt_len = np.array(cmat).shape
5
+ r_ali = gen_row_alignment(cmat, tgt_len, src_len) # note the order
6
+ src[r_ali[1]], tgt[r_ali[0]], r_ali[2]
7
+
8
+ or !!! (targer, source)
9
+ cmat = fetch_sent_corr(tgt, src) # note the order
10
+ src_len, tgt_len = np.array(cmat).shape
11
+ r_ali = gen_row_alignment(cmat, src_len, tgt_len)
12
+ src[r_ali[0]], tgt[r_ali[1]], r_ali[2]
13
+
14
+ ---
15
+ src_txt = 'data/wu_ch2_en.txt'
16
+ tgt_txt = 'data/wu_ch2_zh.txt'
17
+
18
+ assert Path(src_txt).exists()
19
+ assert Path(tgt_txt).exists()
20
+
21
+ src_text, _ = load_paras(src_txt)
22
+ tgt_text, _ = load_paras(tgt_txt)
23
+
24
+ cos_matrix = gen_cos_matrix(src_text, tgt_text)
25
+ t_set, m_matrix = find_aligned_pairs(cos_matrix0, thr=0.4, matrix=True)
26
+
27
+ resu = gen_row_alignment(t_set, src_len, tgt_len)
28
+ resu = np.array(resu)
29
+
30
+ idx = -1
31
+ idx += 1; (resu[idx], src_text[int(resu[idx, 0])],
32
+ tgt_text[int(resu[idx, 1])]) if all(resu[idx]) else resu[idx]
33
+
34
+ idx += 1; i0, i1, i2 = resu[idx]; '***' if i0 == ''
35
+ else src_text[int(i0)], '***' if i1 == '' else tgt_text[int(i1)], ''
36
+ if i2 == '' else i2
37
+ """
38
+ # pylint: disable=line-too-long
39
+ from typing import List, Union
40
+
41
+ # natural extrapolation with slope equal to 1
42
+ from itertools import zip_longest as zip_longest_middle
43
+
44
+ import numpy as np
45
+
46
+ from logzero import logger
47
+
48
+ # from tinybee.zip_longest_middle import zip_longest_middle
49
+
50
+ # from tinybee.zip_longest_middle import zip_longest_middle
51
+ # from tinybee.find_pairs import find_pairs
52
+
53
+ # logger = logging.getLogger(__name__)
54
+ # logger.addHandler(logging.NullHandler())
55
+
56
+
57
+ def gen_row_alignment( # pylint: disable=too-many-locals
58
+ t_set,
59
+ src_len,
60
+ tgt_len,
61
+ # ) -> List[Tuple[Union[str, int], Union[str, int], Union[str, float]]]:
62
+ ) -> List[List[Union[str, float]]]:
63
+ """Gen proper rows for given triple_set.
64
+
65
+ Arguments:
66
+ [t_set {np.array or list}] -- [nll matrix]
67
+ [src_len {int}] -- numb of source texts (para/sents)
68
+ [tgt_len {int}] -- numb of target texts (para/sents)
69
+
70
+ Returns:
71
+ [np.array] -- [proper rows]
72
+ """
73
+ t_set = np.array(t_set, dtype="object")
74
+
75
+ # len0 = src_len
76
+
77
+ # len1 tgt text length, must be provided
78
+ len1 = tgt_len
79
+
80
+ # rearrange t_set as buff in increasing order
81
+ buff = [[-1, -1, ""]] #
82
+ idx_t = 0
83
+ # for elm in t_set:
84
+ # start with bigger value from the 3rd col
85
+
86
+ y00, yargmax, ymax = zip(*t_set)
87
+ ymax_ = np.array(ymax).copy()
88
+ reset_v = np.min(ymax_) - 1
89
+ for count in range(tgt_len):
90
+ argmax = np.argmax(ymax_)
91
+ # reset
92
+ ymax_[argmax] = reset_v
93
+ idx_t = argmax
94
+ elm = t_set[idx_t]
95
+ logger.debug("%s: %s, %s", count, idx_t, elm)
96
+
97
+ # find loc to insert
98
+ elm0, elm1, elm2 = elm
99
+ idx = -1
100
+ for idx, loc in enumerate(buff):
101
+ if loc[0] > elm0:
102
+ break
103
+ else:
104
+ idx += 1 # last
105
+
106
+ # make sure elm1 is within the range
107
+ # prev elm1 < elm1 < next elm1
108
+ if elm1 > buff[idx - 1][1]:
109
+ try: # overflow possible (idx + 1 in # last)
110
+ next_elm = buff[idx][1]
111
+ except IndexError:
112
+ next_elm = len1
113
+ if elm1 < next_elm:
114
+ # insert '' if necessary
115
+ # using zip_longest_middle
116
+ buff.insert(
117
+ idx, [elm0, elm1, elm2],
118
+ )
119
+ # logger.debug('---')
120
+
121
+ idx_t += 1
122
+ # if idx_t == 24: # 20:
123
+ # break
124
+
125
+ # remove [-1, -1]
126
+ # buff.pop(0)
127
+ # buff = np.array(buff, dtype='object')
128
+
129
+ # take care of the tail
130
+ buff += [[src_len, tgt_len, ""]]
131
+
132
+ resu = []
133
+ # merit = []
134
+
135
+ for idx, elm in enumerate(buff[1:]):
136
+ idx1 = idx + 1
137
+ elm0_, elm1_, elm2_ = buff[idx1 - 1] # idx starts from 0
138
+ elm0, elm1, elm2 = elm
139
+ del elm2_, elm2
140
+
141
+ tmp0 = zip_longest_middle(
142
+ list(range(elm0_ + 1, elm0)), list(range(elm1_ + 1, elm1)), fillvalue="",
143
+ )
144
+ # convet to list entries & attache merit
145
+ tmp = [list(t_elm) + [""] for t_elm in tmp0]
146
+
147
+ # update resu
148
+ resu += tmp + [buff[idx1]]
149
+
150
+ # remove the last entry
151
+ return resu[:-1]
radiobee/interpolate_pset.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Interpolate np.nan."""
2
+ from typing import List, Tuple
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+
7
+ # fmt: off
8
+ def interpolate_pset(
9
+ pairs: List[Tuple[int, int, float]],
10
+ tgt_len: int,
11
+ method: str = 'linear',
12
+ limit_direction: str = 'both',
13
+ ) -> List[Tuple[int, int]]:
14
+ # fmt: on
15
+ """Interpolate.
16
+
17
+ Args:
18
+ pairs: integer pairs, some np.nan
19
+ tgt_len: over 0...tgt_len-1 (x-axis, cmat.shape[1])
20
+ method: for use in pd.DataFrame.interpolate
21
+ limit_direction: for use in pd.DataFrame.interpolate
22
+ Returns:
23
+ np.nan converted
24
+ """
25
+ y00, *_ = zip(*pairs)
26
+
27
+ res = []
28
+ for idx in range(tgt_len):
29
+ if idx in y00:
30
+ loc = y00.index(idx)
31
+ res.append(tuple(pairs[loc][:2]))
32
+ else:
33
+ res.append((idx, np.nan))
34
+
35
+ df = pd.DataFrame(res, columns=["y00", "yargmax"])
36
+ _ = df.interpolate(method=method, limit_direction=limit_direction, axis=0)
37
+
38
+ _ = _.to_numpy(dtype=int)
39
+ _ = [(int(elm0), int(elm1)) for elm0, elm1 in _]
40
+
41
+ return _
requirements.txt CHANGED
@@ -12,5 +12,6 @@ seaborn
12
  cchardet
13
  tabulate
14
  git+https://github.com/ffreemt/fast-langid
15
- # dotenv
16
  varname
 
 
12
  cchardet
13
  tabulate
14
  git+https://github.com/ffreemt/fast-langid
15
+ # python-dotenv
16
  varname
17
+ openpyxl