freemt committed on
Commit
d5ff673
1 Parent(s): 077e1eb

Update plot_mat

Browse files
data/ps-cn.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/ps-en.txt ADDED
The diff for this file is too large to render. See raw diff
 
package.json CHANGED
@@ -8,7 +8,7 @@
8
  },
9
  "scripts": {
10
  "pyright": "pyright",
11
- "flake8": "flake8.bat",
12
  "test": "echo \"Error: no test specified\" && exit 1"
13
  },
14
  "repository": {
 
8
  },
9
  "scripts": {
10
  "pyright": "pyright",
11
+ "flake8": "flake8",
12
  "test": "echo \"Error: no test specified\" && exit 1"
13
  },
14
  "repository": {
radiobee/__main__.py CHANGED
@@ -10,7 +10,7 @@ from textwrap import dedent
10
  from itertools import zip_longest
11
  from socket import socket, AF_INET, SOCK_STREAM
12
 
13
- from sklearn.cluster import DBSCAN
14
  import joblib
15
  from varname import nameof
16
  from logzero import logger
@@ -18,7 +18,8 @@ from logzero import logger
18
  # import numpy as np
19
  import pandas as pd
20
  import seaborn as sns
21
- import matplotlib.pyplot as plt
 
22
 
23
  # from tabulate import tabulate
24
  from fastlid import fastlid
@@ -34,9 +35,12 @@ from radiobee.gen_pset import gen_pset
34
  from radiobee.gen_aset import gen_aset
35
  from radiobee.align_texts import align_texts
36
 
37
- # from radiobee.plot_df import plot_df
38
  from radiobee.cmat2tset import cmat2tset
39
 
 
 
 
 
40
  sns.set()
41
  sns.set_style("darkgrid")
42
  fastlid.set_languages = ["en", "zh"]
@@ -91,8 +95,12 @@ if __name__ == "__main__":
91
  ]
92
  outputs = ["dataframe"]
93
  # """
94
- # import logzero
95
- # logzero.loglevel(10)
 
 
 
 
96
  logger.debug(" debug ")
97
  logger.info(" info ")
98
 
@@ -119,9 +127,15 @@ if __name__ == "__main__":
119
  x min_df: int | float = 1
120
  x max_df: int | float = 1.0
121
  # """
122
- input_tf_type = gr.inputs.Dropdown(["linear", "sqrt", "log", "binary"], default="linear")
123
- input_idf_type = gr.inputs.Radio(["None", "standard", "smooth", "bm25"], default="None") # need to convert "None" this to None in fn
124
- input_dl_type = gr.inputs.Radio(["None", "linear", "sqrt", "log"], default="None") # ditto
 
 
 
 
 
 
125
  input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None") # ditto
126
 
127
  inputs = [
@@ -147,12 +161,76 @@ if __name__ == "__main__":
147
 
148
  # modi
149
  examples = [
150
- ["data/test_zh.txt", "data/test_en.txt", "linear", "None", "None", "None", 10, 6, ],
151
- ["data/test_en.txt", "data/test_zh.txt", "linear", "None", "None", "None", 10, 6, ],
152
- ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", "linear", "None", "None", "None", 10, 6, ],
153
- ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", "linear", "None", "None", "None", 10, 6, ],
154
- ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", "linear", "None", "None", "None", 10, 6, ],
155
- ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", "linear", "None", "None", "None", 10, 6, ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  ]
157
  outputs = ["dataframe", "plot"]
158
  outputs = ["plot"]
@@ -227,8 +305,8 @@ if __name__ == "__main__":
227
 
228
  lst1 = [elm for elm in df1.text1 if elm]
229
  lst2 = [elm for elm in df1.text2 if elm]
230
- len1 = len(lst1)
231
- len2 = len(lst2)
232
 
233
  cmat = lists2cmat(
234
  lst1,
@@ -242,66 +320,8 @@ if __name__ == "__main__":
242
  tset = pd.DataFrame(cmat2tset(cmat))
243
  tset.columns = ["x", "y", "cos"]
244
 
245
- df_ = tset
246
-
247
- xlabel: str = lang1
248
- ylabel: str = lang2
249
-
250
- sns.set()
251
- sns.set_style("darkgrid")
252
-
253
- # close all existing figures, necesssary for hf spaces
254
- plt.close("all")
255
- # if sys.platform not in ["win32", "linux"]:
256
- plt.switch_backend('Agg') # to cater for Mac, thanks to WhiteFox
257
-
258
- fig = plt.figure()
259
- gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
260
- ax2 = fig.add_subplot(gs[0, 0])
261
- ax0 = fig.add_subplot(gs[0, 1])
262
- ax1 = fig.add_subplot(gs[1, 0])
263
-
264
- cmap = "viridis_r"
265
- sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
266
- ax2.set_xlabel(xlabel)
267
- ax2.set_ylabel(ylabel)
268
- ax2.set_title("cos similarity heatmap")
269
-
270
- fig.suptitle("alignment projection")
271
-
272
- _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
273
- # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
274
- _x = ~_
275
-
276
- df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
277
-
278
- # clustered
279
- df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
280
-
281
- # outliers
282
- df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
283
-
284
- # ax0.set_xlabel("")
285
- # ax0.set_ylabel("zh")
286
- ax0.set_xlabel(xlabel)
287
- ax0.set_ylabel(ylabel)
288
-
289
- ax0.set_xlim(0, len1)
290
- ax0.set_ylim(0, len2)
291
- ax0.set_title("max along columns ('x': outliers)")
292
-
293
- # ax1.set_xlabel("en")
294
- # ax1.set_ylabel("zh")
295
- ax1.set_xlabel(xlabel)
296
- ax1.set_ylabel(ylabel)
297
-
298
- ax1.set_xlim(0, len1)
299
- ax1.set_ylim(0, len2)
300
- ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
301
-
302
- # return df, plot_df(pd.DataFrame(cmat))
303
- # tset.plot.scatter("x", "y", c="cos", cmap="viridis_r")
304
-
305
  df_trimmed = pd.concat(
306
  [
307
  df1.iloc[:4, :],
@@ -318,12 +338,13 @@ if __name__ == "__main__":
318
  ],
319
  ignore_index=1,
320
  )
 
321
 
322
  # process lst1, lst2 to obtained df_aligned
323
  # quick fix ValueError: not enough values to unpack (expected at least 1, got 0)
324
  # fixed in gen_pet, but we leave the loop here
325
  for min_s in range(min_samples):
326
- logger.info(" min_samples, try %s", min_samples - min_s)
327
  try:
328
  pset = gen_pset(
329
  cmat,
@@ -342,6 +363,89 @@ if __name__ == "__main__":
342
  # break should happen above when min_samples = 2
343
  raise Exception("bummer, this shouldn't happen, probably another bug")
344
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  src_len, tgt_len = cmat.shape
346
  aset = gen_aset(pset, src_len, tgt_len)
347
  final_list = align_texts(aset, lst2, lst1) # note the order
@@ -359,7 +463,9 @@ if __name__ == "__main__":
359
 
360
  # file_dl.write_text(_, encoding="gb2312") # no go
361
 
362
- file_dl_xlsx = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.xlsx")
 
 
363
  df_aligned.to_excel(file_dl_xlsx)
364
 
365
  # return df_trimmed, plt
@@ -395,10 +501,13 @@ if __name__ == "__main__":
395
  * `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
396
  """
397
  )
398
- css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
399
- css = ".output_image, .input_image {height: 20rem !important; width: 100% !important;}"
400
- css_file = (
401
- ".input_file, .output_file {height: 9rem !important; width: 100% !important;}"
 
 
 
402
  )
403
 
404
  logger.info("running at port %s", server_port)
@@ -424,15 +533,21 @@ if __name__ == "__main__":
424
  # height=150, # 500
425
  width=900, # 900
426
  allow_flagging=True,
427
- flagging_options=["fatal", "bug", "brainstorm", "excelsior", ], # "paragon"],
428
- css=f"{css} {css_file}",
 
 
 
 
 
429
  )
430
 
431
  iface.launch(
432
- # share=False,
433
- share=True,
434
- debug=True,
435
- server_name="0.0.0.0",
 
436
  server_port=server_port,
437
  # show_tips=True,
438
  enable_queue=True,
 
10
  from itertools import zip_longest
11
  from socket import socket, AF_INET, SOCK_STREAM
12
 
13
+ from sklearn.cluster import DBSCAN # noqa
14
  import joblib
15
  from varname import nameof
16
  from logzero import logger
 
18
  # import numpy as np
19
  import pandas as pd
20
  import seaborn as sns
21
+
22
+ import matplotlib.pyplot as plt # noqa
23
 
24
  # from tabulate import tabulate
25
  from fastlid import fastlid
 
35
  from radiobee.gen_aset import gen_aset
36
  from radiobee.align_texts import align_texts
37
 
 
38
  from radiobee.cmat2tset import cmat2tset
39
 
40
+ # from radiobee.plot_df import plot_df
41
+ # from radiobee.plot_cmat import plot_cmat
42
+ from radiobee.trim_df import trim_df
43
+
44
  sns.set()
45
  sns.set_style("darkgrid")
46
  fastlid.set_languages = ["en", "zh"]
 
95
  ]
96
  outputs = ["dataframe"]
97
  # """
98
+ import logzero
99
+
100
+ # debug = True
101
+ debug = False
102
+ if debug:
103
+ logzero.loglevel(10)
104
  logger.debug(" debug ")
105
  logger.info(" info ")
106
 
 
127
  x min_df: int | float = 1
128
  x max_df: int | float = 1.0
129
  # """
130
+ input_tf_type = gr.inputs.Dropdown(
131
+ ["linear", "sqrt", "log", "binary"], default="linear"
132
+ )
133
+ input_idf_type = gr.inputs.Radio(
134
+ ["None", "standard", "smooth", "bm25"], default="None"
135
+ ) # need to convert "None" this to None in fn
136
+ input_dl_type = gr.inputs.Radio(
137
+ ["None", "linear", "sqrt", "log"], default="None"
138
+ ) # ditto
139
  input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None") # ditto
140
 
141
  inputs = [
 
161
 
162
  # modi
163
  examples = [
164
+ [
165
+ "data/test_zh.txt",
166
+ "data/test_en.txt",
167
+ "linear",
168
+ "None",
169
+ "None",
170
+ "None",
171
+ 10,
172
+ 6,
173
+ ],
174
+ [
175
+ "data/test_en.txt",
176
+ "data/test_zh.txt",
177
+ "linear",
178
+ "None",
179
+ "None",
180
+ "None",
181
+ 10,
182
+ 6,
183
+ ],
184
+ [
185
+ "data/shakespeare_zh500.txt",
186
+ "data/shakespeare_en500.txt",
187
+ "linear",
188
+ "None",
189
+ "None",
190
+ "None",
191
+ 10,
192
+ 6,
193
+ ],
194
+ [
195
+ "data/shakespeare_en500.txt",
196
+ "data/shakespeare_zh500.txt",
197
+ "linear",
198
+ "None",
199
+ "None",
200
+ "None",
201
+ 10,
202
+ 6,
203
+ ],
204
+ [
205
+ "data/hlm-ch1-zh.txt",
206
+ "data/hlm-ch1-en.txt",
207
+ "linear",
208
+ "None",
209
+ "None",
210
+ "None",
211
+ 10,
212
+ 6,
213
+ ],
214
+ [
215
+ "data/hlm-ch1-en.txt",
216
+ "data/hlm-ch1-zh.txt",
217
+ "linear",
218
+ "None",
219
+ "None",
220
+ "None",
221
+ 10,
222
+ 6,
223
+ ],
224
+ [
225
+ "data/ps-cn.txt",
226
+ "data/ps-en.txt",
227
+ "linear",
228
+ "None",
229
+ "None",
230
+ "None",
231
+ 10,
232
+ 4,
233
+ ],
234
  ]
235
  outputs = ["dataframe", "plot"]
236
  outputs = ["plot"]
 
305
 
306
  lst1 = [elm for elm in df1.text1 if elm]
307
  lst2 = [elm for elm in df1.text2 if elm]
308
+ # len1 = len(lst1) # noqa
309
+ # len2 = len(lst2) # noqa
310
 
311
  cmat = lists2cmat(
312
  lst1,
 
320
  tset = pd.DataFrame(cmat2tset(cmat))
321
  tset.columns = ["x", "y", "cos"]
322
 
323
+ df_trimmed = trim_df(df1)
324
+ _ = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  df_trimmed = pd.concat(
326
  [
327
  df1.iloc[:4, :],
 
338
  ],
339
  ignore_index=1,
340
  )
341
+ # """
342
 
343
  # process lst1, lst2 to obtained df_aligned
344
  # quick fix ValueError: not enough values to unpack (expected at least 1, got 0)
345
  # fixed in gen_pet, but we leave the loop here
346
  for min_s in range(min_samples):
347
+ logger.info(" min_samples, using %s", min_samples - min_s)
348
  try:
349
  pset = gen_pset(
350
  cmat,
 
363
  # break should happen above when min_samples = 2
364
  raise Exception("bummer, this shouldn't happen, probably another bug")
365
 
366
+ min_samples = gen_pset.min_samples
367
+
368
+ # will result in error message:
369
+ # UserWarning: Starting a Matplotlib GUI outside of
370
+ # the main thread will likely fail."
371
+ _ = """
372
+ plot_cmat(
373
+ cmat,
374
+ eps=eps,
375
+ min_samples=min_samples,
376
+ xlabel=lang1,
377
+ ylabel=lang2,
378
+ )
379
+ # """
380
+
381
+ # move plot_cmat's code to the main thread here
382
+ # to make it work
383
+ xlabel = lang1
384
+ ylabel = lang2
385
+
386
+ len1, len2 = cmat.shape
387
+ ylim, xlim = len1, len2
388
+
389
+ # does not seem to show up
390
+ logger.debug(" len1 (ylim): %s, len2 (xlim): %s", len1, len2)
391
+ if debug:
392
+ print(f" len1 (ylim): {len1}, len2 (xlim): {len2}")
393
+
394
+ df_ = pd.DataFrame(cmat2tset(cmat))
395
+ df_.columns = ["x", "y", "cos"]
396
+
397
+ sns.set()
398
+ sns.set_style("darkgrid")
399
+
400
+ # close all existing figures, necessary for hf spaces
401
+ plt.close("all")
402
+ # if sys.platform not in ["win32", "linux"]:
403
+ plt.switch_backend('Agg') # to cater for Mac, thanks to WhiteFox
404
+
405
+ # figsize=(13, 8), (339, 212) mm on '1280x800+0+0'
406
+ fig = plt.figure(figsize=(13, 8))
407
+
408
+ # gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
409
+ gs = fig.add_gridspec(1, 2, wspace=0.4, hspace=0.58)
410
+ ax_heatmap = fig.add_subplot(gs[0, 0]) # ax2
411
+ ax0 = fig.add_subplot(gs[0, 1])
412
+ # ax1 = fig.add_subplot(gs[1, 0])
413
+
414
+ cmap = "viridis_r"
415
+ sns.heatmap(cmat, cmap=cmap, ax=ax_heatmap).invert_yaxis()
416
+ ax_heatmap.set_xlabel(xlabel)
417
+ ax_heatmap.set_ylabel(ylabel)
418
+ ax_heatmap.set_title("cos similarity heatmap")
419
+
420
+ fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")
421
+
422
+ _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
423
+ # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
424
+ _x = ~_
425
+
426
+ # max cos along columns
427
+ df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
428
+
429
+ # outliers
430
+ df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
431
+ ax0.set_xlabel(xlabel)
432
+ ax0.set_ylabel(ylabel)
433
+ ax0.set_xlim(xmin=0, xmax=xlim)
434
+ ax0.set_ylim(ymin=0, ymax=ylim)
435
+ ax0.set_title(
436
+ "max along columns ('x': outliers)\n"
437
+ "potential aligned pairs (green line)\n"
438
+ f"({round(sum(_) / xlim, 2):.0%})"
439
+ )
440
+
441
+ # clustered
442
+ # df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
443
+ # ax1.set_xlabel(xlabel)
444
+ # ax1.set_ylabel(ylabel)
445
+ # ax1.set_xlim(0, len1)
446
+ # ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
447
+ # end of plot_cmat
448
+
449
  src_len, tgt_len = cmat.shape
450
  aset = gen_aset(pset, src_len, tgt_len)
451
  final_list = align_texts(aset, lst2, lst1) # note the order
 
463
 
464
  # file_dl.write_text(_, encoding="gb2312") # no go
465
 
466
+ file_dl_xlsx = Path(
467
+ f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.xlsx"
468
+ )
469
  df_aligned.to_excel(file_dl_xlsx)
470
 
471
  # return df_trimmed, plt
 
501
  * `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
502
  """
503
  )
504
+ css_image = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
505
+ # css = ".output_image, .input_image {height: 20rem !important; width: 100% !important;}"
506
+ css_input_file = (
507
+ ".input_file, {height: 9rem !important; width: 100% !important;}"
508
+ )
509
+ css_output_file = (
510
+ ".output_file , {height: 4rem !important; width: 100% !important;}"
511
  )
512
 
513
  logger.info("running at port %s", server_port)
 
533
  # height=150, # 500
534
  width=900, # 900
535
  allow_flagging=True,
536
+ flagging_options=[
537
+ "fatal",
538
+ "bug",
539
+ "brainstorm",
540
+ "excelsior",
541
+ ], # "paragon"],
542
+ css=f"{css_image} {css_input_file} {css_output_file}",
543
  )
544
 
545
  iface.launch(
546
+ share=False,
547
+ # share=True,
548
+ debug=debug,
549
+ # server_name="0.0.0.0",
550
+ server_name="127.0.0.1",
551
  server_port=server_port,
552
  # show_tips=True,
553
  enable_queue=True,
radiobee/gen_pset.py CHANGED
@@ -152,6 +152,7 @@ def gen_pset(
152
 
153
  Refer to _gen_pset.
154
  """
 
155
  for min_s in range(min_samples):
156
  logger.debug(" min_samples, try %s", min_samples - min_s)
157
  try:
@@ -171,4 +172,8 @@ def gen_pset(
171
  else:
172
  # break should happen above when min_samples = 2
173
  raise Exception("bummer, this shouldn't happen, probably another bug")
 
 
 
 
174
  return pset
 
152
 
153
  Refer to _gen_pset.
154
  """
155
+ gen_pset.min_samples = min_samples
156
  for min_s in range(min_samples):
157
  logger.debug(" min_samples, try %s", min_samples - min_s)
158
  try:
 
172
  else:
173
  # break should happen above when min_samples = 2
174
  raise Exception("bummer, this shouldn't happen, probably another bug")
175
+
176
+ # store new min_samples
177
+ gen_pset.min_samples = min_samples - min_s
178
+
179
  return pset
radiobee/plot_cmat.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Plot pandas.DataFrame with DBSCAN clustering."""
2
+ # pylint: disable=invalid-name, too-many-arguments
3
+ import numpy as np
4
+ import pandas as pd
5
+ import matplotlib
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ from sklearn.cluster import DBSCAN
9
+
10
+ from fastlid import fastlid
11
+ import logzero
12
+ from logzero import logger
13
+
14
+ from radiobee.cmat2tset import cmat2tset
15
+
16
+ # turn interactive when in ipython session
17
+ _ = """
18
+ if "get_ipython" in globals():
19
+ plt.ion()
20
+ else:
21
+ plt.switch_backend("Agg")
22
+ # """
23
+
24
+ logzero.loglevel(20) # 10: debug on
25
+ fastlid.set_languages = ["en", "zh"]
26
+
27
+
28
+ # fmt: off
29
+ def plot_cmat(
30
+ # df_: pd.DataFrame,
31
+ cmat: np.ndarray,
32
+ eps: float = 10,
33
+ min_samples: int = 6,
34
+ # ylim: int = None,
35
+ xlabel: str = "zh",
36
+ ylabel: str = "en",
37
+ backend: str = "Agg",
38
+ showfig: bool = False,
39
+ ):
40
+ # ) -> plt:
41
+ # fmt: on
42
+ """Plot df with DBSCAN clustering.
43
+
44
+ Args:
45
+ df_: pandas.DataFrame, with three columns columns=["x", "y", "cos"]
46
+ Returns:
47
+ matplotlib.pyplot: for possible use in gradio
48
+
49
+ plot_df(pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos']))
50
+ df_ = pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos'])
51
+
52
+ # sort 'x', axis 0 changes, index regenerated
53
+ df_s = df_.sort_values('x', axis=0, ignore_index=True)
54
+
55
+ # sorting does not seem to impact clustering
56
+ DBSCAN(1.5, min_samples=3).fit(df_).labels_
57
+ DBSCAN(1.5, min_samples=3).fit(df_s).labels_
58
+
59
+ """
60
+ logger.debug(
61
+ '"get_ipython" in globals(): %s', "get_ipython" in globals()
62
+ )
63
+
64
+ len1, len2 = cmat.shape
65
+
66
+ df_ = pd.DataFrame(cmat2tset(cmat))
67
+ df_.columns = ["x", "y", "cos"]
68
+
69
+ backend_saved = matplotlib.get_backend()
70
+
71
+ # switch backend if necessary
72
+ if backend_saved != backend:
73
+ plt.switch_backend(backend)
74
+
75
+ # len1 = len(lst1) # noqa
76
+ # len2 = len(lst2) # noqa
77
+
78
+ # lang1, _ = fastlid(" ".join(lst1))
79
+ # lang2, _ = fastlid(" ".join(lst2))
80
+ # xlabel: str = lang1
81
+ # ylabel: str = lang2
82
+
83
+ sns.set()
84
+ sns.set_style("darkgrid")
85
+
86
+ # close all existing figures, necessary for hf spaces
87
+ plt.close("all")
88
+ # if sys.platform not in ["win32", "linux"]:
89
+ # plt.switch_backend('Agg') # to cater for Mac, thanks to WhiteFox
90
+
91
+ fig = plt.figure()
92
+ gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
93
+ ax2 = fig.add_subplot(gs[0, 0])
94
+ ax0 = fig.add_subplot(gs[0, 1])
95
+ ax1 = fig.add_subplot(gs[1, 0])
96
+
97
+ cmap = "viridis_r"
98
+ sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
99
+ ax2.set_xlabel(xlabel)
100
+ ax2.set_ylabel(ylabel)
101
+ ax2.set_title("cos similarity heatmap")
102
+
103
+ fig.suptitle("alignment projection")
104
+
105
+ _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
106
+ # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
107
+ _x = ~_
108
+
109
+ df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
110
+
111
+ # clustered
112
+ df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
113
+
114
+ # outliers
115
+ df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
116
+
117
+ ax0.set_xlabel(xlabel)
118
+ ax0.set_ylabel(ylabel)
119
+
120
+ ax0.set_xlim(0, len1)
121
+ ax0.set_ylim(0, len2)
122
+ ax0.set_title("max along columns ('x': outliers)")
123
+
124
+ # ax1.set_xlabel("en")
125
+ # ax1.set_ylabel("zh")
126
+ ax1.set_xlabel(xlabel)
127
+ ax1.set_ylabel(ylabel)
128
+
129
+ ax1.set_xlim(0, len1)
130
+ ax1.set_ylim(0, len2)
131
+ ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
132
+
133
+ logger.debug(" matplotlib.get_backend(): %s", matplotlib.get_backend())
134
+
135
+ # if matplotlib.get_backend() not in ["Agg"]:
136
+ if showfig:
137
+ # plt.ioff() # or we'll just see the plot show and disappear
138
+ # plt.show()
139
+ plt.show(block=True)
140
+
141
+ # restore if necessary
142
+ if backend_saved != backend:
143
+ plt.switch_backend(backend_saved)
144
+
145
+ # return plt
radiobee/plot_df.py CHANGED
@@ -1,26 +1,37 @@
1
  """Plot pandas.DataFrame with DBSCAN clustering."""
2
  # pylint: disable=invalid-name, too-many-arguments
3
- # import numpy as np
4
  import pandas as pd
 
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
  from sklearn.cluster import DBSCAN
8
 
9
- from logzero import logger
 
 
10
 
11
  # turn interactive when in ipython session
 
12
  if "get_ipython" in globals():
13
  plt.ion()
 
 
 
 
14
 
15
 
16
  # fmt: off
17
  def plot_df(
18
  df_: pd.DataFrame,
19
- min_samples: int = 6,
20
  eps: float = 10,
21
- ylim: int = None,
22
- xlabel: str = "en",
23
- ylabel: str = "zh",
 
 
 
24
  ) -> plt:
25
  # fmt: on
26
  """Plot df with DBSCAN clustering.
@@ -41,60 +52,76 @@ def plot_df(
41
  DBSCAN(1.5, min_samples=3).fit(df_s).labels_
42
 
43
  """
44
- df_ = pd.DataFrame(df_)
45
- if df_.columns.__len__() < 3:
46
- logger.error(
47
- "expected 3 columns DataFram, got: %s, cant proceed, returninng None",
48
- df_.columns.tolist(),
49
- )
50
- return None
51
-
52
- # take first three columns
53
- columns = df_.columns[:3]
54
- df_ = df_[columns]
55
-
56
- # rename columns to "x", "y", "cos"
57
- df_.columns = ["x", "y", "cos"]
 
 
 
 
 
 
 
 
 
58
 
59
  sns.set()
60
  sns.set_style("darkgrid")
61
- # fig, (ax0, ax1) = plt.subplots(2, figsize=(11.69, 8.27))
62
- fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11.69, 8.27))
63
 
64
- fig.suptitle("alignment projection")
65
- _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
66
- _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
67
 
68
- # ax0.scatter(df_[_].x, df_[_].y, marker='o', c='g', alpha=0.5)
69
- # ax0.grid()
70
- # print("ratio: %.2f%%" % (100 * sum(_)/len(df_)))
 
71
 
72
- df_.plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax0)
 
73
 
74
- # clustered
75
- df_[_].plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax1)
 
 
76
 
 
 
77
  # outliers
78
  df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
79
 
80
- # ax0.set_xlabel("")
81
- # ax0.set_ylabel("zh")
82
- ax0.set_xlabel("")
83
- ax0.set_ylabel(ylabel)
84
- xlim = len(df_)
85
- ax0.set_xlim(0, xlim)
86
- if ylim:
87
- ax0.set_ylim(0, ylim)
88
- ax0.set_title("max similarity along columns (outliers denoted by 'x')")
89
-
90
  # ax1.set_xlabel("en")
91
  # ax1.set_ylabel("zh")
92
- ax1.set_xlabel(xlabel)
93
- ax1.set_ylabel(ylabel)
94
 
95
- ax1.set_xlim(0, xlim)
96
- if ylim:
97
- ax1.set_ylim(0, ylim)
98
- ax1.set_title(f"potential aligned pairs ({round(sum(_) / len(df_), 2):.0%})")
 
 
 
 
 
99
 
100
  return plt
 
 
 
 
 
 
 
 
 
 
 
1
  """Plot pandas.DataFrame with DBSCAN clustering."""
2
  # pylint: disable=invalid-name, too-many-arguments
3
+ import numpy as np # noqa
4
  import pandas as pd
5
+ import matplotlib
6
  import matplotlib.pyplot as plt
7
  import seaborn as sns
8
  from sklearn.cluster import DBSCAN
9
 
10
+ from logzero import logger # noqa
11
+
12
+ # from radiobee.cmat2tset import cmat2tset
13
 
14
  # turn interactive when in ipython session
15
+ _ = """
16
  if "get_ipython" in globals():
17
  plt.ion()
18
+ else:
19
+ plt.switch_backend('Agg')
20
+ # """
21
+ # fastlid.set_languages = ["en", "zh"]
22
 
23
 
24
  # fmt: off
25
  def plot_df(
26
  df_: pd.DataFrame,
27
+ # cmat: np.ndarray,
28
  eps: float = 10,
29
+ min_samples: int = 6,
30
+ xlabel: str = "",
31
+ ylabel: str = "",
32
+ xlim: int = 0,
33
+ ylim: int = 0,
34
+ backend: str = "TkAgg",
35
  ) -> plt:
36
  # fmt: on
37
  """Plot df with DBSCAN clustering.
 
52
  DBSCAN(1.5, min_samples=3).fit(df_s).labels_
53
 
54
  """
55
+ # df_ = pd.DataFrame(cmat2tset(cmat))
56
+ if df_.shape[1] == 3:
57
+ df_.columns = ["x", "y", "cos"]
58
+ else:
59
+ logger.error(" shape mismatch: %s, expected (x, 3)", df_.shape)
60
+ # return None
61
+ raise Exception(" df_.shape[1] not equal to 3 ")
62
+
63
+ if not xlim:
64
+ xlim = len(df_)
65
+ if not ylim:
66
+ ylim = df_.y.max()
67
+
68
+ if not xlabel:
69
+ xlabel = str(xlim)
70
+ if not ylabel:
71
+ ylabel = str(ylim)
72
+
73
+ backend_saved = matplotlib.get_backend()
74
+
75
+ # switch if necessary
76
+ if backend_saved != backend:
77
+ plt.switch_backend(backend)
78
 
79
  sns.set()
80
  sns.set_style("darkgrid")
 
 
81
 
82
+ fig = plt.figure(figsize=(13, 8))
 
 
83
 
84
+ # gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
85
+ # ax2 = fig.add_subplot(gs[0, 0])
86
+ # ax0 = fig.add_subplot(gs[0, 1])
87
+ # ax1 = fig.add_subplot(gs[1, 0])
88
 
89
+ gs = fig.add_gridspec(1, 1, wspace=0.4, hspace=0.58)
90
+ ax0 = fig.add_subplot(gs[0, 0])
91
 
92
+ cmap = "viridis_r"
93
+
94
+ _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
95
+ _x = ~_
96
 
97
+ # clustered
98
+ df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
99
  # outliers
100
  df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
101
 
 
 
 
 
 
 
 
 
 
 
102
  # ax1.set_xlabel("en")
103
  # ax1.set_ylabel("zh")
104
+ ax0.set_xlabel(xlabel)
105
+ ax0.set_ylabel(ylabel)
106
 
107
+ # ax0.set_xlim(0, xlim)
108
+ # ax0.set_ylim(0, ylim)
109
+ ax0.set_title("max cos ('x': outliers)")
110
+
111
+ # ax1.set_title(f"potential aligned pairs ({round(sum(_) / xlim, 2):.0%})")
112
+
113
+ # restore if necessary
114
+ if backend_saved != backend:
115
+ plt.switch_backend(backend_saved)
116
 
117
  return plt
118
+
119
+
120
+ _ = """
121
+ eps: float = 10
122
+ min_samples: int = 6
123
+ xlabel: str = ""
124
+ ylabel: str = ""
125
+ xlim: int = 0
126
+ ylim: int = 0
127
+ """
radiobee/trim_df.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Trim df."""
2
+ import pandas as pd
3
+
4
+
5
+ # fmt: off
6
+ def trim_df(
7
+ df1: pd.DataFrame,
8
+ len_: int = 4,
9
+ ) -> pd.DataFrame:
10
+ # fmt: on
11
+ """Trim df."""
12
+ if len(df1) > 2 * len_:
13
+ df_trimmed = pd.concat(
14
+ [
15
+ df1.iloc[:len_, :],
16
+ pd.DataFrame(
17
+ [
18
+ [
19
+ "...",
20
+ "...",
21
+ ]
22
+ ],
23
+ columns=df1.columns,
24
+ ),
25
+ df1.iloc[-len_:, :],
26
+ ],
27
+ ignore_index=1,
28
+ )
29
+ return df_trimmed
30
+ return df1
run-radiobee.bat CHANGED
@@ -1,3 +1,4 @@
1
  REM nodemon -V -w radiobee -x "sleep 3 && python -m radiobee"
2
  REM nodemon -V -w radiobee -x python -m radiobee
3
- nodemon -V -w radiobee -x py -3.8 -m radiobee
 
 
1
  REM nodemon -V -w radiobee -x "sleep 3 && python -m radiobee"
2
  REM nodemon -V -w radiobee -x python -m radiobee
3
+ REM nodemon -V -w radiobee -x py -3.8 -m radiobee
4
+ nodemon -V -w radiobee -x "run-p pyright flake8 && py -3.8 -m radiobee"