freemt committed on
Commit
52a9494
1 Parent(s): 4c04f50
img/plt.png CHANGED
radiobee/__main__.py CHANGED
@@ -4,6 +4,8 @@ from typing import Any, Tuple, Optional, Union # noqa
4
 
5
  import sys
6
  from pathlib import Path # noqa
 
 
7
  import platform
8
  import signal
9
  from random import randint
@@ -41,7 +43,10 @@ from radiobee.process_upload import process_upload
41
  from radiobee.gradiobee import gradiobee
42
 
43
  ic_install()
44
- ic.configureOutput(includeContext=True)
 
 
 
45
  ic.enable()
46
  # ic.disable() # to turn off
47
 
@@ -105,6 +110,12 @@ if __name__ == "__main__":
105
  debug = False
106
  debug = True
107
  share = True
 
 
 
 
 
 
108
  else:
109
  server_name = "127.0.0.1"
110
  share = False
@@ -128,7 +139,6 @@ if __name__ == "__main__":
128
  gr.inputs.File(label="file 2", optional=True),
129
  ]
130
 
131
- # modi 1
132
  _ = """
133
  tf_type: Literal[linear, sqrt, log, binary] = 'linear'
134
  idf_type: Optional[Literal[standard, smooth, bm25]] = None
@@ -148,10 +158,13 @@ if __name__ == "__main__":
148
  ) # ditto
149
  input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None") # ditto
150
 
151
- inputs = [
 
 
 
152
  gr.inputs.File(label="file 1"),
153
  gr.inputs.File(label="file 2", optional=True),
154
- input_tf_type, # modi inputs
155
  input_idf_type,
156
  input_dl_type,
157
  input_norm_type,
@@ -167,6 +180,7 @@ if __name__ == "__main__":
167
  step=1,
168
  default=6,
169
  ),
 
170
  ]
171
 
172
  examples = [
@@ -179,16 +193,40 @@ if __name__ == "__main__":
179
  "None",
180
  10,
181
  6,
 
182
  ],
183
  [
 
184
  "data/test_en.txt",
 
 
 
 
 
 
 
 
 
185
  "data/test_zh.txt",
 
186
  "linear",
187
  "None",
188
  "None",
189
  "None",
190
  10,
191
  6,
 
 
 
 
 
 
 
 
 
 
 
 
192
  ],
193
  [
194
  "data/shakespeare_zh500.txt",
@@ -199,6 +237,7 @@ if __name__ == "__main__":
199
  "None",
200
  10,
201
  6,
 
202
  ],
203
  [
204
  "data/shakespeare_en500.txt",
@@ -209,6 +248,7 @@ if __name__ == "__main__":
209
  "None",
210
  10,
211
  6,
 
212
  ],
213
  [
214
  "data/hlm-ch1-zh.txt",
@@ -219,6 +259,7 @@ if __name__ == "__main__":
219
  "None",
220
  10,
221
  6,
 
222
  ],
223
  [
224
  "data/hlm-ch1-en.txt",
@@ -229,6 +270,7 @@ if __name__ == "__main__":
229
  "None",
230
  10,
231
  6,
 
232
  ],
233
  [
234
  "data/ps-cn.txt",
@@ -239,6 +281,7 @@ if __name__ == "__main__":
239
  "None",
240
  10,
241
  4,
 
242
  ],
243
  [
244
  "data/test-dual.txt",
@@ -249,6 +292,7 @@ if __name__ == "__main__":
249
  "None",
250
  10,
251
  6,
 
252
  ],
253
  [
254
  "data/英译中国现代散文选1(汉外对照丛书).txt",
@@ -259,6 +303,7 @@ if __name__ == "__main__":
259
  "None",
260
  10,
261
  6,
 
262
  ],
263
  [
264
  "data/test-zh-ja.txt",
@@ -269,6 +314,7 @@ if __name__ == "__main__":
269
  "None",
270
  10,
271
  6,
 
272
  ],
273
  [
274
  "data/xiyouji-ch1-zh.txt",
@@ -279,6 +325,7 @@ if __name__ == "__main__":
279
  "None",
280
  10,
281
  6,
 
282
  ],
283
  [
284
  "data/demian-hesse-de.txt",
@@ -289,6 +336,7 @@ if __name__ == "__main__":
289
  "None",
290
  10,
291
  6,
 
292
  ],
293
  [
294
  "data/catcher-in-the-rye-shixianrong-zh.txt",
@@ -299,6 +347,7 @@ if __name__ == "__main__":
299
  "None",
300
  10,
301
  6,
 
302
  ],
303
  ]
304
 
@@ -329,14 +378,23 @@ if __name__ == "__main__":
329
  out_file_dl_excel = gr.outputs.File(
330
  label="Click to download xlsx",
331
  )
 
 
 
 
 
 
 
 
332
 
333
- # modi outputs
334
- outputs = [
335
  out_df,
336
- # "plot",
337
  gr.outputs.Image(label="plot"),
338
  out_file_dl,
339
  out_file_dl_excel,
 
 
340
  out_df_aligned,
341
  gr.outputs.HTML(),
342
  ]
4
 
5
  import sys
6
  from pathlib import Path # noqa
7
+ import subprocess as sp
8
+ import shlex
9
  import platform
10
  import signal
11
  from random import randint
43
  from radiobee.gradiobee import gradiobee
44
 
45
  ic_install()
46
+ ic.configureOutput(
47
+ includeContext=True,
48
+ outputFunction=logger.info,
49
+ )
50
  ic.enable()
51
  # ic.disable() # to turn off
52
 
110
  debug = False
111
  debug = True
112
  share = True
113
+
114
+ # set UTC+8; probably won't work in HF Spaces (no permission)
115
+ try:
116
+ sp.check_output(shlex.split("ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime"))
117
+ except Exception as exc:
118
+ logger.error(" set timezonef failed: %s", exc)
119
  else:
120
  server_name = "127.0.0.1"
121
  share = False
139
  gr.inputs.File(label="file 2", optional=True),
140
  ]
141
 
 
142
  _ = """
143
  tf_type: Literal[linear, sqrt, log, binary] = 'linear'
144
  idf_type: Optional[Literal[standard, smooth, bm25]] = None
158
  ) # ditto
159
  input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None") # ditto
160
 
161
+ # modi inputs 1, definitions
162
+ sent_ali_algo = gr.inputs.Radio(["None", "fast", "slow"], default="None")
163
+
164
+ inputs = [ # tot. 9, need to modify input of gradio & examples
165
  gr.inputs.File(label="file 1"),
166
  gr.inputs.File(label="file 2", optional=True),
167
+ input_tf_type, # modi inputs 2
168
  input_idf_type,
169
  input_dl_type,
170
  input_norm_type,
180
  step=1,
181
  default=6,
182
  ),
183
+ sent_ali_algo,
184
  ]
185
 
186
  examples = [
193
  "None",
194
  10,
195
  6,
196
+ "None",
197
  ],
198
  [
199
+ "data/test_zh.txt",
200
  "data/test_en.txt",
201
+ "linear",
202
+ "None",
203
+ "None",
204
+ "None",
205
+ 10,
206
+ 6,
207
+ "fast",
208
+ ],
209
+ [
210
  "data/test_zh.txt",
211
+ "data/test_en.txt",
212
  "linear",
213
  "None",
214
  "None",
215
  "None",
216
  10,
217
  6,
218
+ "slow",
219
+ ],
220
+ [
221
+ "data/test_en.txt",
222
+ "data/test_zh.txt",
223
+ "linear",
224
+ "None",
225
+ "None",
226
+ "None",
227
+ 10,
228
+ 6,
229
+ "None",
230
  ],
231
  [
232
  "data/shakespeare_zh500.txt",
237
  "None",
238
  10,
239
  6,
240
+ "None",
241
  ],
242
  [
243
  "data/shakespeare_en500.txt",
248
  "None",
249
  10,
250
  6,
251
+ "None",
252
  ],
253
  [
254
  "data/hlm-ch1-zh.txt",
259
  "None",
260
  10,
261
  6,
262
+ "None",
263
  ],
264
  [
265
  "data/hlm-ch1-en.txt",
270
  "None",
271
  10,
272
  6,
273
+ "None",
274
  ],
275
  [
276
  "data/ps-cn.txt",
281
  "None",
282
  10,
283
  4,
284
+ "None",
285
  ],
286
  [
287
  "data/test-dual.txt",
292
  "None",
293
  10,
294
  6,
295
+ "None",
296
  ],
297
  [
298
  "data/英译中国现代散文选1(汉外对照丛书).txt",
303
  "None",
304
  10,
305
  6,
306
+ "None",
307
  ],
308
  [
309
  "data/test-zh-ja.txt",
314
  "None",
315
  10,
316
  6,
317
+ "None",
318
  ],
319
  [
320
  "data/xiyouji-ch1-zh.txt",
325
  "None",
326
  10,
327
  6,
328
+ "None",
329
  ],
330
  [
331
  "data/demian-hesse-de.txt",
336
  "None",
337
  10,
338
  6,
339
+ "None",
340
  ],
341
  [
342
  "data/catcher-in-the-rye-shixianrong-zh.txt",
347
  "None",
348
  10,
349
  6,
350
+ "None",
351
  ],
352
  ]
353
 
378
  out_file_dl_excel = gr.outputs.File(
379
  label="Click to download xlsx",
380
  )
381
+ out_sents_dl = gr.outputs.File(
382
+ label="Click to download sents csv",
383
+ )
384
+ out_sents_dl_excel = gr.outputs.File(
385
+ label="Click to download sents xlsx",
386
+ )
387
+
388
+ # modi outputs 1, definitions
389
 
390
+ # modi outputs 2, need to modify gradio error_msg
391
+ outputs = [ # tot. 8
392
  out_df,
 
393
  gr.outputs.Image(label="plot"),
394
  out_file_dl,
395
  out_file_dl_excel,
396
+ out_sents_dl,
397
+ out_sents_dl_excel,
398
  out_df_aligned,
399
  gr.outputs.HTML(),
400
  ]
radiobee/align_sents.py CHANGED
@@ -67,6 +67,11 @@ def align_sents(lst1: List[str], lst2: List[str]) -> List[Tuple[str, str]]:
67
 
68
  texts.append(tuple(_))
69
 
70
- return texts
 
 
 
 
 
71
 
72
- # return ["", ""]
67
 
68
  texts.append(tuple(_))
69
 
70
+ _ = """
71
+ _ = []
72
+ for elm in texts:
73
+ _.extend(elm)
74
+ return _
75
+ """
76
 
77
+ return texts
radiobee/align_sents.pyc ADDED
Binary file (1.42 kB). View file
radiobee/error_msg.py CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
8
  def error_msg(
9
  msg: Optional[Union[str, Exception]],
10
  title: str = "error message",
11
- ) -> Tuple[Union[pd.DataFrame, None], None, None, None, None, None]:
12
  """Prepare an error message for gradiobee outputs."""
13
  if msg is None:
14
  msg = "none..."
@@ -21,4 +21,4 @@ def error_msg(
21
  df = pd.DataFrame([msg], columns=[title])
22
 
23
  # return df, *((None,) * 4) # pyright complains
24
- return df, None, None, None, None, None
8
  def error_msg(
9
  msg: Optional[Union[str, Exception]],
10
  title: str = "error message",
11
+ ) -> Tuple[Union[pd.DataFrame, None], None, None, None, None, None, None, None]:
12
  """Prepare an error message for gradiobee outputs."""
13
  if msg is None:
14
  msg = "none..."
21
  df = pd.DataFrame([msg], columns=[title])
22
 
23
  # return df, *((None,) * 4) # pyright complains
24
+ return df, None, None, None, None, None, None, None
radiobee/gradiobee.py CHANGED
@@ -30,6 +30,10 @@ from radiobee.trim_df import trim_df
30
  from radiobee.error_msg import error_msg
31
  from radiobee.text2lists import text2lists
32
 
 
 
 
 
33
  uname = platform.uname()
34
  HFSPACES = False
35
  if "amzn2" in uname.release: # on hf spaces
@@ -43,7 +47,7 @@ debug = False
43
  debug = True
44
 
45
 
46
- def gradiobee(
47
  file1,
48
  file2,
49
  tf_type,
@@ -53,6 +57,7 @@ def gradiobee(
53
  eps,
54
  min_samples,
55
  # debug=False,
 
56
  ):
57
  """Process inputs and return outputs."""
58
  logger.debug(" *debug* ")
@@ -382,7 +387,7 @@ def gradiobee(
382
  df_aligned = df_aligned[["text2", "text1", "likelihood"]]
383
  df_aligned.columns = ["text1", "text2", "likelihood"]
384
 
385
- ic(df_aligned.head())
386
 
387
  # round the last column to 2
388
  # df_aligned.likelihood = df_aligned.likelihood.round(2)
@@ -434,8 +439,66 @@ def gradiobee(
434
  # return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned
435
  # return df_trimmed, output_plot, file_dl, file_dl_xlsx, styled, df_html # gradio cant handle style
436
 
437
- ic("returning outputs")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
 
439
- return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned, df_html
 
440
 
441
- # modi outputs
 
30
  from radiobee.error_msg import error_msg
31
  from radiobee.text2lists import text2lists
32
 
33
+ from radiobee.align_sents import align_sents
34
+ from radiobee.shuffle_sents import shuffle_sents # type: ignore
35
+ from radiobee.paras2sents import paras2sents # type: ignore
36
+
37
  uname = platform.uname()
38
  HFSPACES = False
39
  if "amzn2" in uname.release: # on hf spaces
47
  debug = True
48
 
49
 
50
+ def gradiobee( # noqa
51
  file1,
52
  file2,
53
  tf_type,
57
  eps,
58
  min_samples,
59
  # debug=False,
60
+ sent_ali_algo,
61
  ):
62
  """Process inputs and return outputs."""
63
  logger.debug(" *debug* ")
387
  df_aligned = df_aligned[["text2", "text1", "likelihood"]]
388
  df_aligned.columns = ["text1", "text2", "likelihood"]
389
 
390
+ ic("paras aligned: ", df_aligned.head(10))
391
 
392
  # round the last column to 2
393
  # df_aligned.likelihood = df_aligned.likelihood.round(2)
439
  # return df_trimmed, output_plot, file_dl, file_dl_xlsx, df_aligned
440
  # return df_trimmed, output_plot, file_dl, file_dl_xlsx, styled, df_html # gradio cant handle style
441
 
442
+ ic("sent-ali-algo: ", sent_ali_algo)
443
+
444
+ # ### sent-ali-algo is None: para align
445
+ if sent_ali_algo in ["None"]:
446
+ ic("returning para-ali outputs")
447
+ return df_trimmed, output_plot, file_dl, file_dl_xlsx, None, None, df_aligned, df_html
448
+
449
+ # ### proceed with sent align
450
+ if sent_ali_algo in ["fast"]:
451
+ ic(sent_ali_algo)
452
+ align_func = align_sents
453
+
454
+ ic(df_aligned.shape, df_aligned.columns)
455
+
456
+ aligned_sents = paras2sents(df_aligned, align_func)
457
+
458
+ # ic(pd.DataFrame(aligned_sents).shape, aligned_sents)
459
+ ic(pd.DataFrame(aligned_sents).shape)
460
+
461
+ df_aligned_sents = pd.DataFrame(aligned_sents, columns=["text1", "text2"])
462
+ else: # ["slow"]
463
+ ic(sent_ali_algo)
464
+ align_func = shuffle_sents
465
+ aligned_sents = paras2sents(df_aligned, align_func, lang1, lang2)
466
+
467
+ # add extra entry if necessary
468
+ aligned_sents = [list(sent) + [""] if len(sent) == 2 else list(sent) for sent in aligned_sents]
469
+
470
+ df_aligned_sents = pd.DataFrame(aligned_sents, columns=["text1", "text2", "likelihood"])
471
+
472
+ # prepare sents downloads
473
+ file_dl_sents = Path(f"{file_dl.stem}-sents{file_dl.suffix}")
474
+ file_dl_xlsx_sents = Path(f"{file_dl_xlsx.stem}-sents{file_dl_xlsx.suffix}")
475
+ _ = df_aligned_sents.to_csv(index=False)
476
+ file_dl_sents.write_text(_, encoding="utf8")
477
+
478
+ df_aligned_sents.to_excel(file_dl_xlsx_sents)
479
+
480
+ # prepare html output
481
+ if len(df_aligned_sents) > 200:
482
+ df_html = None
483
+ else: # show a one-bathc table in html
484
+ # style
485
+ styled = df_aligned_sents.style.set_properties(
486
+ **{
487
+ "font-size": "10pt",
488
+ "border-color": "black",
489
+ "border": "1px black solid !important"
490
+ }
491
+ # border-color="black",
492
+ ).set_table_styles([{
493
+ "selector": "", # noqs
494
+ "props": [("border", "2px black solid !important")]}] # noqs
495
+ ).format(
496
+ precision=2
497
+ )
498
+ df_html = styled.to_html()
499
 
500
+ # aligned sents outputs
501
+ ic("aligned sents outputs")
502
 
503
+ # return df_trimmed, output_plot, file_dl, file_dl_xlsx, None, None, df_aligned, df_html
504
+ return df_trimmed, output_plot, file_dl, file_dl_xlsx, file_dl_sents, file_dl_xlsx_sents, df_aligned_sents, df_html
radiobee/paras2sents.pyc ADDED
Binary file (2.57 kB). View file
radiobee/shuffle_sents.pyc ADDED
Binary file (2.02 kB). View file
run-pydocstle.bat → run-pydocstyle.bat RENAMED
File without changes
tests/test_paras2sents.py CHANGED
@@ -1,6 +1,7 @@
1
  """Test paras2sents."""
2
  # pylint: disable=invalid-name
3
 
 
4
  import pandas as pd
5
  from radiobee.paras2sents import paras2sents
6
  from radiobee.shuffle_sents import shuffle_sents
@@ -14,15 +15,20 @@ def test_paras2sents_dual():
14
  """Test paras2sents_dual."""
15
  sents = paras2sents(paras)
16
 
 
 
17
  assert len(sents) > 202 # 208
18
  # assert not sents
19
 
20
 
21
  def test_paras2sents_dual_model_s():
22
  """Test paras2sents_dual_model_s."""
23
- sents = paras2sents(paras, shuffle_sents)
 
 
 
24
 
25
- assert len(sents) > 201 # 207
26
  # assert not sents
27
 
28
 
1
  """Test paras2sents."""
2
  # pylint: disable=invalid-name
3
 
4
+ import numpy as np
5
  import pandas as pd
6
  from radiobee.paras2sents import paras2sents
7
  from radiobee.shuffle_sents import shuffle_sents
15
  """Test paras2sents_dual."""
16
  sents = paras2sents(paras)
17
 
18
+ assert np.array(sents).shape.__len__() > 1
19
+
20
  assert len(sents) > 202 # 208
21
  # assert not sents
22
 
23
 
24
  def test_paras2sents_dual_model_s():
25
  """Test paras2sents_dual_model_s."""
26
+ sents1 = paras2sents(paras, shuffle_sents)
27
+
28
+ # assert np.array(sents1).shape.__len__() > 1
29
+ assert pd.DataFrame(sents1).shape.__len__() > 1
30
 
31
+ assert len(sents1) > 201 # 207
32
  # assert not sents
33
 
34