freemt committed on
Commit
077e1eb
1 Parent(s): 03014a3

Update short text bug fix

Browse files
data/en.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [Young Warrior] Kingold(184283681) 2021-12-30 22:27:37
2
+ It seems that the standalone version can
3
+ omit the GUI and specify the two files to be aligned directly on the command line.
4
+
5
+
6
+ But if it's not the GUI module that's taking up space, then
7
+ removing it won't help compress the size of the whole package.
8
+
data/testen.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Superfast Computer Chip Transmits Data with Light
2
+ Computer chips have two important parts—the logic on the chip, which computes and executes programs. Then there’s the part that sends and receives—gets data to crunch, sends back the answer. And while that first part, chip logic, has gotten much faster over the years, the transmission part has lagged behind,because data gets sent via electrical signals passing through copper.
3
+ So researchers designed a chip that exchanges data with light instead. "By going into optics, we're able to relieve this fundamental bottleneck of copper, and in doing so we're able to increase the bandwidth density on the chip, so how fast the chip can take data in and out, by an order of magnitude," said Chen Sun, a computer hardware researcher at UC Berkeley, and the startup Ayar Labs.
4
+ A metal pin on the memory chip in your computer might transmit at 1.6 gigabits per second. Sun's optical connection ups that rate to 2.5 gigabits per second. Not a huge difference on the face of it. But the killer app here is that multiple wavelengths of light—up to 11—can be used simultaneously to send data through a single fiber, which means this technology has potential speeds of 27.5 gigabits per second—more than an order of magnitude faster than today’s standard. "So that's the extra dimension we have to scale bandwidth that we don't have with normal electrical signals." The findings appear in the journal Nature.
5
+ These chips with optical connections are not just high-speed—they also require less energy than the copper versions. That could be a big deal, with server farms projected to outpace every other commercial use of electricity within the next decade. Going optical could thus be a win-win: faster processing using a fraction of the energy.
data/testzh.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ �����ټ�����⵼оƬ
2
+ �����оƬ��������Ҫ���֣���һ��оƬ�ϵ��߼���·�����ڼ����ִ�г�������Ǵ�����·�����ڷ��ͺͽ������ݣ����������ݴ�����Ȼ�󷵻ؽ����ǰ����Щ�귢չʮ��Ѹ�٣����������ڳ������ݵĵ��ź���ͨ��ͭ�ߴ��͵ģ������չ��Ϊ�ͺ�
3
+ �����о���Ա��������ù�������ݽ�����оƬ�������Ǽ��ݴ�ѧ��������У�͸մ������õİ��Ƕ�ʵ���ҵ�˫Ƹ�����Ӳ���о�Ա�� ��˵����ͨ�����ù�ѧ�����������ܽ��ͭ�ߴ�����е�ƿ�����⣬�������ǿ�������оƬ�Ŀ����ܶȣ���ˣ���������������ٶȻ�쵽����һ����������
4
+ ���˵����ڴ�оƬ�Ľ�����Ŵ����ٶ�Ϊ1.6������/�룬���ԵĹ⵼оƬ�ɽ���һ�ٶ�������2.5������/�롣����濴��û��̫��IJ��죬��������ɱ֮�����ڽ�һ�����˿�ͬʱ��������Ⲩ�������ɴ�11���������������ݡ���Ҳ����ζ���䴫���ٶȿɴﵽ27.5������/�룬��Ƚ���ı�׼����һ����������������˵�������������辶����չ����ͨ���ź��޷�ӵ�еĴ������� ���о���������ڡ���Ȼ����־�ϡ�
5
+ �����ͭ�ߵ絼оƬ����Щ�⵼оƬ���������������ٶȿ죬�����ܺ�Ҳ�ϵ͡���δ��ʮ���ڣ�Ԥ�Ƶ�������Ⱥ�ĵ������ᳬ���κ�������ҵ�õ磬�����⣨һ�������£�������һ�������ش��Ͷ�ʡ� ��ѧ��Ӧ�ý���������һ��˫Ӯ�ľ���:�����ٶȵ���������Դ���Ľϵļ��١�
data/zh.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ 【少侠】Kingold(184283681) 2021-12-30 22:27:37
2
+ 单机版貌似可以省略掉图形界面,直接
3
+ 命令行指定两个待对齐文件。
4
+
5
+ 不过如果占地方的
6
+
7
+ 不是图形界面的模块,那去掉了也对压缩整个包的大小没帮助。
radiobee/__main__.py CHANGED
@@ -1,16 +1,17 @@
1
  """Run interactively."""
 
2
  from typing import Tuple # , Optional
3
 
4
-
5
  from pathlib import Path
6
- import joblib
7
  from random import randint
8
  from textwrap import dedent
9
  from itertools import zip_longest
10
- from sklearn.cluster import DBSCAN
11
-
12
  from socket import socket, AF_INET, SOCK_STREAM
13
- import signal
 
 
14
  from varname import nameof
15
  from logzero import logger
16
 
@@ -22,6 +23,8 @@ import matplotlib.pyplot as plt
22
  # from tabulate import tabulate
23
  from fastlid import fastlid
24
 
 
 
25
  import gradio as gr
26
  from radiobee.process_upload import process_upload
27
  from radiobee.files2df import files2df
@@ -43,6 +46,7 @@ print("Press Ctrl+C to quit\n")
43
 
44
 
45
  def savelzma(obj, fileloc: str = None):
 
46
  if fileloc is None:
47
  fileloc = nameof(obj) # this wont work
48
  joblib.dump(obj, f"data/{fileloc}.lzma")
@@ -55,6 +59,7 @@ def greet(input):
55
 
56
  def upfile1(file1, file2=None) -> Tuple[str, str]:
57
  """Upload file1, file2."""
 
58
  return file1.name, f"'Sup yo! (your input: {input})"
59
 
60
 
@@ -245,6 +250,11 @@ if __name__ == "__main__":
245
  sns.set()
246
  sns.set_style("darkgrid")
247
 
 
 
 
 
 
248
  fig = plt.figure()
249
  gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
250
  ax2 = fig.add_subplot(gs[0, 0])
@@ -260,7 +270,8 @@ if __name__ == "__main__":
260
  fig.suptitle("alignment projection")
261
 
262
  _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
263
- _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
 
264
 
265
  df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
266
 
@@ -309,12 +320,28 @@ if __name__ == "__main__":
309
  )
310
 
311
  # process lst1, lst2 to obtained df_aligned
312
- pset = gen_pset(
313
- cmat,
314
- eps=eps,
315
- min_samples=min_samples,
316
- delta=7,
317
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  src_len, tgt_len = cmat.shape
319
  aset = gen_aset(pset, src_len, tgt_len)
320
  final_list = align_texts(aset, lst2, lst1) # note the order
@@ -360,7 +387,10 @@ if __name__ == "__main__":
360
  * Click "Clear" first for subsequent submits when uploading files.
361
  * `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
362
  * Suggested `esp` and `min_samples` values -- `esp` (minimum epsilon): 8-12, `min_samples`: 4-8.
363
- - Smaller larger `esp` or `min_samples` will result in more aligned pairs but also more **false positives** (pairs falsely identified as candidates). On the other hand, larger smaller `esp` or `min_samples` values tend to miss 'good' pairs.
 
 
 
364
  * If you need to have a better look at the image, you can right-click on the image and select copy-image-address and open a new tab in the browser with the copied image address.
365
  * `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
366
  """
 
1
  """Run interactively."""
2
+ # pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, wrong-import-position, too-many-locals, too-many-statements
3
  from typing import Tuple # , Optional
4
 
5
+ import sys
6
  from pathlib import Path
7
+ import signal
8
  from random import randint
9
  from textwrap import dedent
10
  from itertools import zip_longest
 
 
11
  from socket import socket, AF_INET, SOCK_STREAM
12
+
13
+ from sklearn.cluster import DBSCAN
14
+ import joblib
15
  from varname import nameof
16
  from logzero import logger
17
 
 
23
  # from tabulate import tabulate
24
  from fastlid import fastlid
25
 
26
+ if "." not in sys.path:
27
+ sys.path.insert(0, ".")
28
  import gradio as gr
29
  from radiobee.process_upload import process_upload
30
  from radiobee.files2df import files2df
 
46
 
47
 
48
  def savelzma(obj, fileloc: str = None):
49
+ """Aux function."""
50
  if fileloc is None:
51
  fileloc = nameof(obj) # this wont work
52
  joblib.dump(obj, f"data/{fileloc}.lzma")
 
59
 
60
  def upfile1(file1, file2=None) -> Tuple[str, str]:
61
  """Upload file1, file2."""
62
+ del file2
63
  return file1.name, f"'Sup yo! (your input: {input})"
64
 
65
 
 
250
  sns.set()
251
  sns.set_style("darkgrid")
252
 
253
+ # close all existing figures, necessary for hf spaces
254
+ plt.close("all")
255
+ # if sys.platform not in ["win32", "linux"]:
256
+ plt.switch_backend('Agg') # to cater for Mac, thanks to WhiteFox
257
+
258
  fig = plt.figure()
259
  gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
260
  ax2 = fig.add_subplot(gs[0, 0])
 
270
  fig.suptitle("alignment projection")
271
 
272
  _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
273
+ # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
274
+ _x = ~_
275
 
276
  df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
277
 
 
320
  )
321
 
322
  # process lst1, lst2 to obtained df_aligned
323
+ # quick fix ValueError: not enough values to unpack (expected at least 1, got 0)
324
+ # fixed in gen_pset, but we leave the loop here
325
+ for min_s in range(min_samples):
326
+ logger.info(" min_samples, try %s", min_samples - min_s)
327
+ try:
328
+ pset = gen_pset(
329
+ cmat,
330
+ eps=eps,
331
+ min_samples=min_samples - min_s,
332
+ delta=7,
333
+ )
334
+ break
335
+ except ValueError:
336
+ logger.info(" decrease min_samples by %s", min_s + 1)
337
+ continue
338
+ except Exception as e:
339
+ logger.error(e)
340
+ continue
341
+ else:
342
+ # break should happen above when min_samples = 2
343
+ raise Exception("bummer, this shouldn't happen, probably another bug")
344
+
345
  src_len, tgt_len = cmat.shape
346
  aset = gen_aset(pset, src_len, tgt_len)
347
  final_list = align_texts(aset, lst2, lst1) # note the order
 
387
  * Click "Clear" first for subsequent submits when uploading files.
388
  * `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
389
  * Suggested `esp` and `min_samples` values -- `esp` (minimum epsilon): 8-12, `min_samples`: 4-8.
390
+ - Smaller larger `esp` or `min_samples` will result in more aligned pairs but also more **false positives** (pairs
391
+ falsely identified as candidates). On the other hand,
392
+ larger smaller `esp` or `min_samples` values tend to miss
393
+ 'good' pairs.
394
  * If you need to have a better look at the image, you can right-click on the image and select copy-image-address and open a new tab in the browser with the copied image address.
395
  * `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
396
  """
radiobee/files2df.py CHANGED
@@ -25,6 +25,7 @@ def files2df(file1, file2):
25
 
26
  return df
27
 
 
28
  _ = """
29
  # return tabulate(df)
30
  # return tabulate(df, tablefmt="grid")
 
25
 
26
  return df
27
 
28
+
29
  _ = """
30
  # return tabulate(df)
31
  # return tabulate(df, tablefmt="grid")
radiobee/gen_pset.py CHANGED
@@ -13,7 +13,7 @@ from radiobee.cmat2tset import cmat2tset
13
  from radiobee.interpolate_pset import interpolate_pset
14
 
15
 
16
- def gen_pset(
17
  cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
18
  eps: float = 10,
19
  min_samples: int = 6,
@@ -139,3 +139,36 @@ def gen_pset(
139
 
140
  # return [(1, 1, "")]
141
  return [(int(elm0), int(elm1), elm2) for elm0, elm1, elm2 in buff]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  from radiobee.interpolate_pset import interpolate_pset
14
 
15
 
16
+ def _gen_pset(
17
  cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
18
  eps: float = 10,
19
  min_samples: int = 6,
 
139
 
140
  # return [(1, 1, "")]
141
  return [(int(elm0), int(elm1), elm2) for elm0, elm1, elm2 in buff]
142
+
143
+
144
+ def gen_pset(
145
+ cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
146
+ eps: float = 10,
147
+ min_samples: int = 6,
148
+ delta: float = 7,
149
+ verbose: Union[bool, int] = False,
150
+ ) -> List[Tuple[Union[float, str], Union[float, str], Union[float, str]]]:
151
+ """Gen pset.
152
+
153
+ Refer to _gen_pset.
154
+ """
155
+ for min_s in range(min_samples):
156
+ logger.debug(" min_samples, try %s", min_samples - min_s)
157
+ try:
158
+ pset = _gen_pset(
159
+ cmat1,
160
+ eps=eps,
161
+ min_samples=min_samples - min_s,
162
+ delta=delta,
163
+ )
164
+ break
165
+ except ValueError:
166
+ logger.debug(" decrease min_samples by %s", min_s + 1)
167
+ continue
168
+ except Exception as e:
169
+ logger.error(e)
170
+ continue
171
+ else:
172
+ # break should happen above when min_samples = 2
173
+ raise Exception("bummer, this shouldn't happen, probably another bug")
174
+ return pset
radiobee/loadtext.py CHANGED
@@ -16,8 +16,7 @@ magic.from_file("testdata/test.pdf")
16
  original load_textrev
17
  refer to load_paras.py
18
  """
19
- from typing import Optional, Union
20
- import os
21
  from pathlib import Path
22
  import cchardet
23
 
@@ -26,24 +25,11 @@ from logzero import logger
26
  # from detect_file import detect_file
27
 
28
 
29
- def loadtext(filepath: Union[Path, str] = "") -> Optional[str]:
30
  """Load file context to text.
31
 
32
  Check encoding and load a file to text.
33
-
34
- load_paras(filepath='') ==> paralist, lenlist =
35
  """
36
- if not filepath:
37
- defaultdir = r"C:\dl\Dropbox\shuangyu_ku\txt-books"
38
- defaultfile = r"Folding_Beijing-en.txt"
39
-
40
- # filepath = defaultfile
41
- # defaultdir = r'C:\dl\Dropbox\mat-dir\snippets-mat\pyqt'
42
- # defaultfile = r'notes pyqt tkinter tktable.txt'
43
-
44
- filepath = os.path.join(defaultdir, defaultfile)
45
- filepath = Path(defaultdir) / defaultfile
46
-
47
  filepath = Path(filepath)
48
  if not filepath.is_file():
49
  logger.error(" file [%s] does not exist or is not a file.", filepath)
 
16
  original load_textrev
17
  refer to load_paras.py
18
  """
19
+ from typing import Optional, Union # noqa
 
20
  from pathlib import Path
21
  import cchardet
22
 
 
25
  # from detect_file import detect_file
26
 
27
 
28
+ def loadtext(filepath: Union[Path, str] = "") -> str:
29
  """Load file context to text.
30
 
31
  Check encoding and load a file to text.
 
 
32
  """
 
 
 
 
 
 
 
 
 
 
 
33
  filepath = Path(filepath)
34
  if not filepath.is_file():
35
  logger.error(" file [%s] does not exist or is not a file.", filepath)
requirements.txt CHANGED
@@ -3,6 +3,7 @@
3
  # charset-normalizer
4
  # idna
5
  # typing-extensions
 
6
  sklearn
7
  textacy
8
  logzero
 
3
  # charset-normalizer
4
  # idna
5
  # typing-extensions
6
+ gradio
7
  sklearn
8
  textacy
9
  logzero
tests/test_en_zh_short.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Test loadtext."""
2
+ # pylint: disable=invalid-name
3
+ import pytest
4
+
5
+ from fastlid import fastlid
6
+
7
+ from radiobee.loadtext import loadtext
8
+ from radiobee.files2df import files2df
9
+ from radiobee.file2text import file2text
10
+ from radiobee.lists2cmat import lists2cmat
11
+ from radiobee.cmat2tset import cmat2tset
12
+ from radiobee.gen_pset import gen_pset
13
+
14
+ en = loadtext("data/en.txt")
15
+ zh = loadtext("data/zh.txt")
16
+ testen = loadtext("data/testen.txt")
17
+ testzh = loadtext("data/testzh.txt")
18
+
19
+
20
+ def test_en_zh_short1():
21
+ """Test en_zh_short."""
22
+ lst1 = [elm for elm in en.splitlines() if elm.strip()]
23
+ lst2 = [elm for elm in zh.splitlines() if elm.strip()]
24
+
25
+ lang1, _ = fastlid(en)
26
+ lang2, _ = fastlid(zh)
27
+
28
+ cmat0 = lists2cmat(lst1, lst2)
29
+ pset = gen_pset(cmat0)
30
+
31
+ assert pset.__len__() > 2
32
+
33
+
34
+ def test_en_zh_short2():
35
+ """Test en_zh_short testen testzh."""
36
+ # en = testen.copy()
37
+ # zh = testzh.copy()
38
+ lst1a = [elm for elm in testen.splitlines() if elm.strip()]
39
+ lst2a = [elm for elm in testzh.splitlines() if elm.strip()]
40
+
41
+ lang1a, _ = fastlid(testen)
42
+ lang2a, _ = fastlid(testzh)
43
+
44
+ cmat1 = lists2cmat(lst1a, lst2a)
45
+ pset = gen_pset(cmat1)
46
+
47
+ assert pset.__len__() > 2
48
+
49
+
50
+ _ = """
51
+ import matplotlib
52
+ import matplotlib.pyplot as plt
53
+ import numpy as np
54
+ import pandas as pd
55
+ import seaborn as sns
56
+
57
+ sns.set()
58
+ sns.set_style("darkgrid")
59
+ cmap = "viridis_r"
60
+ plt.ion()
61
+
62
+ eps = 6
63
+ min_samples = 10
64
+
65
+
66
+ tset = pd.DataFrame(cmat2tset(cmat))
67
+ tset.columns = ["x", "y", "cos"]
68
+
69
+ df_ = tset
70
+
71
+ # """
tests/test_files2df.py CHANGED
@@ -3,6 +3,7 @@ from pathlib import Path
3
  import tempfile
4
  from radiobee.files2df import files2df
5
 
 
6
  def test_files2df():
7
  """Test files2df with tests/test_en.txt tests/test_zh.txt."""
8
  file1_ = "tests/test_en.txt"
@@ -15,8 +16,12 @@ def test_files2df():
15
 
16
  df = files2df(file1, file2)
17
 
18
- assert df.iloc[1, 0] == "Wuthering Heights"
19
- assert df.iloc[1, 1] == "呼啸山庄"
 
 
 
 
20
 
21
 
22
  def test_files2df_file2none():
@@ -29,5 +34,9 @@ def test_files2df_file2none():
29
 
30
  df = files2df(file1, file2)
31
 
32
- assert df.iloc[1, 0] == "Wuthering Heights"
33
- assert df.iloc[1, 1] == ""
 
 
 
 
 
3
  import tempfile
4
  from radiobee.files2df import files2df
5
 
6
+
7
  def test_files2df():
8
  """Test files2df with tests/test_en.txt tests/test_zh.txt."""
9
  file1_ = "tests/test_en.txt"
 
16
 
17
  df = files2df(file1, file2)
18
 
19
+ # with filenames as first row
20
+ # assert df.iloc[1, 0] == "Wuthering Heights"
21
+ # assert df.iloc[1, 1] == "呼啸山庄"
22
+
23
+ assert df.iloc[0, 0] == "Wuthering Heights"
24
+ assert df.iloc[0, 1] == "呼啸山庄"
25
 
26
 
27
  def test_files2df_file2none():
 
34
 
35
  df = files2df(file1, file2)
36
 
37
+ # with filename as first row
38
+ # assert df.iloc[1, 0] == "Wuthering Heights"
39
+ # assert df.iloc[1, 1] == ""
40
+
41
+ assert df.iloc[0, 0] == "Wuthering Heights"
42
+ assert df.iloc[0, 1] == ""
tests/test_main.py CHANGED
@@ -18,6 +18,7 @@ file2loc = "data/test_en.txt"
18
  file1 = tempfile._TemporaryFileWrapper(open(file1loc, "rb"), file1loc)
19
  file2 = tempfile._TemporaryFileWrapper(open(file2loc, "rb"), file2loc)
20
 
 
21
  def test_file2file1():
22
  """Test cmat file2 file1."""
23
  # logger.info("file1: *%s*, file2: *%s*", file1, file2)
 
18
  file1 = tempfile._TemporaryFileWrapper(open(file1loc, "rb"), file1loc)
19
  file2 = tempfile._TemporaryFileWrapper(open(file2loc, "rb"), file2loc)
20
 
21
+
22
  def test_file2file1():
23
  """Test cmat file2 file1."""
24
  # logger.info("file1: *%s*, file2: *%s*", file1, file2)