Spaces:

mikeee
/

radiobee-aligner

Build error

App Files Files Community

freemt committed on Dec 31, 2021

Commit

077e1eb

1 Parent(s): 03014a3

Update short text bug fix

Browse files

Files changed (12) hide show

data/en.txt +8 -0
data/testen.txt +5 -0
data/testzh.txt +5 -0
data/zh.txt +7 -0
radiobee/__main__.py +43 -13
radiobee/files2df.py +1 -0
radiobee/gen_pset.py +34 -1
radiobee/loadtext.py +2 -16
requirements.txt +1 -0
tests/test_en_zh_short.py +71 -0
tests/test_files2df.py +13 -4
tests/test_main.py +1 -0

data/en.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+[Young Warrior] Kingold(184283681) 2021-12-30 22:27:37
+It seems that the standalone version can
+omit the GUI and specify the two files to be aligned directly on the command line.
+But if it's not the GUI module that's taking up space, then
+ removing it won't help compress the size of the whole package.

data/testen.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Superfast Computer Chip Transmits Data with Light
+Computer chips have two important parts—the logic on the chip, which computes and executes programs. Then there’s the part that sends and receives—gets data to crunch, sends back the answer. And while that first part, chip logic, has gotten much faster over the years, the transmission part has lagged behind,because data gets sent via electrical signals passing through copper.
+So researchers designed a chip that exchanges data with light instead. "By going into optics, we're able to relieve this fundamental bottleneck of copper, and in doing so we're able to increase the bandwidth density on the chip, so how fast the chip can take data in and out, by an order of magnitude," said Chen Sun, a computer hardware researcher at UC Berkeley, and the startup Ayar Labs.
+A metal pin on the memory chip in your computer might transmit at 1.6 gigabits per second. Sun's optical connection ups that rate to 2.5 gigabits per second. Not a huge difference on the face of it. But the killer app here is that multiple wavelengths of light—up to 11—can be used simultaneously to send data through a single fiber, which means this technology has potential speeds of 27.5 gigabits per second—more than an order of magnitude faster than today’s standard. "So that's the extra dimension we have to scale bandwidth that we don't have with normal electrical signals." The findings appear in the journal Nature.
+These chips with optical connections are not just high-speed—they also require less energy than the copper versions. That could be a big deal, with server farms projected to outpace every other commercial use of electricity within the next decade. Going optical could thus be a win-win: faster processing using a fraction of the energy.

data/testzh.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+�����ټ�����⵼оƬ
+�����оƬ��������Ҫ���֣���һ��оƬ�ϵ��߼���·�����ڼ����ִ�г�������Ǵ�����·�����ڷ��ͺͽ������ݣ����������ݴ�����Ȼ�󷵻ؽ����ǰ����Щ�귢չʮ��Ѹ�٣����������ڳ������ݵĵ��ź���ͨ��ͭ�ߴ��͵ģ������չ��Ϊ�ͺ�
+�����о���Ա��������ù�������ݽ�����оƬ�������Ǽ��ݴ�ѧ��������У�͸մ������õİ��Ƕ�ʵ���ҵ�˫Ƹ�����Ӳ���о�Ա�� ��˵����ͨ�����ù�ѧ�����������ܽ��ͭ�ߴ�����е�ƿ�����⣬�������ǿ�������оƬ�Ŀ����ܶȣ���ˣ���������������ٶȻ�쵽����һ����������
+���˵����ڴ�оƬ�Ľ�����Ŵ����ٶ�Ϊ1.6������/�룬���ԵĹ⵼оƬ�ɽ���һ�ٶ�������2.5������/�롣����濴��û��̫��Ĳ��죬��������ɱ֮�����ڽ�һ�����˿�ͬʱ��������Ⲩ�������ɴ�11���������������ݡ���Ҳ����ζ���䴫���ٶȿɴﵽ27.5������/�룬��Ƚ���ı�׼����һ����������������˵�������������辶����չ����ͨ���ź��޷�ӵ�еĴ������� ���о���������ڡ���Ȼ����־�ϡ�
+�����ͭ�ߵ絼оƬ����Щ�⵼оƬ���������������ٶȿ죬�����ܺ�Ҳ�ϵ͡���δ��ʮ���ڣ�Ԥ�Ƶ�������Ⱥ�ĵ������ᳬ���κ�������ҵ�õ磬�����⣨һ�������£�������һ�������ش��Ͷ�ʡ� ��ѧ��Ӧ�ý���������һ��˫Ӯ�ľ���:�����ٶȵ���������Դ���Ľϵļ��١�

data/zh.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+【少侠】Kingold(184283681) 2021-12-30 22:27:37
+单机版貌似可以省略掉图形界面，直接
+命令行指定两个待对齐文件。
+不过如果占地方的
+不是图形界面的模块，那去掉了也对压缩整个包的大小没帮助。

radiobee/__main__.py CHANGED Viewed

@@ -1,16 +1,17 @@
 """Run interactively."""
 from typing import Tuple  # , Optional
 from pathlib import Path
-import joblib
 from random import randint
 from textwrap import dedent
 from itertools import zip_longest
-from sklearn.cluster import DBSCAN
 from socket import socket, AF_INET, SOCK_STREAM
-import signal
 from varname import nameof
 from logzero import logger
@@ -22,6 +23,8 @@ import matplotlib.pyplot as plt
 # from tabulate import tabulate
 from fastlid import fastlid
 import gradio as gr
 from radiobee.process_upload import process_upload
 from radiobee.files2df import files2df
@@ -43,6 +46,7 @@ print("Press Ctrl+C to quit\n")
 def savelzma(obj, fileloc: str = None):
     if fileloc is None:
         fileloc = nameof(obj)  # this wont work
     joblib.dump(obj, f"data/{fileloc}.lzma")
@@ -55,6 +59,7 @@ def greet(input):
 def upfile1(file1, file2=None) -> Tuple[str, str]:
     """Upload file1, file2."""
     return file1.name, f"'Sup yo! (your input: {input})"
@@ -245,6 +250,11 @@ if __name__ == "__main__":
         sns.set()
         sns.set_style("darkgrid")
         fig = plt.figure()
         gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
         ax2 = fig.add_subplot(gs[0, 0])
@@ -260,7 +270,8 @@ if __name__ == "__main__":
         fig.suptitle("alignment projection")
         _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
-        _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
         df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
@@ -309,12 +320,28 @@ if __name__ == "__main__":
         )
         # process lst1, lst2 to obtained df_aligned
-        pset = gen_pset(
-            cmat,
-            eps=eps,
-            min_samples=min_samples,
-            delta=7,
-        )
         src_len, tgt_len = cmat.shape
         aset = gen_aset(pset, src_len, tgt_len)
         final_list = align_texts(aset, lst2, lst1)  # note the order
@@ -360,7 +387,10 @@ if __name__ == "__main__":
         *   Click "Clear" first for subsequent submits when uploading files.
         *   `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
         *   Suggested `esp` and `min_samples` values -- `esp` (minimum epsilon): 8-12, `min_samples`: 4-8.
-           -   Smaller larger `esp` or `min_samples` will result in more aligned pairs but also more **false positives** (pairs falsely identified as candidates). On the other hand, larger smaller `esp` or `min_samples` values tend to miss 'good' pairs.
         *   If you need to have a better look at the image, you can right-click on the image and select copy-image-address and open a new tab in the browser with the copied image address.
         *   `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
     """

 """Run interactively."""
+# pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, wrong-import-position, too-many-locals, too-many-statements
 from typing import Tuple  # , Optional
+import sys
 from pathlib import Path
+import signal
 from random import randint
 from textwrap import dedent
 from itertools import zip_longest
 from socket import socket, AF_INET, SOCK_STREAM
+from sklearn.cluster import DBSCAN
+import joblib
 from varname import nameof
 from logzero import logger
 # from tabulate import tabulate
 from fastlid import fastlid
+if "." not in sys.path:
+    sys.path.insert(0, ".")
 import gradio as gr
 from radiobee.process_upload import process_upload
 from radiobee.files2df import files2df
 def savelzma(obj, fileloc: str = None):
+    """Aux function: dump obj with joblib to data/{fileloc}.lzma.
+    When fileloc is None, the file stem is taken from nameof(obj).
+    """
     if fileloc is None:
         fileloc = nameof(obj)  # author's note: this wont work (presumably yields the local name "obj" -- TODO confirm)
     joblib.dump(obj, f"data/{fileloc}.lzma")
 def upfile1(file1, file2=None) -> Tuple[str, str]:
     """Upload file1, file2."""
+    del file2  # file2 is accepted but deliberately discarded
+    # NOTE(review): no local or parameter named `input` is visible in this
+    # function, so the f-string presumably interpolates the builtin `input`
+    # rather than user data -- confirm the intended value.
     return file1.name, f"'Sup yo! (your input: {input})"
         sns.set()
         sns.set_style("darkgrid")
+        # close all existing figures, necessary for hf spaces
+        plt.close("all")
+        # if sys.platform not in ["win32", "linux"]:
+        plt.switch_backend('Agg')  # to cater for Mac, thanks to WhiteFox
         fig = plt.figure()
         gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
         ax2 = fig.add_subplot(gs[0, 0])
         fig.suptitle("alignment projection")
         _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
+        # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
+        _x = ~_
         df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
         )
         # process lst1, lst2 to obtained df_aligned
+        # quick fix ValueError: not enough values to unpack (expected at least 1, got 0)
+        # fixed in gen_pset, but we leave the loop here
+        for min_s in range(min_samples):
+            logger.info(" min_samples, try %s", min_samples - min_s)
+            try:
+                pset = gen_pset(
+                    cmat,
+                    eps=eps,
+                    min_samples=min_samples - min_s,
+                    delta=7,
+                )
+                break
+            except ValueError:
+                logger.info(" decrease min_samples by %s", min_s + 1)
+                continue
+            except Exception as e:
+                logger.error(e)
+                continue
+        else:
+            # break should happen above when min_samples = 2
+            raise Exception("bummer, this shouldn't happen, probably another bug")
         src_len, tgt_len = cmat.shape
         aset = gen_aset(pset, src_len, tgt_len)
         final_list = align_texts(aset, lst2, lst1)  # note the order
         *   Click "Clear" first for subsequent submits when uploading files.
         *   `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
         *   Suggested `esp` and `min_samples` values -- `esp` (minimum epsilon): 8-12, `min_samples`: 4-8.
+           -   Smaller larger `esp` or `min_samples` will result in more aligned pairs but also more **false positives** (pairs
+           falsely identified as candidates). On the other hand,
+           larger smaller `esp` or `min_samples` values tend to miss
+           'good' pairs.
         *   If you need to have a better look at the image, you can right-click on the image and select copy-image-address and open a new tab in the browser with the copied image address.
         *   `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
     """

radiobee/files2df.py CHANGED Viewed

@@ -25,6 +25,7 @@ def files2df(file1, file2):
     return df
 _ = """
     # return tabulate(df)
     # return tabulate(df, tablefmt="grid")

     return df
 _ = """
     # return tabulate(df)
     # return tabulate(df, tablefmt="grid")

radiobee/gen_pset.py CHANGED Viewed

@@ -13,7 +13,7 @@ from radiobee.cmat2tset import cmat2tset
 from radiobee.interpolate_pset import interpolate_pset
-def gen_pset(
     cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
     eps: float = 10,
     min_samples: int = 6,
@@ -139,3 +139,36 @@ def gen_pset(
     # return [(1, 1, "")]
     return [(int(elm0), int(elm1), elm2) for elm0, elm1, elm2 in buff]

 from radiobee.interpolate_pset import interpolate_pset
+def _gen_pset(
     cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
     eps: float = 10,
     min_samples: int = 6,
     # return [(1, 1, "")]
     return [(int(elm0), int(elm1), elm2) for elm0, elm1, elm2 in buff]
+def gen_pset(
+    cmat1: Union[List[List[float]], np.ndarray, pd.DataFrame],
+    eps: float = 10,
+    min_samples: int = 6,
+    delta: float = 7,
+    verbose: Union[bool, int] = False,
+) -> List[Tuple[Union[float, str], Union[float, str], Union[float, str]]]:
+    """Gen pset.
+    Refer to _gen_pset.
+    """
+    for min_s in range(min_samples):
+        logger.debug(" min_samples, try %s", min_samples - min_s)
+        try:
+            pset = _gen_pset(
+                cmat1,
+                eps=eps,
+                min_samples=min_samples - min_s,
+                delta=delta,
+            )
+            break
+        except ValueError:
+            logger.debug(" decrease min_samples by %s", min_s + 1)
+            continue
+        except Exception as e:
+            logger.error(e)
+            continue
+    else:
+        # break should happen above when min_samples = 2
+        raise Exception("bummer, this shouldn't happen, probably another bug")
+    return pset

radiobee/loadtext.py CHANGED Viewed

@@ -16,8 +16,7 @@ magic.from_file("testdata/test.pdf")
 original load_textrev
 refer to load_paras.py
 """
-from typing import Optional, Union
-import os
 from pathlib import Path
 import cchardet
@@ -26,24 +25,11 @@ from logzero import logger
 # from detect_file import detect_file
-def loadtext(filepath: Union[Path, str] = "") -> Optional[str]:
     """Load file context to text.
     Check encoding and load a file to text.
-    load_paras(filepath='') ==> paralist, lenlist =
     """
-    if not filepath:
-        defaultdir = r"C:\dl\Dropbox\shuangyu_ku\txt-books"
-        defaultfile = r"Folding_Beijing-en.txt"
-        # filepath = defaultfile
-        # defaultdir = r'C:\dl\Dropbox\mat-dir\snippets-mat\pyqt'
-        # defaultfile = r'notes pyqt tkinter tktable.txt'
-        filepath = os.path.join(defaultdir, defaultfile)
-        filepath = Path(defaultdir) / defaultfile
     filepath = Path(filepath)
     if not filepath.is_file():
         logger.error(" file [%s] does not exist or is not a file.", filepath)

 original load_textrev
 refer to load_paras.py
 """
+from typing import Optional, Union  # noqa
 from pathlib import Path
 import cchardet
 # from detect_file import detect_file
+def loadtext(filepath: Union[Path, str] = "") -> str:
     """Load file context to text.
     Check encoding and load a file to text.
     """
     filepath = Path(filepath)
     if not filepath.is_file():
         logger.error(" file [%s] does not exist or is not a file.", filepath)

requirements.txt CHANGED Viewed

@@ -3,6 +3,7 @@
 # charset-normalizer
 # idna
 # typing-extensions
 sklearn
 textacy
 logzero

 # charset-normalizer
 # idna
 # typing-extensions
+gradio
 sklearn
 textacy
 logzero

tests/test_en_zh_short.py ADDED Viewed

	@@ -0,0 +1,71 @@

+"""Test loadtext."""
+# pylint: disable=invalid-name
+import pytest
+from fastlid import fastlid
+from radiobee.loadtext import loadtext
+from radiobee.files2df import files2df
+from radiobee.file2text import file2text
+from radiobee.lists2cmat import lists2cmat
+from radiobee.cmat2tset import cmat2tset
+from radiobee.gen_pset import gen_pset
+en = loadtext("data/en.txt")
+zh = loadtext("data/zh.txt")
+testen = loadtext("data/testen.txt")
+testzh = loadtext("data/testzh.txt")
+def test_en_zh_short1():
+    """Test en_zh_short."""
+    lst1 = [elm for elm in en.splitlines() if elm.strip()]
+    lst2 = [elm for elm in zh.splitlines() if elm.strip()]
+    lang1, _ = fastlid(en)
+    lang2, _ = fastlid(zh)
+    cmat0 = lists2cmat(lst1, lst2)
+    pset = gen_pset(cmat0)
+    assert pset.__len__() > 2
+def test_en_zh_short2():
+    """Test en_zh_short testen testzh."""
+    # en = testen.copy()
+    # zh = testzh.copy()
+    lst1a = [elm for elm in testen.splitlines() if elm.strip()]
+    lst2a = [elm for elm in testzh.splitlines() if elm.strip()]
+    lang1a, _ = fastlid(testen)
+    lang2a, _ = fastlid(testzh)
+    cmat1 = lists2cmat(lst1a, lst2a)
+    pset = gen_pset(cmat1)
+    assert pset.__len__() > 2
+_ = """
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+sns.set()
+sns.set_style("darkgrid")
+cmap = "viridis_r"
+plt.ion()
+eps = 6
+min_samples = 10
+tset = pd.DataFrame(cmat2tset(cmat))
+tset.columns = ["x", "y", "cos"]
+df_ = tset
+# """

tests/test_files2df.py CHANGED Viewed

@@ -3,6 +3,7 @@ from pathlib import Path
 import tempfile
 from radiobee.files2df import files2df
 def test_files2df():
     """Test files2df with tests/test_en.txt tests/test_zh.txt."""
     file1_ = "tests/test_en.txt"
@@ -15,8 +16,12 @@ def test_files2df():
         df = files2df(file1, file2)
-    assert df.iloc[1, 0] == "Wuthering Heights"
-    assert df.iloc[1, 1] == "呼啸山庄"
 def test_files2df_file2none():
@@ -29,5 +34,9 @@ def test_files2df_file2none():
         df = files2df(file1, file2)
-    assert df.iloc[1, 0] == "Wuthering Heights"
-    assert df.iloc[1, 1] == ""

 import tempfile
 from radiobee.files2df import files2df
 def test_files2df():
     """Test files2df with tests/test_en.txt tests/test_zh.txt."""
     file1_ = "tests/test_en.txt"
         df = files2df(file1, file2)
+    # with filenames as first row
+    # assert df.iloc[1, 0] == "Wuthering Heights"
+    # assert df.iloc[1, 1] == "呼啸山庄"
+    assert df.iloc[0, 0] == "Wuthering Heights"
+    assert df.iloc[0, 1] == "呼啸山庄"
 def test_files2df_file2none():
         df = files2df(file1, file2)
+    # with filename as first row
+    # assert df.iloc[1, 0] == "Wuthering Heights"
+    # assert df.iloc[1, 1] == ""
+    assert df.iloc[0, 0] == "Wuthering Heights"
+    assert df.iloc[0, 1] == ""

tests/test_main.py CHANGED Viewed

@@ -18,6 +18,7 @@ file2loc = "data/test_en.txt"
 file1 = tempfile._TemporaryFileWrapper(open(file1loc, "rb"), file1loc)
 file2 = tempfile._TemporaryFileWrapper(open(file2loc, "rb"), file2loc)
 def test_file2file1():
     """Test cmat file2 file1."""
     # logger.info("file1: *%s*, file2: *%s*", file1, file2)

 file1 = tempfile._TemporaryFileWrapper(open(file1loc, "rb"), file1loc)
 file2 = tempfile._TemporaryFileWrapper(open(file2loc, "rb"), file2loc)
 def test_file2file1():
     """Test cmat file2 file1."""
     # logger.info("file1: *%s*, file2: *%s*", file1, file2)