Spaces:

mikeee
/

radiobee-dev

Runtime error

App Files Files Community

freemt committed on Jan 20, 2022

Commit

5a186f5

•

1 Parent(s): 3fae3e0

before sent-level

Browse files

Files changed (6) hide show

radiobee/__main__.py +8 -0
radiobee/paras2sents.py +0 -110
radiobee/paras2sents.pyc +0 -0
radiobee/shuffle_sents.py +0 -97
radiobee/shuffle_sents.pyc +0 -0
rsyn-to-radiobee-aligner.bat +1 -1

radiobee/__main__.py CHANGED Viewed

@@ -4,6 +4,8 @@ from typing import Any, Tuple, Optional, Union  # noqa
 import sys
 from pathlib import Path  # noqa
 import platform
 import signal
 from random import randint
@@ -108,6 +110,12 @@ if __name__ == "__main__":
         debug = False
         debug = True
         share = True
     else:
         server_name = "127.0.0.1"
         share = False

 import sys
 from pathlib import Path  # noqa
+import subprocess as sp
+import shlex
 import platform
 import signal
 from random import randint
         debug = False
         debug = True
         share = True
+        # set UTC+8, probably wont work in hf spaces, no permission
+        try:
+            sp.check_output(shlex.split("ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime"))
+        except Exception as exc:
+            logger.error(" set timezonef failed: %s", exc)
     else:
         server_name = "127.0.0.1"
         share = False

radiobee/paras2sents.py DELETED Viewed

@@ -1,110 +0,0 @@
-"""Convert paras to sents."""
-# pylint: disable=unused-import, too-many-branches, ungrouped-imports
-from typing import Callable, List, Optional, Tuple, Union
-from itertools import zip_longest
-import numpy as np
-import pandas as pd
-from logzero import logger
-from radiobee.align_sents import align_sents
-from radiobee.seg_text import seg_text
-from radiobee.detect import detect
-try:
-    from radiobee.shuffle_sents import shuffle_sents
-except Exception as exc:
-    logger.error("shuffle_sents not available: %s, using align_sents", exc)
-    shuffle_sents = lambda x1, x2, lang1="", lang2="": align_sents(x1, x2)  # noqa
-def paras2sents(
-    paras_: Union[pd.DataFrame, List[Tuple[str, str, Union[str, float]]], np.ndarray],
-    align_func: Optional[Union[Callable, str]] = None,
-    lang1: Optional[str] = None,
-    lang2: Optional[str] = None,
-) -> List[Tuple[str, str, Union[str, float]]]:
-    """Convert paras to sents using align_func.
-    Args:
-        paras_: list of 3-tuples or numpy or pd.DataFrame
-        lang1: fisrt lang code
-        lang2: second lang code
-        align_func: func used in the sent level
-            if set to None, default to align_sents
-    Returns:
-        list of sents (possible with likelihood for shuffle_sents)
-    """
-    # wrap everything in pd.DataFrame
-    # necessary to make pyright happy
-    paras = pd.DataFrame(paras_).fillna("")
-    # take the first three columns at maximum
-    paras = paras.iloc[:, :3]
-    if len(paras.columns) < 2:
-        logger.error(
-            "Need at least two columns, got %s",
-            len(paras.columns)
-        )
-        raise Exception("wrong data")
-    # append the third col (all "") if there are only two cols
-    if len(paras.columns) < 3:
-        paras.insert(2, "likelihood", [""] * len(paras))
-    if lang1 is None:
-        lang1 = detect(" ".join(paras.iloc[:, 0]))
-    if lang2 is None:
-        lang2 = detect(" ".join(paras.iloc[:, 1]))
-    left, right = [], []
-    row0, row1 = [], []
-    for elm0, elm1, elm2 in paras.values:
-        sents0 = seg_text(elm0, lang1)
-        sents1 = seg_text(elm1, lang2)
-        if isinstance(elm2, float) and elm2 > 0:
-            if row0 or row1:
-                left.append(row0)
-                right.append(row1)
-            row0, row1 = [], []  # collect and prepare
-            if sents0:
-                left.append(sents0)
-            if sents1:
-                right.append(sents1)
-        else:
-            if sents0:
-                row0.extend(sents0)
-            if sents1:
-                row1.extend(sents1)
-    # collect possible last batch
-    if row0 or row1:
-        left.append(row0)
-        right.append(row1)
-    # res = [*zip(left, right)]
-    # align each batch using align_func
-    # ready align_func
-    if align_func is None:
-        align_func = align_sents
-    if isinstance(align_func, str) and align_func.startswith("shuffle") or not isinstance(align_func, str) and align_func.__name__ in ["shuffle_sents"]:
-        align_func = lambda row0, row1: shuffle_sents(row0, row1, lang1=lang1, lang2=lang2)  # noqa
-    else:
-        align_func = align_sents
-    res = []
-    for row0, row1 in zip(left, right):
-        try:
-            _ = align_func(row0, row1)
-        except Exception as exc:
-            logger.error("errors: %s, resorting to zip_longest", exc)
-            _ = [*zip_longest(row0, row1, fillvalue="")]
-        # res.append(_)
-        res.extend(_)
-    return res

radiobee/paras2sents.pyc ADDED Viewed

Binary file (2.57 kB). View file

radiobee/shuffle_sents.py DELETED Viewed

@@ -1,97 +0,0 @@
-"""Shuffle sents."""
-# pylint: disable=unused-import, too-many-arguments, too-many-locals,
-from typing import List, Optional, Tuple, Union
-import pandas as pd
-from fastlid import fastlid
-from logzero import logger  # noqa
-from radiobee.lists2cmat import lists2cmat
-from radiobee.gen_pset import gen_pset
-from radiobee.gen_aset import gen_aset
-from radiobee.align_texts import align_texts
-# fmt: off
-def shuffle_sents(
-        lst1: List[str],
-        lst2: List[str],
-        eps: float = 6,
-        min_samples: int = 4,
-        tf_type: str = "linear",
-        idf_type: Optional[str] = None,
-        dl_type: Optional[str] = None,
-        norm: Optional[str] = None,
-        lang1: Optional[str] = None,
-        lang2: Optional[str] = None,
-) -> List[Tuple[str, str, Union[str, float]]]:
-    # fmt: on
-    """Shuffle sents to the right positions.
-    Based on __main__.py.
-    eps: float = 6
-    min_samples: int = 4
-    tf_type: str = "linear"
-    idf_type: Optional[str] = None
-    dl_type: Optional[str] = None
-    norm: Optional[str] = None
-    lang1: Optional[str] = "en"
-    lang2: Optional[str] = "zh"
-    """
-    set_languages = fastlid.set_languages
-    # fastlid.set_languages = ["en", "zh"]
-    fastlid.set_languages = None
-    if lang1 is None:
-        lang1, _ = fastlid(" ".join(lst1))
-    if lang2 is None:
-        lang2, _ = fastlid(" ".join(lst2))
-    # restore fastlid.set_languages
-    fastlid.set_languages = set_languages
-    lang_dicts = ["en", "zh"]
-    if lang1 in lang_dicts and lang2 in lang_dicts:
-        cmat = lists2cmat(
-            lst1,
-            lst2,
-            tf_type=tf_type,
-            idf_type=idf_type,
-            dl_type=dl_type,
-            norm=norm,
-            lang1=lang1,
-            lang2=lang2,
-        )
-    else:  # use model_s
-        from radiobee.model_s import model_s  # pylint: disable=import-outside-toplevel
-        vec1 = model_s.encode(lst1)
-        vec2 = model_s.encode(lst2)
-        # cmat = vec1.dot(vec2.T)
-        cmat = vec2.dot(vec1.T)
-    shuffle_sents.cmat = cmat
-    shuffle_sents.lang1 = lang1
-    shuffle_sents.lang2 = lang2
-    pset = gen_pset(
-        cmat,
-        eps=eps,
-        min_samples=min_samples,
-        delta=7,
-    )
-    src_len, tgt_len = cmat.shape
-    aset = gen_aset(pset, src_len, tgt_len)
-    final_list = align_texts(aset, lst2, lst1)
-    # return final_list
-    # swap columns 0, 1
-    _ = pd.DataFrame(final_list)
-    _ = _.iloc[:, [1, 0] + [*range(2, _.shape[1])]]
-    return _.to_numpy().tolist()

radiobee/shuffle_sents.pyc ADDED Viewed

Binary file (2.02 kB). View file

rsyn-to-radiobee-aligner.bat CHANGED Viewed

	@@ -1 +1 @@
1	- rsync -uvazn ./ ..\radiobee-aligner/ --exclude-from=exclude-from


1	+ rsync ./ ../radiobee-aligner/ --exclude-from=exclude-from -uvazn