freemt committed on
Commit
5a186f5
1 Parent(s): 3fae3e0

before sent-level

Browse files
radiobee/__main__.py CHANGED
@@ -4,6 +4,8 @@ from typing import Any, Tuple, Optional, Union # noqa
4
 
5
  import sys
6
  from pathlib import Path # noqa
 
 
7
  import platform
8
  import signal
9
  from random import randint
@@ -108,6 +110,12 @@ if __name__ == "__main__":
108
  debug = False
109
  debug = True
110
  share = True
 
 
 
 
 
 
111
  else:
112
  server_name = "127.0.0.1"
113
  share = False
 
4
 
5
  import sys
6
  from pathlib import Path # noqa
7
+ import subprocess as sp
8
+ import shlex
9
  import platform
10
  import signal
11
  from random import randint
 
110
  debug = False
111
  debug = True
112
  share = True
113
+
114
+ # set timezone to UTC+8; probably won't work in hf spaces (no permission to relink /etc/localtime)
115
+ try:
116
+ sp.check_output(shlex.split("ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime"))
117
+ except Exception as exc:
118
+ logger.error(" set timezonef failed: %s", exc)
119
  else:
120
  server_name = "127.0.0.1"
121
  share = False
radiobee/paras2sents.py DELETED
@@ -1,110 +0,0 @@
1
- """Convert paras to sents."""
2
- # pylint: disable=unused-import, too-many-branches, ungrouped-imports
3
-
4
- from typing import Callable, List, Optional, Tuple, Union
5
-
6
- from itertools import zip_longest
7
- import numpy as np
8
- import pandas as pd
9
- from logzero import logger
10
-
11
- from radiobee.align_sents import align_sents
12
- from radiobee.seg_text import seg_text
13
- from radiobee.detect import detect
14
-
15
- try:
16
- from radiobee.shuffle_sents import shuffle_sents
17
- except Exception as exc:
18
- logger.error("shuffle_sents not available: %s, using align_sents", exc)
19
- shuffle_sents = lambda x1, x2, lang1="", lang2="": align_sents(x1, x2) # noqa
20
-
21
-
22
- def paras2sents(
23
- paras_: Union[pd.DataFrame, List[Tuple[str, str, Union[str, float]]], np.ndarray],
24
- align_func: Optional[Union[Callable, str]] = None,
25
- lang1: Optional[str] = None,
26
- lang2: Optional[str] = None,
27
- ) -> List[Tuple[str, str, Union[str, float]]]:
28
- """Convert paras to sents using align_func.
29
-
30
- Args:
31
- paras_: list of 3-tuples or numpy or pd.DataFrame
32
- lang1: first lang code
33
- lang2: second lang code
34
- align_func: func used in the sent level
35
- if set to None, default to align_sents
36
- Returns:
37
- list of sents (possibly with likelihood for shuffle_sents)
38
- """
39
- # wrap everything in pd.DataFrame
40
- # necessary to make pyright happy
41
- paras = pd.DataFrame(paras_).fillna("")
42
-
43
- # take the first three columns at maximum
44
- paras = paras.iloc[:, :3]
45
-
46
- if len(paras.columns) < 2:
47
- logger.error(
48
- "Need at least two columns, got %s",
49
- len(paras.columns)
50
- )
51
- raise Exception("wrong data")
52
-
53
- # append the third col (all "") if there are only two cols
54
- if len(paras.columns) < 3:
55
- paras.insert(2, "likelihood", [""] * len(paras))
56
-
57
- if lang1 is None:
58
- lang1 = detect(" ".join(paras.iloc[:, 0]))
59
- if lang2 is None:
60
- lang2 = detect(" ".join(paras.iloc[:, 1]))
61
-
62
- left, right = [], []
63
- row0, row1 = [], []
64
- for elm0, elm1, elm2 in paras.values:
65
- sents0 = seg_text(elm0, lang1)
66
- sents1 = seg_text(elm1, lang2)
67
- if isinstance(elm2, float) and elm2 > 0:
68
- if row0 or row1:
69
- left.append(row0)
70
- right.append(row1)
71
- row0, row1 = [], [] # collect and prepare
72
-
73
- if sents0:
74
- left.append(sents0)
75
- if sents1:
76
- right.append(sents1)
77
- else:
78
- if sents0:
79
- row0.extend(sents0)
80
- if sents1:
81
- row1.extend(sents1)
82
- # collect possible last batch
83
- if row0 or row1:
84
- left.append(row0)
85
- right.append(row1)
86
-
87
- # res = [*zip(left, right)]
88
-
89
- # align each batch using align_func
90
-
91
- # ready align_func
92
- if align_func is None:
93
- align_func = align_sents
94
- if isinstance(align_func, str) and align_func.startswith("shuffle") or not isinstance(align_func, str) and align_func.__name__ in ["shuffle_sents"]:
95
- align_func = lambda row0, row1: shuffle_sents(row0, row1, lang1=lang1, lang2=lang2) # noqa
96
- else:
97
- align_func = align_sents
98
-
99
- res = []
100
- for row0, row1 in zip(left, right):
101
- try:
102
- _ = align_func(row0, row1)
103
- except Exception as exc:
104
- logger.error("errors: %s, resorting to zip_longest", exc)
105
- _ = [*zip_longest(row0, row1, fillvalue="")]
106
-
107
- # res.append(_)
108
- res.extend(_)
109
-
110
- return res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
radiobee/paras2sents.pyc ADDED
Binary file (2.57 kB). View file
 
radiobee/shuffle_sents.py DELETED
@@ -1,97 +0,0 @@
1
- """Shuffle sents."""
2
- # pylint: disable=unused-import, too-many-arguments, too-many-locals,
3
-
4
- from typing import List, Optional, Tuple, Union
5
-
6
- import pandas as pd
7
- from fastlid import fastlid
8
- from logzero import logger # noqa
9
-
10
- from radiobee.lists2cmat import lists2cmat
11
- from radiobee.gen_pset import gen_pset
12
- from radiobee.gen_aset import gen_aset
13
- from radiobee.align_texts import align_texts
14
-
15
-
16
- # fmt: off
17
- def shuffle_sents(
18
- lst1: List[str],
19
- lst2: List[str],
20
- eps: float = 6,
21
- min_samples: int = 4,
22
- tf_type: str = "linear",
23
- idf_type: Optional[str] = None,
24
- dl_type: Optional[str] = None,
25
- norm: Optional[str] = None,
26
- lang1: Optional[str] = None,
27
- lang2: Optional[str] = None,
28
- ) -> List[Tuple[str, str, Union[str, float]]]:
29
- # fmt: on
30
- """Shuffle sents to the right positions.
31
-
32
- Based on __main__.py.
33
-
34
- eps: float = 6
35
- min_samples: int = 4
36
- tf_type: str = "linear"
37
- idf_type: Optional[str] = None
38
- dl_type: Optional[str] = None
39
- norm: Optional[str] = None
40
- lang1: Optional[str] = "en"
41
- lang2: Optional[str] = "zh"
42
- """
43
- set_languages = fastlid.set_languages
44
- # fastlid.set_languages = ["en", "zh"]
45
- fastlid.set_languages = None
46
-
47
- if lang1 is None:
48
- lang1, _ = fastlid(" ".join(lst1))
49
- if lang2 is None:
50
- lang2, _ = fastlid(" ".join(lst2))
51
-
52
- # restore fastlid.set_languages
53
- fastlid.set_languages = set_languages
54
-
55
- lang_dicts = ["en", "zh"]
56
- if lang1 in lang_dicts and lang2 in lang_dicts:
57
- cmat = lists2cmat(
58
- lst1,
59
- lst2,
60
- tf_type=tf_type,
61
- idf_type=idf_type,
62
- dl_type=dl_type,
63
- norm=norm,
64
- lang1=lang1,
65
- lang2=lang2,
66
- )
67
- else: # use model_s
68
- from radiobee.model_s import model_s # pylint: disable=import-outside-toplevel
69
- vec1 = model_s.encode(lst1)
70
- vec2 = model_s.encode(lst2)
71
- # cmat = vec1.dot(vec2.T)
72
- cmat = vec2.dot(vec1.T)
73
-
74
- shuffle_sents.cmat = cmat
75
- shuffle_sents.lang1 = lang1
76
- shuffle_sents.lang2 = lang2
77
-
78
- pset = gen_pset(
79
- cmat,
80
- eps=eps,
81
- min_samples=min_samples,
82
- delta=7,
83
- )
84
-
85
- src_len, tgt_len = cmat.shape
86
- aset = gen_aset(pset, src_len, tgt_len)
87
-
88
- final_list = align_texts(aset, lst2, lst1)
89
-
90
- # return final_list
91
-
92
- # swap columns 0, 1
93
- _ = pd.DataFrame(final_list)
94
-
95
- _ = _.iloc[:, [1, 0] + [*range(2, _.shape[1])]]
96
-
97
- return _.to_numpy().tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
radiobee/shuffle_sents.pyc ADDED
Binary file (2.02 kB). View file
 
rsyn-to-radiobee-aligner.bat CHANGED
@@ -1 +1 @@
1
- rsync -uvazn ./ ..\radiobee-aligner/ --exclude-from=exclude-from
 
1
+ rsync ./ ../radiobee-aligner/ --exclude-from=exclude-from -uvazn