m3hrdadfi committed on
Commit
7a6f591
1 Parent(s): 704394a

Update sync_streamlit_to_space.yml

Browse files
Files changed (10) hide show
  1. README.md +8 -24
  2. app.py +185 -0
  3. libs/__init__.py +0 -0
  4. libs/dummy.py +1179 -0
  5. libs/examples.py +40 -0
  6. libs/languages.py +237 -0
  7. libs/normalizer.py +86 -0
  8. libs/utils.py +25 -0
  9. meta.py +8 -0
  10. requirements.txt +5 -0
README.md CHANGED
@@ -1,33 +1,17 @@
1
  ---
2
  title: Zabanshenas
3
- emoji: 👁
4
- colorFrom: pink
5
- colorTo: pink
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
9
  ---
10
 
11
- # Configuration
12
 
13
- `title`: _string_
14
- Display title for the Space
15
 
16
- `emoji`: _string_
17
- Space emoji (emoji-only character allowed)
18
-
19
- `colorFrom`: _string_
20
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
21
-
22
- `colorTo`: _string_
23
- Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
24
-
25
- `sdk`: _string_
26
- Can be either `gradio` or `streamlit`
27
-
28
- `app_file`: _string_
29
- Path to your main application file (which contains either `gradio` or `streamlit` Python code).
30
- Path is relative to the root of the repository.
31
-
32
- `pinned`: _boolean_
33
- Whether the Space stays on top of your list.
1
  ---
2
  title: Zabanshenas
3
+ emoji: 🕵
4
+ colorFrom: blue
5
+ colorTo: blue
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
9
  ---
10
 
11
+ # Zabanshenas
12
 
13
+ A Transformer-based solution for identifying the most likely language of a written document/text.
14
+ **Zabanshenas** is a Persian word that has two meanings:
15
 
16
+ - A person who studies linguistics.
17
+ - A way to identify the type of written language.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from typing import Any, Dict, Optional
4
+ import numpy as np
5
+ import torch
6
+
7
+ from transformers import AutoTokenizer
8
+ from transformers import AutoModelForSequenceClassification
9
+
10
+ from libs.normalizer import Normalizer
11
+ from libs.languages import languages
12
+ from libs.examples import EXAMPLES
13
+ from libs.dummy import outputs as dummy_outputs
14
+ from libs.utils import plot_result
15
+
16
+ import meta
17
+
18
+
19
class Zabanshenas:
    """Language detector built on a sequence-classification checkpoint.

    The object is created cheaply by ``__init__``; call ``load`` to attach the
    normalizer, the language-name table and (outside debug mode) the tokenizer
    and model. ``detect`` then returns, for each input text, every label the
    model knows, sorted from most to least likely.
    """

    def __init__(
        self,
        model_name_or_path: str = "m3hrdadfi/zabanshenas-roberta-base-mix",
        by_gpu: bool = False,
        debug: bool = True
    ) -> None:
        """Store configuration; nothing is loaded until ``load`` is called.

        Args:
            model_name_or_path: Checkpoint identifier handed to
                ``from_pretrained`` for both tokenizer and model.
            by_gpu: Run on ``"cuda"`` when true, otherwise on ``"cpu"``.
            debug: When true (the previous hard-coded value, kept as the
                default), ``load`` skips the tokenizer/model and ``detect``
                returns the canned ``dummy_outputs`` without running a model.
        """
        self.debug = debug
        self.dummy_outputs = dummy_outputs
        self.device = torch.device("cuda" if by_gpu else "cpu")
        self.model_name_or_path = model_name_or_path

        # Populated by load().
        self.tokenizer = None
        self.model = None
        self.normalizer = None
        self.languages = None

        self.framework = "pt"  # passed to the tokenizer as return_tensors
        self.max_length = 512  # upper bound applied to any requested max_length
        self.top_k = 5  # read by the UI to decide how many results to show

    def load(self):
        """Attach the normalizer and language table, plus the model unless in debug mode."""
        if not self.debug:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name_or_path
            ).to(self.device)

        # Needed in debug mode too: the UI calls detector.normalizer directly.
        self.normalizer = Normalizer()
        self.languages = languages

    def ensure_tensor_on_device(self, **inputs):
        """Return ``inputs`` with every ``torch.Tensor`` value moved to ``self.device``.

        Values that are not tensors are passed through unchanged.
        """
        return {
            name: tensor.to(self.device) if isinstance(tensor, torch.Tensor) else tensor
            for name, tensor in inputs.items()
        }

    def _parse_and_tokenize(
        self,
        inputs,
        do_normalization: bool = True,
        max_length: int = 512,
        padding: bool = True,
        add_special_tokens: bool = True,
        truncation: bool = True,
    ):
        """Optionally normalize the texts, then tokenize them.

        Args:
            inputs: Iterable of texts.
            do_normalization: Apply ``self.normalizer`` to each text first.
                (This flag used to be accepted but ignored.)
            max_length: Requested token limit; capped at ``self.max_length``.
            padding, add_special_tokens, truncation: Forwarded to the tokenizer.

        Returns:
            Whatever ``self.tokenizer`` returns for the prepared texts.
        """
        if do_normalization:
            inputs = [self.normalizer(item) for item in inputs]
        else:
            # Keep handing the tokenizer a list, as the normalizing path does.
            inputs = list(inputs)

        max_length = min(max_length, self.max_length)
        return self.tokenizer(
            inputs,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            return_tensors=self.framework,
            padding=padding,
            truncation=truncation,
        )

    def _forward(
        self,
        inputs,
        return_tensors: bool = True
    ):
        """Run the model without gradients and return its first output on CPU.

        Args:
            inputs: Mapping of keyword arguments for the model.
            return_tensors: Return a tensor when true, a NumPy array otherwise.
        """
        with torch.no_grad():
            inputs = self.ensure_tensor_on_device(**inputs)
            predictions = self.model(**inputs)[0].cpu()

        if return_tensors:
            return predictions
        return predictions.numpy()

    def detect(
        self,
        texts,
        max_length: int = 128,
        do_normalization: bool = True
    ):
        """Score every known language for each text.

        Args:
            texts: A single text or a list of texts.
            max_length: Token limit forwarded to tokenization.
            do_normalization: Whether texts are normalized before tokenization.

        Returns:
            One list per text of ``{"language", "code", "score"}`` dicts sorted
            by descending score. In debug mode the canned ``dummy_outputs`` are
            returned instead and the arguments are not used.
        """
        if self.debug:
            return self.dummy_outputs

        if not isinstance(texts, list):
            texts = [texts]

        inputs = self._parse_and_tokenize(
            texts, do_normalization=do_normalization, max_length=max_length
        )
        logits = self._forward(inputs, return_tensors=False)

        # Softmax with the row maximum subtracted first: exp() of a large raw
        # logit overflows to inf and would turn the whole row into NaN.
        shifted = logits - logits.max(axis=-1, keepdims=True)
        exponentials = np.exp(shifted)
        scores = exponentials / exponentials.sum(axis=-1, keepdims=True)

        id2label = self.model.config.id2label
        results = []
        for row in scores:
            ranked = [
                {
                    "language": self.languages.get(id2label[i], None),
                    "code": id2label[i],
                    "score": score.item()
                }
                for i, score in enumerate(row)
            ]
            ranked.sort(key=lambda entry: entry["score"], reverse=True)
            results.append(ranked)

        return results
121
+
122
+
123
@st.cache(allow_output_mutation=True)
def load_language_detector():
    """Create a ``Zabanshenas`` detector with default settings and load it.

    The function is wrapped in ``st.cache(allow_output_mutation=True)``.
    """
    language_detector = Zabanshenas()
    language_detector.load()
    return language_detector
128
+
129
+
130
def main():
    """Render the Zabanshenas page and run language detection on request.

    The page shows project info next to an example picker and a text area.
    Pressing the button runs the detector on the (normalized) text and plots
    the ``detector.top_k`` most likely languages.
    """
    st.set_page_config(
        page_title="Zabanshenas",
        page_icon="🕵",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    detector = load_language_detector()

    # Newer Streamlit releases expose the column layout as `st.columns`; the
    # `beta_` name used originally is kept as a fallback for old installs.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2 = make_columns([6, 4])
    with col2:
        st.markdown(meta.INFO, unsafe_allow_html=True)

    with col1:
        prompts = list(EXAMPLES.keys()) + ["Custom"]
        prompt = st.selectbox(
            'Examples (select from this list)',
            prompts,
            index=0
        )

        # "Custom" starts from an empty box; any other entry pre-fills its text.
        prompt_box = "" if prompt == "Custom" else EXAMPLES[prompt]

        text = st.text_area(
            'Insert your text: ',
            detector.normalizer(prompt_box),
            height=200
        )
        text = detector.normalizer(text)
        # Placeholder that later carries the "text too short" hint.
        entered_text = st.empty()

    # NOTE(review): the source this was recovered from had lost its
    # indentation; the button, rule and results are assumed to sit outside
    # the column context. Confirm against the intended layout.
    detect_language = st.button('Detect Language !')

    st.markdown(
        "<hr />",
        unsafe_allow_html=True
    )
    if detect_language:
        words = text.split()
        with st.spinner("Detecting..."):
            # The hint asks for "at least three words", so exactly three must
            # pass (the previous `not len(words) > 3` rejected them).
            if len(words) < 3:
                entered_text.markdown(
                    "Insert your text (at least three words)"
                )
            else:
                # Let the detector's own token limit apply. The whitespace
                # word count was previously used as the token budget, which
                # truncates input because subword and special tokens
                # outnumber words.
                top_languages = detector.detect(text, max_length=detector.max_length)
                top_languages = top_languages[0][:detector.top_k]
                plot_result(top_languages)


if __name__ == '__main__':
    main()
libs/__init__.py ADDED
File without changes
libs/dummy.py ADDED
@@ -0,0 +1,1179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ outputs = [
2
+ [
3
+ {
4
+ "language": "Persian",
5
+ "code": "fas",
6
+ "score": 0.6105580925941467
7
+ },
8
+ {
9
+ "language": "Gilaki",
10
+ "code": "glk",
11
+ "score": 0.29982829093933105
12
+ },
13
+ {
14
+ "language": "Northern Luri",
15
+ "code": "lrc",
16
+ "score": 0.04840774089097977
17
+ },
18
+ {
19
+ "language": "Mazanderani",
20
+ "code": "mzn",
21
+ "score": 0.030142733827233315
22
+ },
23
+ {
24
+ "language": "South Azerbaijani",
25
+ "code": "azb",
26
+ "score": 0.005220199003815651
27
+ },
28
+ {
29
+ "language": "Urdu",
30
+ "code": "urd",
31
+ "score": 0.0019745035097002983
32
+ },
33
+ {
34
+ "language": "Pushto",
35
+ "code": "pus",
36
+ "score": 0.0015690263826400042
37
+ },
38
+ {
39
+ "language": "Western Panjabi",
40
+ "code": "pnb",
41
+ "score": 0.0005721596535295248
42
+ },
43
+ {
44
+ "language": "Central Kurdish",
45
+ "code": "ckb",
46
+ "score": 0.00025537016335874796
47
+ },
48
+ {
49
+ "language": "Sindhi",
50
+ "code": "snd",
51
+ "score": 0.0001820324978325516
52
+ },
53
+ {
54
+ "language": "Egyptian Arabic",
55
+ "code": "arz",
56
+ "score": 0.0001247940381290391
57
+ },
58
+ {
59
+ "language": "Arabic",
60
+ "code": "ara",
61
+ "score": 7.754910620860755e-05
62
+ },
63
+ {
64
+ "language": "Korean",
65
+ "code": "kor",
66
+ "score": 5.718228203477338e-05
67
+ },
68
+ {
69
+ "language": "Fiji Hindi",
70
+ "code": "hif",
71
+ "score": 3.5903740354115143e-05
72
+ },
73
+ {
74
+ "language": "Uighur",
75
+ "code": "uig",
76
+ "score": 3.5565532016335055e-05
77
+ },
78
+ {
79
+ "language": "Maori",
80
+ "code": "mri",
81
+ "score": 2.1078320060041733e-05
82
+ },
83
+ {
84
+ "language": "Literary Chinese",
85
+ "code": "lzh",
86
+ "score": 2.09943773370469e-05
87
+ },
88
+ {
89
+ "language": "Navajo",
90
+ "code": "nav",
91
+ "score": 1.8877935872296803e-05
92
+ },
93
+ {
94
+ "language": "Mongolian",
95
+ "code": "mon",
96
+ "score": 1.783044899639208e-05
97
+ },
98
+ {
99
+ "language": "Basque",
100
+ "code": "eus",
101
+ "score": 1.2980432074982673e-05
102
+ },
103
+ {
104
+ "language": "Moksha",
105
+ "code": "mdf",
106
+ "score": 1.2325609532126691e-05
107
+ },
108
+ {
109
+ "language": "Tongan",
110
+ "code": "ton",
111
+ "score": 1.1610675755946431e-05
112
+ },
113
+ {
114
+ "language": "Min Dong",
115
+ "code": "cdo",
116
+ "score": 1.1508132956805639e-05
117
+ },
118
+ {
119
+ "language": "Sinhala",
120
+ "code": "sin",
121
+ "score": 1.0617596672091167e-05
122
+ },
123
+ {
124
+ "language": "Venetian",
125
+ "code": "vec",
126
+ "score": 1.0375520105299074e-05
127
+ },
128
+ {
129
+ "language": "Western Mari",
130
+ "code": "mrj",
131
+ "score": 1.0316403859178536e-05
132
+ },
133
+ {
134
+ "language": "Malayalam",
135
+ "code": "mal",
136
+ "score": 1.0265099263051525e-05
137
+ },
138
+ {
139
+ "language": "Interlingua",
140
+ "code": "ina",
141
+ "score": 1.0040446795755997e-05
142
+ },
143
+ {
144
+ "language": "Tatar",
145
+ "code": "tat",
146
+ "score": 9.836200661084149e-06
147
+ },
148
+ {
149
+ "language": "Cantonese",
150
+ "code": "zh-yue",
151
+ "score": 9.80662207439309e-06
152
+ },
153
+ {
154
+ "language": "Wu Chinese",
155
+ "code": "wuu",
156
+ "score": 9.661145668360405e-06
157
+ },
158
+ {
159
+ "language": "Igbo",
160
+ "code": "ibo",
161
+ "score": 9.207592484017368e-06
162
+ },
163
+ {
164
+ "language": "Waray",
165
+ "code": "war",
166
+ "score": 8.970115231932141e-06
167
+ },
168
+ {
169
+ "language": "Yiddish",
170
+ "code": "yid",
171
+ "score": 8.926748705562204e-06
172
+ },
173
+ {
174
+ "language": "Udmurt",
175
+ "code": "udm",
176
+ "score": 8.702583727426827e-06
177
+ },
178
+ {
179
+ "language": "Dhivehi",
180
+ "code": "div",
181
+ "score": 8.36203707876848e-06
182
+ },
183
+ {
184
+ "language": "Newari",
185
+ "code": "new",
186
+ "score": 8.140945283230394e-06
187
+ },
188
+ {
189
+ "language": "Karachay-Balkar",
190
+ "code": "krc",
191
+ "score": 8.123539373627864e-06
192
+ },
193
+ {
194
+ "language": "Lojban",
195
+ "code": "jbo",
196
+ "score": 8.114019692584407e-06
197
+ },
198
+ {
199
+ "language": "Sanskrit",
200
+ "code": "san",
201
+ "score": 8.087784408417065e-06
202
+ },
203
+ {
204
+ "language": "Luganda",
205
+ "code": "lug",
206
+ "score": 8.023569534998387e-06
207
+ },
208
+ {
209
+ "language": "Maithili",
210
+ "code": "mai",
211
+ "score": 7.723083399469033e-06
212
+ },
213
+ {
214
+ "language": "Kirghiz",
215
+ "code": "kir",
216
+ "score": 7.715119863860309e-06
217
+ },
218
+ {
219
+ "language": "Standard Chinese",
220
+ "code": "zho",
221
+ "score": 7.5126054071006365e-06
222
+ },
223
+ {
224
+ "language": "Amharic",
225
+ "code": "amh",
226
+ "score": 7.451813871739432e-06
227
+ },
228
+ {
229
+ "language": "Chechen",
230
+ "code": "che",
231
+ "score": 7.444541097356705e-06
232
+ },
233
+ {
234
+ "language": "Gujarati",
235
+ "code": "guj",
236
+ "score": 7.395997727144277e-06
237
+ },
238
+ {
239
+ "language": "Tibetan",
240
+ "code": "bod",
241
+ "score": 7.390805421891855e-06
242
+ },
243
+ {
244
+ "language": "Komi",
245
+ "code": "kom",
246
+ "score": 7.373077551164897e-06
247
+ },
248
+ {
249
+ "language": "Lao",
250
+ "code": "lao",
251
+ "score": 7.351867679972202e-06
252
+ },
253
+ {
254
+ "language": "Wolof",
255
+ "code": "wol",
256
+ "score": 7.305452982109273e-06
257
+ },
258
+ {
259
+ "language": "Silesian",
260
+ "code": "szl",
261
+ "score": 7.301976893359097e-06
262
+ },
263
+ {
264
+ "language": "Northern Sotho",
265
+ "code": "nso",
266
+ "score": 7.2927336987049785e-06
267
+ },
268
+ {
269
+ "language": "Armenian",
270
+ "code": "hye",
271
+ "score": 7.243447726068553e-06
272
+ },
273
+ {
274
+ "language": "Arpitan",
275
+ "code": "frp",
276
+ "score": 7.137540251278551e-06
277
+ },
278
+ {
279
+ "language": "Bishnupriya",
280
+ "code": "bpy",
281
+ "score": 7.062033091642661e-06
282
+ },
283
+ {
284
+ "language": "Azerbaijani",
285
+ "code": "aze",
286
+ "score": 6.906778253323864e-06
287
+ },
288
+ {
289
+ "language": "Tajik",
290
+ "code": "tgk",
291
+ "score": 6.730050699843559e-06
292
+ },
293
+ {
294
+ "language": "Old English ",
295
+ "code": "ang",
296
+ "score": 6.6442084971640725e-06
297
+ },
298
+ {
299
+ "language": "Marathi",
300
+ "code": "mar",
301
+ "score": 6.63194168737391e-06
302
+ },
303
+ {
304
+ "language": "Kurdish",
305
+ "code": "kur",
306
+ "score": 6.615779057028703e-06
307
+ },
308
+ {
309
+ "language": "Lithuanian",
310
+ "code": "lit",
311
+ "score": 6.561998816323467e-06
312
+ },
313
+ {
314
+ "language": "Russian",
315
+ "code": "rus",
316
+ "score": 6.4370215113740414e-06
317
+ },
318
+ {
319
+ "language": "Tulu",
320
+ "code": "tcy",
321
+ "score": 6.370255960064242e-06
322
+ },
323
+ {
324
+ "language": "Extremaduran",
325
+ "code": "ext",
326
+ "score": 6.3398160818906035e-06
327
+ },
328
+ {
329
+ "language": "Aymara",
330
+ "code": "aym",
331
+ "score": 6.288398708420573e-06
332
+ },
333
+ {
334
+ "language": "Lower Sorbian",
335
+ "code": "dsb",
336
+ "score": 6.209619641595054e-06
337
+ },
338
+ {
339
+ "language": "Classical Nahuatl",
340
+ "code": "nci",
341
+ "score": 5.954705557087436e-06
342
+ },
343
+ {
344
+ "language": "Polish",
345
+ "code": "pol",
346
+ "score": 5.952156243438367e-06
347
+ },
348
+ {
349
+ "language": "Cebuano",
350
+ "code": "ceb",
351
+ "score": 5.911888820264721e-06
352
+ },
353
+ {
354
+ "language": "Hakka Chinese",
355
+ "code": "hak",
356
+ "score": 5.756284735980444e-06
357
+ },
358
+ {
359
+ "language": "Georgian",
360
+ "code": "kat",
361
+ "score": 5.656391749653267e-06
362
+ },
363
+ {
364
+ "language": "Mingrelian",
365
+ "code": "xmf",
366
+ "score": 5.57373004994588e-06
367
+ },
368
+ {
369
+ "language": "Telugu",
370
+ "code": "tel",
371
+ "score": 5.5334053286060225e-06
372
+ },
373
+ {
374
+ "language": "Doteli",
375
+ "code": "dty",
376
+ "score": 5.510717073775595e-06
377
+ },
378
+ {
379
+ "language": "Portuguese",
380
+ "code": "por",
381
+ "score": 5.50901131646242e-06
382
+ },
383
+ {
384
+ "language": "Komi-Permyak",
385
+ "code": "koi",
386
+ "score": 5.447328476293478e-06
387
+ },
388
+ {
389
+ "language": "Eastern Mari",
390
+ "code": "mhr",
391
+ "score": 5.414771294454113e-06
392
+ },
393
+ {
394
+ "language": "Lezghian",
395
+ "code": "lez",
396
+ "score": 5.2741329454875086e-06
397
+ },
398
+ {
399
+ "language": "Nepali (macrolanguage)",
400
+ "code": "nep",
401
+ "score": 5.273408532957546e-06
402
+ },
403
+ {
404
+ "language": "Samogitian",
405
+ "code": "sgs",
406
+ "score": 5.207636149862083e-06
407
+ },
408
+ {
409
+ "language": "Bhojpuri",
410
+ "code": "bho",
411
+ "score": 5.19551804245566e-06
412
+ },
413
+ {
414
+ "language": "Occitan",
415
+ "code": "oci",
416
+ "score": 5.172901182959322e-06
417
+ },
418
+ {
419
+ "language": "Western Frisian",
420
+ "code": "fry",
421
+ "score": 5.066170615464216e-06
422
+ },
423
+ {
424
+ "language": "Vlaams",
425
+ "code": "vls",
426
+ "score": 5.014707312511746e-06
427
+ },
428
+ {
429
+ "language": "Japanese",
430
+ "code": "jpn",
431
+ "score": 4.986791282135528e-06
432
+ },
433
+ {
434
+ "language": "V\u00f5ro",
435
+ "code": "vro",
436
+ "score": 4.9785726332629565e-06
437
+ },
438
+ {
439
+ "language": "Rusyn",
440
+ "code": "rue",
441
+ "score": 4.937043286190601e-06
442
+ },
443
+ {
444
+ "language": "Hindi",
445
+ "code": "hin",
446
+ "score": 4.9325194595439825e-06
447
+ },
448
+ {
449
+ "language": "Sicilian",
450
+ "code": "scn",
451
+ "score": 4.8434171731059905e-06
452
+ },
453
+ {
454
+ "language": "Somali",
455
+ "code": "som",
456
+ "score": 4.722482117358595e-06
457
+ },
458
+ {
459
+ "language": "Galician",
460
+ "code": "glg",
461
+ "score": 4.664954758482054e-06
462
+ },
463
+ {
464
+ "language": "Kazakh",
465
+ "code": "kaz",
466
+ "score": 4.485120825847844e-06
467
+ },
468
+ {
469
+ "language": "Kannada",
470
+ "code": "kan",
471
+ "score": 4.438274572748924e-06
472
+ },
473
+ {
474
+ "language": "Oromo",
475
+ "code": "orm",
476
+ "score": 4.422903202794259e-06
477
+ },
478
+ {
479
+ "language": "Albanian",
480
+ "code": "sqi",
481
+ "score": 4.410150268085999e-06
482
+ },
483
+ {
484
+ "language": "Minangkabau",
485
+ "code": "min",
486
+ "score": 4.407007509144023e-06
487
+ },
488
+ {
489
+ "language": "Finnish",
490
+ "code": "fin",
491
+ "score": 4.374884611024754e-06
492
+ },
493
+ {
494
+ "language": "Ossetian",
495
+ "code": "oss",
496
+ "score": 4.322507265897002e-06
497
+ },
498
+ {
499
+ "language": "Volap\u00fck",
500
+ "code": "vol",
501
+ "score": 4.30220188718522e-06
502
+ },
503
+ {
504
+ "language": "Min Nan Chinese",
505
+ "code": "nan",
506
+ "score": 4.2357942220405675e-06
507
+ },
508
+ {
509
+ "language": "Bashkir",
510
+ "code": "bak",
511
+ "score": 4.212616204313235e-06
512
+ },
513
+ {
514
+ "language": "Ligurian",
515
+ "code": "lij",
516
+ "score": 4.1821313061518595e-06
517
+ },
518
+ {
519
+ "language": "Welsh",
520
+ "code": "cym",
521
+ "score": 4.174029982095817e-06
522
+ },
523
+ {
524
+ "language": "Slovene",
525
+ "code": "slv",
526
+ "score": 4.172954504610971e-06
527
+ },
528
+ {
529
+ "language": "Dimli",
530
+ "code": "diq",
531
+ "score": 4.078176516486565e-06
532
+ },
533
+ {
534
+ "language": "Chuvash",
535
+ "code": "chv",
536
+ "score": 4.048466053063748e-06
537
+ },
538
+ {
539
+ "language": "Panjabi",
540
+ "code": "pan",
541
+ "score": 3.940522674383828e-06
542
+ },
543
+ {
544
+ "language": "Cornish",
545
+ "code": "cor",
546
+ "score": 3.940297119697789e-06
547
+ },
548
+ {
549
+ "language": "West Low German",
550
+ "code": "nds-nl",
551
+ "score": 3.926987574232044e-06
552
+ },
553
+ {
554
+ "language": "Cherokee",
555
+ "code": "chr",
556
+ "score": 3.9112833292165305e-06
557
+ },
558
+ {
559
+ "language": "Ido",
560
+ "code": "ido",
561
+ "score": 3.892145286954474e-06
562
+ },
563
+ {
564
+ "language": "Friulian",
565
+ "code": "fur",
566
+ "score": 3.869370175380027e-06
567
+ },
568
+ {
569
+ "language": "Ukrainian",
570
+ "code": "ukr",
571
+ "score": 3.7814761526533403e-06
572
+ },
573
+ {
574
+ "language": "Vietnamese",
575
+ "code": "vie",
576
+ "score": 3.7795757634739857e-06
577
+ },
578
+ {
579
+ "language": "Emilian",
580
+ "code": "egl",
581
+ "score": 3.7286854421836324e-06
582
+ },
583
+ {
584
+ "language": "Hungarian",
585
+ "code": "hun",
586
+ "score": 3.706084498844575e-06
587
+ },
588
+ {
589
+ "language": "Haitian Creole",
590
+ "code": "hat",
591
+ "score": 3.6860656109638512e-06
592
+ },
593
+ {
594
+ "language": "Jamaican Patois",
595
+ "code": "jam",
596
+ "score": 3.6750652725459076e-06
597
+ },
598
+ {
599
+ "language": "Turkmen",
600
+ "code": "tuk",
601
+ "score": 3.6414037367649144e-06
602
+ },
603
+ {
604
+ "language": "Gagauz",
605
+ "code": "gag",
606
+ "score": 3.6310443647380453e-06
607
+ },
608
+ {
609
+ "language": "Yakut",
610
+ "code": "sah",
611
+ "score": 3.611620968513307e-06
612
+ },
613
+ {
614
+ "language": "Breton",
615
+ "code": "bre",
616
+ "score": 3.5204120649723336e-06
617
+ },
618
+ {
619
+ "language": "Afrikaans",
620
+ "code": "afr",
621
+ "score": 3.5164177916158224e-06
622
+ },
623
+ {
624
+ "language": "Assamese",
625
+ "code": "asm",
626
+ "score": 3.5076063795713708e-06
627
+ },
628
+ {
629
+ "language": "Crimean Tatar",
630
+ "code": "crh",
631
+ "score": 3.4974791560671292e-06
632
+ },
633
+ {
634
+ "language": "Tswana",
635
+ "code": "tsn",
636
+ "score": 3.4639840578165604e-06
637
+ },
638
+ {
639
+ "language": "Malagasy",
640
+ "code": "mlg",
641
+ "score": 3.4424308523739455e-06
642
+ },
643
+ {
644
+ "language": "Tamil",
645
+ "code": "tam",
646
+ "score": 3.433554866205668e-06
647
+ },
648
+ {
649
+ "language": "Belarusian (Taraschkewiza)",
650
+ "code": "be-tarask",
651
+ "score": 3.4065565159835387e-06
652
+ },
653
+ {
654
+ "language": "Scottish Gaelic",
655
+ "code": "gla",
656
+ "score": 3.383374632903724e-06
657
+ },
658
+ {
659
+ "language": "Latin",
660
+ "code": "lat",
661
+ "score": 3.299320724181598e-06
662
+ },
663
+ {
664
+ "language": "Chavacano",
665
+ "code": "cbk",
666
+ "score": 3.277132236689795e-06
667
+ },
668
+ {
669
+ "language": "Tarantino dialect",
670
+ "code": "roa-tara",
671
+ "score": 3.2704483601264656e-06
672
+ },
673
+ {
674
+ "language": "Modern Greek",
675
+ "code": "ell",
676
+ "score": 3.2669522624928504e-06
677
+ },
678
+ {
679
+ "language": "Ladino",
680
+ "code": "lad",
681
+ "score": 3.1890219815977616e-06
682
+ },
683
+ {
684
+ "language": "Latgalian",
685
+ "code": "ltg",
686
+ "score": 3.1830948046263075e-06
687
+ },
688
+ {
689
+ "language": "Pampanga",
690
+ "code": "pam",
691
+ "score": 3.1460281206818763e-06
692
+ },
693
+ {
694
+ "language": "Tagalog",
695
+ "code": "tgl",
696
+ "score": 3.100457433902193e-06
697
+ },
698
+ {
699
+ "language": "Hebrew",
700
+ "code": "heb",
701
+ "score": 3.0715009415871464e-06
702
+ },
703
+ {
704
+ "language": "Serbo-Croatian",
705
+ "code": "hbs",
706
+ "score": 3.050950908800587e-06
707
+ },
708
+ {
709
+ "language": "Achinese",
710
+ "code": "ace",
711
+ "score": 3.0138855890982086e-06
712
+ },
713
+ {
714
+ "language": "Italian",
715
+ "code": "ita",
716
+ "score": 3.003329993589432e-06
717
+ },
718
+ {
719
+ "language": "English",
720
+ "code": "eng",
721
+ "score": 2.97778979074792e-06
722
+ },
723
+ {
724
+ "language": "Burmese",
725
+ "code": "mya",
726
+ "score": 2.9546490623033606e-06
727
+ },
728
+ {
729
+ "language": "Spanish",
730
+ "code": "spa",
731
+ "score": 2.9272057417983888e-06
732
+ },
733
+ {
734
+ "language": "Papiamento",
735
+ "code": "pap",
736
+ "score": 2.8780641514458694e-06
737
+ },
738
+ {
739
+ "language": "Sardinian",
740
+ "code": "srd",
741
+ "score": 2.866505383281037e-06
742
+ },
743
+ {
744
+ "language": "Esperanto",
745
+ "code": "epo",
746
+ "score": 2.848199301297427e-06
747
+ },
748
+ {
749
+ "language": "Serbian",
750
+ "code": "srp",
751
+ "score": 2.7479175059852423e-06
752
+ },
753
+ {
754
+ "language": "Zeeuws",
755
+ "code": "zea",
756
+ "score": 2.7430314730736427e-06
757
+ },
758
+ {
759
+ "language": "Czech",
760
+ "code": "ces",
761
+ "score": 2.7409500944486354e-06
762
+ },
763
+ {
764
+ "language": "Bengali",
765
+ "code": "ben",
766
+ "score": 2.6958239232044434e-06
767
+ },
768
+ {
769
+ "language": "Erzya",
770
+ "code": "myv",
771
+ "score": 2.6273187359038275e-06
772
+ },
773
+ {
774
+ "language": "Croatian",
775
+ "code": "hrv",
776
+ "score": 2.6178654479735997e-06
777
+ },
778
+ {
779
+ "language": "Buryat",
780
+ "code": "bxr",
781
+ "score": 2.60430465459649e-06
782
+ },
783
+ {
784
+ "language": "Swahili (macrolanguage)",
785
+ "code": "swa",
786
+ "score": 2.6016373340098653e-06
787
+ },
788
+ {
789
+ "language": "Pangasinan",
790
+ "code": "pag",
791
+ "score": 2.60037768384791e-06
792
+ },
793
+ {
794
+ "language": "Xhosa",
795
+ "code": "xho",
796
+ "score": 2.580123918960453e-06
797
+ },
798
+ {
799
+ "language": "Bosnian",
800
+ "code": "bos",
801
+ "score": 2.5763115445442963e-06
802
+ },
803
+ {
804
+ "language": "Low German",
805
+ "code": "nds",
806
+ "score": 2.5743340756889665e-06
807
+ },
808
+ {
809
+ "language": "Kinyarwanda",
810
+ "code": "kin",
811
+ "score": 2.568235458966228e-06
812
+ },
813
+ {
814
+ "language": "Aromanian",
815
+ "code": "rup",
816
+ "score": 2.520287125662435e-06
817
+ },
818
+ {
819
+ "language": "Aragonese",
820
+ "code": "arg",
821
+ "score": 2.4836215288814856e-06
822
+ },
823
+ {
824
+ "language": "Tetum",
825
+ "code": "tet",
826
+ "score": 2.396502168267034e-06
827
+ },
828
+ {
829
+ "language": "Quechua",
830
+ "code": "que",
831
+ "score": 2.3799134396540467e-06
832
+ },
833
+ {
834
+ "language": "Livvi-Karelian",
835
+ "code": "olo",
836
+ "score": 2.3709426386631094e-06
837
+ },
838
+ {
839
+ "language": "Kashubian",
840
+ "code": "csb",
841
+ "score": 2.358733354412834e-06
842
+ },
843
+ {
844
+ "language": "Avar",
845
+ "code": "ava",
846
+ "score": 2.330698407604359e-06
847
+ },
848
+ {
849
+ "language": "Hausa",
850
+ "code": "hau",
851
+ "score": 2.286114295202424e-06
852
+ },
853
+ {
854
+ "language": "Ripuarisch",
855
+ "code": "ksh",
856
+ "score": 2.254129412904149e-06
857
+ },
858
+ {
859
+ "language": "Bulgarian",
860
+ "code": "bul",
861
+ "score": 2.2492179141408997e-06
862
+ },
863
+ {
864
+ "language": "Oriya",
865
+ "code": "ori",
866
+ "score": 2.1661755909008207e-06
867
+ },
868
+ {
869
+ "language": "Interlingue",
870
+ "code": "ile",
871
+ "score": 2.059975486190524e-06
872
+ },
873
+ {
874
+ "language": "Guarani",
875
+ "code": "grn",
876
+ "score": 2.024690957114217e-06
877
+ },
878
+ {
879
+ "language": "Banjar",
880
+ "code": "bjn",
881
+ "score": 2.0237362150510307e-06
882
+ },
883
+ {
884
+ "language": "Thai",
885
+ "code": "tha",
886
+ "score": 2.01868806470884e-06
887
+ },
888
+ {
889
+ "language": "Dutch",
890
+ "code": "nld",
891
+ "score": 1.9297158360132016e-06
892
+ },
893
+ {
894
+ "language": "Kabyle",
895
+ "code": "kab",
896
+ "score": 1.9132662600895856e-06
897
+ },
898
+ {
899
+ "language": "Palatine German",
900
+ "code": "pfl",
901
+ "score": 1.9122355752188014e-06
902
+ },
903
+ {
904
+ "language": "Javanese",
905
+ "code": "jav",
906
+ "score": 1.8900879013017402e-06
907
+ },
908
+ {
909
+ "language": "Banyumasan",
910
+ "code": "map-bms",
911
+ "score": 1.8552185565567925e-06
912
+ },
913
+ {
914
+ "language": "Faroese",
915
+ "code": "fao",
916
+ "score": 1.8414674514133367e-06
917
+ },
918
+ {
919
+ "language": "Scots",
920
+ "code": "sco",
921
+ "score": 1.818199393710529e-06
922
+ },
923
+ {
924
+ "language": "Central Khmer",
925
+ "code": "khm",
926
+ "score": 1.7993022538576042e-06
927
+ },
928
+ {
929
+ "language": "Slovak",
930
+ "code": "slk",
931
+ "score": 1.7988603531193803e-06
932
+ },
933
+ {
934
+ "language": "Belarusian",
935
+ "code": "bel",
936
+ "score": 1.782583581189101e-06
937
+ },
938
+ {
939
+ "language": "Swedish",
940
+ "code": "swe",
941
+ "score": 1.7702136574371252e-06
942
+ },
943
+ {
944
+ "language": "Saterfriesisch",
945
+ "code": "stq",
946
+ "score": 1.7663436437942437e-06
947
+ },
948
+ {
949
+ "language": "Latvian",
950
+ "code": "lav",
951
+ "score": 1.7178032294395962e-06
952
+ },
953
+ {
954
+ "language": "Konkani",
955
+ "code": "kok",
956
+ "score": 1.690383783170546e-06
957
+ },
958
+ {
959
+ "language": "Tuvan",
960
+ "code": "tyv",
961
+ "score": 1.672853159107035e-06
962
+ },
963
+ {
964
+ "language": "Walloon",
965
+ "code": "wln",
966
+ "score": 1.6722132158975e-06
967
+ },
968
+ {
969
+ "language": "Sranan",
970
+ "code": "srn",
971
+ "score": 1.646132773203135e-06
972
+ },
973
+ {
974
+ "language": "Picard",
975
+ "code": "pcd",
976
+ "score": 1.6385885146519286e-06
977
+ },
978
+ {
979
+ "language": "Limburgan",
980
+ "code": "lim",
981
+ "score": 1.6372666777897393e-06
982
+ },
983
+ {
984
+ "language": "French",
985
+ "code": "fra",
986
+ "score": 1.6239549722740776e-06
987
+ },
988
+ {
989
+ "language": "Icelandic",
990
+ "code": "isl",
991
+ "score": 1.5904075780781568e-06
992
+ },
993
+ {
994
+ "language": "Irish",
995
+ "code": "gle",
996
+ "score": 1.5750525790281245e-06
997
+ },
998
+ {
999
+ "language": "Corsican",
1000
+ "code": "cos",
1001
+ "score": 1.570832523611898e-06
1002
+ },
1003
+ {
1004
+ "language": "Alemannic German",
1005
+ "code": "als",
1006
+ "score": 1.5651218063794659e-06
1007
+ },
1008
+ {
1009
+ "language": "German",
1010
+ "code": "deu",
1011
+ "score": 1.5594737305946182e-06
1012
+ },
1013
+ {
1014
+ "language": "Upper Sorbian",
1015
+ "code": "hsb",
1016
+ "score": 1.5125158370210556e-06
1017
+ },
1018
+ {
1019
+ "language": "Romanian",
1020
+ "code": "ron",
1021
+ "score": 1.5119784393391456e-06
1022
+ },
1023
+ {
1024
+ "language": "Manx",
1025
+ "code": "glv",
1026
+ "score": 1.5035052456369158e-06
1027
+ },
1028
+ {
1029
+ "language": "Lingala",
1030
+ "code": "lin",
1031
+ "score": 1.493238073635439e-06
1032
+ },
1033
+ {
1034
+ "language": "Malay",
1035
+ "code": "msa",
1036
+ "score": 1.4067626352698426e-06
1037
+ },
1038
+ {
1039
+ "language": "Maltese",
1040
+ "code": "mlt",
1041
+ "score": 1.370485165352875e-06
1042
+ },
1043
+ {
1044
+ "language": "Luxembourgish",
1045
+ "code": "ltz",
1046
+ "score": 1.3397349221122568e-06
1047
+ },
1048
+ {
1049
+ "language": "Estonian",
1050
+ "code": "est",
1051
+ "score": 1.3280839539220324e-06
1052
+ },
1053
+ {
1054
+ "language": "Kabardian",
1055
+ "code": "kbd",
1056
+ "score": 1.3062604011793155e-06
1057
+ },
1058
+ {
1059
+ "language": "Macedonian",
1060
+ "code": "mkd",
1061
+ "score": 1.2802570381609257e-06
1062
+ },
1063
+ {
1064
+ "language": "Pennsylvania German",
1065
+ "code": "pdc",
1066
+ "score": 1.2550040082714986e-06
1067
+ },
1068
+ {
1069
+ "language": "Sundanese",
1070
+ "code": "sun",
1071
+ "score": 1.1068191270169336e-06
1072
+ },
1073
+ {
1074
+ "language": "Iloko",
1075
+ "code": "ilo",
1076
+ "score": 1.0791690101541462e-06
1077
+ },
1078
+ {
1079
+ "language": "Karakalpak",
1080
+ "code": "kaa",
1081
+ "score": 1.0603262126096524e-06
1082
+ },
1083
+ {
1084
+ "language": "Norwegian Nynorsk",
1085
+ "code": "nno",
1086
+ "score": 1.0554679192864569e-06
1087
+ },
1088
+ {
1089
+ "language": "Yoruba",
1090
+ "code": "yor",
1091
+ "score": 1.046297711582156e-06
1092
+ },
1093
+ {
1094
+ "language": "Neapolitan",
1095
+ "code": "nap",
1096
+ "score": 1.0279602520313347e-06
1097
+ },
1098
+ {
1099
+ "language": "Danish",
1100
+ "code": "dan",
1101
+ "score": 1.0038916116172913e-06
1102
+ },
1103
+ {
1104
+ "language": "Indonesian",
1105
+ "code": "ind",
1106
+ "score": 9.83746303973021e-07
1107
+ },
1108
+ {
1109
+ "language": "Mirandese",
1110
+ "code": "mwl",
1111
+ "score": 8.806521236692788e-07
1112
+ },
1113
+ {
1114
+ "language": "Catalan",
1115
+ "code": "cat",
1116
+ "score": 8.687447348165733e-07
1117
+ },
1118
+ {
1119
+ "language": "Turkish",
1120
+ "code": "tur",
1121
+ "score": 8.384120064874878e-07
1122
+ },
1123
+ {
1124
+ "language": "Veps",
1125
+ "code": "vep",
1126
+ "score": 7.812500371073838e-07
1127
+ },
1128
+ {
1129
+ "language": "Bokm\u00e5l",
1130
+ "code": "nob",
1131
+ "score": 7.427178161378833e-07
1132
+ },
1133
+ {
1134
+ "language": "Shona",
1135
+ "code": "sna",
1136
+ "score": 6.660703775196453e-07
1137
+ },
1138
+ {
1139
+ "language": "Bavarian",
1140
+ "code": "bar",
1141
+ "score": 6.222485353646334e-07
1142
+ },
1143
+ {
1144
+ "language": "Uzbek",
1145
+ "code": "uzb",
1146
+ "score": 6.021850822435226e-07
1147
+ },
1148
+ {
1149
+ "language": "Central Bikol",
1150
+ "code": "bcl",
1151
+ "score": 5.77034370508045e-07
1152
+ },
1153
+ {
1154
+ "language": "Asturian",
1155
+ "code": "ast",
1156
+ "score": 5.743918336520437e-07
1157
+ },
1158
+ {
1159
+ "language": "Lombard",
1160
+ "code": "lmo",
1161
+ "score": 4.6301857992148143e-07
1162
+ },
1163
+ {
1164
+ "language": "Romansh",
1165
+ "code": "roh",
1166
+ "score": 4.5534079617937095e-07
1167
+ },
1168
+ {
1169
+ "language": "Narom",
1170
+ "code": "nrm",
1171
+ "score": 3.6611126574825903e-07
1172
+ },
1173
+ {
1174
+ "language": "Northern Sami",
1175
+ "code": "sme",
1176
+ "score": 1.0723972820869676e-07
1177
+ }
1178
+ ]
1179
+ ]
libs/examples.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ EXAMPLES = {
2
+ 'Example 1 - Paragraph - Wu Chinese': '里维拉军政府 ( 西班牙文 Directorio militar de Primo de Rivera 是20世纪20年代 西班牙军事人物大里维拉立在西班牙国王阿方索十三世搭西班牙天主教个支持下建立个威权主义政府 迭个政府自称超出传统个政治党派 一切为国王搭西班牙个稳定服务 伊拉个支持者是爱国同盟 弗过 因为遇到经济大萧条 威权政府立在1930年垮台 之后西班牙拿国王废脱 成立西班牙第二共和国',
3
+ 'Example 2 - Paragraph - Scots': 'Polyandry ( frae Greek : πολυ - poly - , " " mony " " and ἀνήρ anēr , " " man " " ) involves mairiage that includes mair nor twa pairtners an can faw unner the broader category o polyamory . Mair speceefically , it is a form o polygamy , whaur a woman takes twa or mair husbands at the same time . Polyandry is contrasted wi polygyny , involvin ane male an twa or mair females . If a mairiage involves a plural nummer o " " husbands an wives " " pairteecipants o each gender , then it can be cried polyamory , group or conjynt mairiage . In its broadest uise , polyandry refers tae sexual relations wi multiple males within or withoot mairiage .',
4
+ 'Example 3 - Paragraph - Sinhala': 'ට්රූඑක්ස් ක් රියාවලියට විකල්පයක් ලෙස මැලොන්ඩයමයිඩ් ( malondiamide ) යොදා ගන්නා නිස්සාරණ ක් රමයක් හදුන්වා දී ඇත . ඩයමෙක්ස් ( DIAMEX ; DIAMide Extraction ) ක් රියාවලියේ වාසියක් වන්නේ කාබන් , හයිඩ් රජන් , නයිට් රජන් හා ඔක්සිජන් හැර වෙනත් මූල ද් රව් ය අඩංගු කාබනික අපද් රව් ය උත්පාදනය වීම වැළකීමයි . මෙම අපද් රව් ය අම්ල වැසිවලට උරදෙන ආම්ලික වායු නොසෑදෙන සේ දහනය කළ හැකිය . ඩයමෙක්ස් ක් රමය ප් රංශ CEA මගින් යුරෝපයේ භාවිතයට ගනී . ක් රියාවලිය ගැන දැනට පවතින දැනුමෙන් කාර්මික කම්හල් ඉදි කිරීමට තරම් මෙම ක් රියාවලිය පරිණත වී ඇත . මෙම ක් රියාවලිය ද භාවිතා කරන්නේ ද් රව් යතා යාන්ත් රණයක්ය .',
5
+ 'Example 4 - Paragraph - Asturian': "Presupuestos públicos pa la igualdá y el desenvolvimientu sustentable para con ello tresformar l ' actual modelu de desenvolvimientu atendiendo les causes estructurales que xeneren y reproducen desigualdaes por ello participa nel ambito internacional y rexonal nel siguimientu de los Oxetivos de Desenvolvimientu del Mileniu ( ODS ) y agora na axenda 2030 .",
6
+ 'Example 5 - Paragraph - Swahili (macrolanguage)': 'Kuna nyota nyingi katika eneo la kundinyota hii lakini zote si angavu sana . Nyota angavu zaidi ni Beta Aquarii ambayo ni nyota jitu kubwa njano mwenye uangavu unaoonekana wa 2 . 9 .',
7
+ 'Example 6 - Paragraph - Czech': 'CASSE , Gilbert ; CUNDALL , Peter ; TULLY , Anthony . IJN ARGENTINA MARU : Tabular Record of Movement [ online ] . combinedfleet . com , 2015 - 09 - 13 , [ cit . 2015 - 09 - 13 ] . Dostupné online . ( anglicky )',
8
+ 'Example 7 - Paragraph - Lingala': 'Mayanzi ekutanaka mingi mingi na Afrika , na amérika mpé na bisika ya moyi makasi . Liyanzi elingaka bisika ya zélo mpé ya salité mpo na ko kota na nzoto ya nyama mosusu to na nzoto ya moto .',
9
+ 'Example 8 - Paragraph - Thai': 'โดยที่ an > 0 สำหรับทุก n แต่ละพจน์ของอนุกรมจะมีเครื่องหมายบวกและลบสลับกัน เช่นเดียวกับอนุกรมอื่นๆ อนุกรมสลับจะลู่เข้าก็ต่อเมื่อลำดับของผลบวกจำกัดพจน์ลู่เข้า',
10
+ 'Example 9 - Paragraph - Waray': 'An Orphnus rufithorax in uska species han Coleoptera nga ginhulagway ni Benderitter hadton 1914 . An Orphnus rufithorax in nahilalakip ha genus nga Orphnus , ngan familia nga Orphnidae . Waray hini subspecies nga nakalista .',
11
+ 'Example 10 - Paragraph - Gagauz': 'Ama buna bakmadaan , masmediya milionnarca insannarın ölmesinnän korkudêr . Sansın büün medişina XIX - cu üzyılın uurunda bulunarmış gibi . Düünnedä panika başlamaya yakın .',
12
+ 'Example 11 - Paragraph - Western Panjabi': 'پینڈولم دی فزکس سب توں پہلا گلیلیو نیں 1602 دی چ سمجی سی تے اس ویلے توں 1930 تک پینڈولم نوں ویلے دے سب توں ٹھیک حساب لئی ورتیا جاندا سی پینڈولم نوں ویلا نپن دے علاوہ زمین دی کچھ نپن تے بھونچال دے زور نپن لئی وی ورتیا جاندا سی',
13
+ 'Example 12 - Paragraph - West Low German': 'Hoog Buurlo is n gehucht in de gemeente Apeldoorne , in de Nederlandse provinsie Gelderland . t Ligt ten westen van de stad Apeldoorne en iets ten oosten van Radio Kootwiek .',
14
+ 'Example 13 - Paragraph - Quechua': '| 5 ñiqin pachakwata | 6 ñiqin pachakwata | 7 ñiqin pachakwata | | 510 watakuna | 520 watakuna | 530 watakuna | 540 watakuna | 550 watakuna | 560 watakuna | 570 watakuna | | | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | |',
15
+ 'Example 14 - Paragraph - Serbo-Croatian': 'Sočanica je naselje u opštini Leposavić na Kosovu i Metohiji . Površina katastarske opštine Sočanica gde je atar naselja iznosi 1 . 929 ha . Sedište je mesne zajednice Sočanica . Naselje Sočanica poslednjih godina prerasta iz seoskog naselja u varošicu , nalazi se 5 km južno od Leposavića sa desne strane reke Ibar . Srednja nadmorska visina naselja iznosi 636m . U pisanim izvorima selo se prvi put pominje 1315 . godine , u povelji srpskog kralja Stefana Milutina manastiru Banjskoj .',
16
+ 'Example 15 - Paragraph - Tongan': 'ʻOku pehē ʻe Niel Gunson naʻe ʻafifio ʻa e Tuʻi Tonga ʻi he taimi ko ia ʻi Manuʻa pea naʻa nau hoko ki he Tuʻi Manuʻa . Ko ia ai ʻoku hala ha tuʻi ʻi Tongatapu , pea naʻe kamata ha holongā tuʻi foʻou .',
17
+ 'Example 16 - Paragraph - Rusyn': 'Прыпять - є єднов з найдовшых рік на Україні і Европы . Тече на теріторії Білорусії і Україны . Довжка рікы є 775 км . Коло міста Чорнобыль ся вливать до водозбіру рікы Днїпр і так там кінчыть свою путь .',
18
+ 'Example 17 - Paragraph - Ladino': 'Ciruelos de Cervera es un puevlo de la Provinsia de Burgos en la junta de Kastiya i Leon en Espanya . Tiene una povlasion de 111 avitants i una ekstension de 37 , 874 km² ( 2015 ) .',
19
+ 'Example 18 - Paragraph - Bosnian': 'Po posljednjem službenom popisu stanovništva iz 1991 . godine , općina Hadžići ( u to vrijeme jedna od 5 prigradskih općina Grada Sarajeva ) imala je 24 . 200 stanovnika , raspoređenih u 62 naselja .',
20
+ 'Example 19 - Paragraph - Chuvash': 'Мăн Агыйдел ( Агыйдел ) Раççей территоринчи юханшыв . Вологда облаçĕ , Киров облаçĕ , Коми Республики территорипе юхать . Шарженг юханшывăн сылтăм çыранĕпе 143 км вăрринчен юханшыва юхса кĕрет . Юханшыв тăршшĕ 10 км .',
21
+ 'Example 20 - Paragraph - Dhivehi': 'ގުރުދާ ބަލި ބޮޑުވަމުންދާވަރަކަށް ލޭގައި ޖަމާވަމުންދާ ބޭކާރު މާއްދާތައް ހަށިގަނޑުން ބޭރުކުރުމަށް ހަށިގަނޑު ނުކުޅެދެއެވެ . ނަތީޖާއެއްގެ ގޮތުގައި މިމާއްދާތައް ގިނަވެ ވިހަވާ މިންވަރަށް އިތުރުވެއެވެ . ލޭގެ Pްރެޝަރ އިތުރުވެފައިވާނަމަ މިމައްސަލަ އިތުރަށް ގޯސްވެއެވެ . ގުރުދާ ބަލިން ރައްކާތެރިވެ ވީހާވެސް ކުރިން ސިއްހީ ފަރުވާ ފެށުމަށްޓަކައި ބަލީގެ ކުރީކޮޅުގައި ބަލި ފާހަގަކުރުމަށް މަސައްކަތް ކުރުމަކީ މުހިއްމު ކަމެކެވެ .',
22
+ 'Example 21 - Sentence - Western Panjabi': 'یکی از این آبخورگاه ها در ضلع شرقی صحن و در مقابل مقبره راجه قرار داشته و بهره هند ( طایفه ای از اسماعیلیان ) آن را نوسازی کرده بودند و در جوار آن نیز دو درخت میوه و یک درخت سدر بوده است .',
23
+ 'Example 22 - Sentence - Tamil': 'மேற்குறிப்பிட்ட கட்சிகளைத் தவிர முஸ்லிம் லீக் , இந்திய கம்யூனிஸ்ட் கட்சி , ஃபார்வார்டு ப்ளாக் , சி .',
24
+ 'Example 23 - Sentence - Basque': 'Hala ere , garai horretan ELAk ez zituen langile etorkinak onartzen , afiliatzeko lehen lau abizenetatik bat gutxienez euskal jatorrikoa izatea eskatzen baitzuen oraindik .',
25
+ 'Example 24 - Sentence - Livvi-Karelian': 'Niilöis lapset lujendetah omua tervehytty , harjavutah vedeh .',
26
+ 'Example 25 - Sentence - Eastern Mari': '3 : Тÿнямбал да руш классике , рушлаш кусарыме сборник - влак .',
27
+ 'Example 26 - Sentence - Breton': "Brudet eo bet e Breizh abalamour d ' e enebiezh ruz ouzh Diwan hag ouzh ar brezhoneg .",
28
+ 'Example 27 - Sentence - Bosnian': 'Sa 52 godine vratio se u Veneciju i ponudio svoje usluge svojim donedavnim progoniteljima za koje je radio kao špijun i za to su ga plaćali .',
29
+ 'Example 28 - Sentence - Uzbek': 'diapazoni N .', 'Example 29 - Sentence - Esperanto': 'Loa !',
30
+ 'Example 30 - Sentence - Lezghian': 'Кабир - Казмаляр ) Дагъустан республикадин Мегьарамдхуьруьн районда авай , « КьепIир Къазмайрин » хуьруьнсоветдик акатзавай хуьр .',
31
+ 'Example 31 - Sentence - Norwegian Nynorsk': 'Morfaren var elles ein velkjend filosof og samfunnskritikar , Aleksandr Radisjtsjev .',
32
+ 'Example 32 - Sentence - Papiamento': 'Esaki tabata inaceptabel pa e politiconan di Curaçao .',
33
+ 'Example 33 - Sentence - Azerbaijani': 'Müharibədən evə sağ - salamat dönən Xındı Məmməd öz gözəl sənətini davam etdirmişdir .',
34
+ 'Example 34 - Sentence - Volapük': 'Lödanadensit äbinon mö mens 460 , 2 a km² .',
35
+ 'Example 35 - Sentence - Konkani': '13 No creature is concealed from him , but everything is naked and exposed to the eyes of him to whom we must render an account',
36
+ 'Example 36 - Sentence - Latgalian': '1990 godā solu reorganizej kai pogostu .',
37
+ 'Example 37 - Sentence - Swedish': 'Inga underarter finns listade i Catalogue of Life .',
38
+ 'Example 38 - Sentence - Tuvan': 'Ол ам оралакчы сайыт апарган кижи чүге дуза кадып шыдавас деп бодап , эжинге бүзүрел Серге - Байырны ооӊ - биле ужуражылгаже эккээр .',
39
+ 'Example 39 - Sentence - Malagasy': "200 no isan ' ny kisoa .",
40
+ 'Example 40 - Sentence - English': 'The convention followed after a request by the Bulgarian government on 24 September asking for a ceasefire .'}
libs/languages.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ languages = {
2
+ "ace": "Achinese",
3
+ "afr": "Afrikaans",
4
+ "als": "Alemannic German",
5
+ "amh": "Amharic",
6
+ "ang": "Old English",
7
+ "ara": "Arabic",
8
+ "arg": "Aragonese",
9
+ "arz": "Egyptian Arabic",
10
+ "asm": "Assamese",
11
+ "ast": "Asturian",
12
+ "ava": "Avar",
13
+ "aym": "Aymara",
14
+ "azb": "South Azerbaijani",
15
+ "aze": "Azerbaijani",
16
+ "bak": "Bashkir",
17
+ "bar": "Bavarian",
18
+ "bcl": "Central Bikol",
19
+ "be-tarask": "Belarusian (Taraschkewiza)",
20
+ "bel": "Belarusian",
21
+ "ben": "Bengali",
22
+ "bho": "Bhojpuri",
23
+ "bjn": "Banjar",
24
+ "bod": "Tibetan",
25
+ "bos": "Bosnian",
26
+ "bpy": "Bishnupriya",
27
+ "bre": "Breton",
28
+ "bul": "Bulgarian",
29
+ "bxr": "Buryat",
30
+ "cat": "Catalan",
31
+ "cbk": "Chavacano",
32
+ "cdo": "Min Dong",
33
+ "ceb": "Cebuano",
34
+ "ces": "Czech",
35
+ "che": "Chechen",
36
+ "chr": "Cherokee",
37
+ "chv": "Chuvash",
38
+ "ckb": "Central Kurdish",
39
+ "cor": "Cornish",
40
+ "cos": "Corsican",
41
+ "crh": "Crimean Tatar",
42
+ "csb": "Kashubian",
43
+ "cym": "Welsh",
44
+ "dan": "Danish",
45
+ "deu": "German",
46
+ "diq": "Dimli",
47
+ "div": "Dhivehi",
48
+ "dsb": "Lower Sorbian",
49
+ "dty": "Doteli",
50
+ "egl": "Emilian",
51
+ "ell": "Modern Greek",
52
+ "eng": "English",
53
+ "epo": "Esperanto",
54
+ "est": "Estonian",
55
+ "eus": "Basque",
56
+ "ext": "Extremaduran",
57
+ "fao": "Faroese",
58
+ "fas": "Persian",
59
+ "fin": "Finnish",
60
+ "fra": "French",
61
+ "frp": "Arpitan",
62
+ "fry": "Western Frisian",
63
+ "fur": "Friulian",
64
+ "gag": "Gagauz",
65
+ "gla": "Scottish Gaelic",
66
+ "gle": "Irish",
67
+ "glg": "Galician",
68
+ "glk": "Gilaki",
69
+ "glv": "Manx",
70
+ "grn": "Guarani",
71
+ "guj": "Gujarati",
72
+ "hak": "Hakka Chinese",
73
+ "hat": "Haitian Creole",
74
+ "hau": "Hausa",
75
+ "hbs": "Serbo-Croatian",
76
+ "heb": "Hebrew",
77
+ "hif": "Fiji Hindi",
78
+ "hin": "Hindi",
79
+ "hrv": "Croatian",
80
+ "hsb": "Upper Sorbian",
81
+ "hun": "Hungarian",
82
+ "hye": "Armenian",
83
+ "ibo": "Igbo",
84
+ "ido": "Ido",
85
+ "ile": "Interlingue",
86
+ "ilo": "Iloko",
87
+ "ina": "Interlingua",
88
+ "ind": "Indonesian",
89
+ "isl": "Icelandic",
90
+ "ita": "Italian",
91
+ "jam": "Jamaican Patois",
92
+ "jav": "Javanese",
93
+ "jbo": "Lojban",
94
+ "jpn": "Japanese",
95
+ "kaa": "Karakalpak",
96
+ "kab": "Kabyle",
97
+ "kan": "Kannada",
98
+ "kat": "Georgian",
99
+ "kaz": "Kazakh",
100
+ "kbd": "Kabardian",
101
+ "khm": "Central Khmer",
102
+ "kin": "Kinyarwanda",
103
+ "kir": "Kirghiz",
104
+ "koi": "Komi-Permyak",
105
+ "kok": "Konkani",
106
+ "kom": "Komi",
107
+ "kor": "Korean",
108
+ "krc": "Karachay-Balkar",
109
+ "ksh": "Ripuarisch",
110
+ "kur": "Kurdish",
111
+ "lad": "Ladino",
112
+ "lao": "Lao",
113
+ "lat": "Latin",
114
+ "lav": "Latvian",
115
+ "lez": "Lezghian",
116
+ "lij": "Ligurian",
117
+ "lim": "Limburgan",
118
+ "lin": "Lingala",
119
+ "lit": "Lithuanian",
120
+ "lmo": "Lombard",
121
+ "lrc": "Northern Luri",
122
+ "ltg": "Latgalian",
123
+ "ltz": "Luxembourgish",
124
+ "lug": "Luganda",
125
+ "lzh": "Literary Chinese",
126
+ "mai": "Maithili",
127
+ "mal": "Malayalam",
128
+ "map-bms": "Banyumasan",
129
+ "mar": "Marathi",
130
+ "mdf": "Moksha",
131
+ "mhr": "Eastern Mari",
132
+ "min": "Minangkabau",
133
+ "mkd": "Macedonian",
134
+ "mlg": "Malagasy",
135
+ "mlt": "Maltese",
136
+ "mon": "Mongolian",
137
+ "mri": "Maori",
138
+ "mrj": "Western Mari",
139
+ "msa": "Malay",
140
+ "mwl": "Mirandese",
141
+ "mya": "Burmese",
142
+ "myv": "Erzya",
143
+ "mzn": "Mazanderani",
144
+ "nan": "Min Nan Chinese",
145
+ "nap": "Neapolitan",
146
+ "nav": "Navajo",
147
+ "nci": "Classical Nahuatl",
148
+ "nds": "Low German",
149
+ "nds-nl": "West Low German",
150
+ "nep": "Nepali (macrolanguage)",
151
+ "new": "Newari",
152
+ "nld": "Dutch",
153
+ "nno": "Norwegian Nynorsk",
154
+ "nob": "Bokmål",
155
+ "nrm": "Narom",
156
+ "nso": "Northern Sotho",
157
+ "oci": "Occitan",
158
+ "olo": "Livvi-Karelian",
159
+ "ori": "Oriya",
160
+ "orm": "Oromo",
161
+ "oss": "Ossetian",
162
+ "pag": "Pangasinan",
163
+ "pam": "Pampanga",
164
+ "pan": "Panjabi",
165
+ "pap": "Papiamento",
166
+ "pcd": "Picard",
167
+ "pdc": "Pennsylvania German",
168
+ "pfl": "Palatine German",
169
+ "pnb": "Western Panjabi",
170
+ "pol": "Polish",
171
+ "por": "Portuguese",
172
+ "pus": "Pushto",
173
+ "que": "Quechua",
174
+ "roa-tara": "Tarantino dialect",
175
+ "roh": "Romansh",
176
+ "ron": "Romanian",
177
+ "rue": "Rusyn",
178
+ "rup": "Aromanian",
179
+ "rus": "Russian",
180
+ "sah": "Yakut",
181
+ "san": "Sanskrit",
182
+ "scn": "Sicilian",
183
+ "sco": "Scots",
184
+ "sgs": "Samogitian",
185
+ "sin": "Sinhala",
186
+ "slk": "Slovak",
187
+ "slv": "Slovene",
188
+ "sme": "Northern Sami",
189
+ "sna": "Shona",
190
+ "snd": "Sindhi",
191
+ "som": "Somali",
192
+ "spa": "Spanish",
193
+ "sqi": "Albanian",
194
+ "srd": "Sardinian",
195
+ "srn": "Sranan",
196
+ "srp": "Serbian",
197
+ "stq": "Saterfriesisch",
198
+ "sun": "Sundanese",
199
+ "swa": "Swahili (macrolanguage)",
200
+ "swe": "Swedish",
201
+ "szl": "Silesian",
202
+ "tam": "Tamil",
203
+ "tat": "Tatar",
204
+ "tcy": "Tulu",
205
+ "tel": "Telugu",
206
+ "tet": "Tetum",
207
+ "tgk": "Tajik",
208
+ "tgl": "Tagalog",
209
+ "tha": "Thai",
210
+ "ton": "Tongan",
211
+ "tsn": "Tswana",
212
+ "tuk": "Turkmen",
213
+ "tur": "Turkish",
214
+ "tyv": "Tuvan",
215
+ "udm": "Udmurt",
216
+ "uig": "Uighur",
217
+ "ukr": "Ukrainian",
218
+ "urd": "Urdu",
219
+ "uzb": "Uzbek",
220
+ "vec": "Venetian",
221
+ "vep": "Veps",
222
+ "vie": "Vietnamese",
223
+ "vls": "Vlaams",
224
+ "vol": "Volapük",
225
+ "vro": "Võro",
226
+ "war": "Waray",
227
+ "wln": "Walloon",
228
+ "wol": "Wolof",
229
+ "wuu": "Wu Chinese",
230
+ "xho": "Xhosa",
231
+ "xmf": "Mingrelian",
232
+ "yid": "Yiddish",
233
+ "yor": "Yoruba",
234
+ "zea": "Zeeuws",
235
+ "zh-yue": "Cantonese",
236
+ "zho": "Standard Chinese",
237
+ }
libs/normalizer.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import regex
3
+ import sys
4
+ import textwrap
5
+ from typing import Any, Dict, Optional
6
+
7
# Punctuation marks that are kept by the default whitelist and padded with
# spaces by ``Normalizer.text_level_normalizer``.
punctuations = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
    '`', '{', '|', '}', '~', '»', '«', '“', '”', "-",
]


class Normalizer:
    """A general normalizer for every language.

    Calling an instance runs, in order: dictionary-based replacement
    (``chars_to_map``), whitelist filtering (``chars_to_preserve``),
    punctuation spacing (``text_level_normalizer``), whitespace collapsing
    and optional lowercasing.
    """

    # Default whitelist: runs of Unicode numbers, letters, marks and the
    # punctuation listed above. The ``\p{...}`` classes require the
    # third-party ``regex`` module; the raw string keeps ``\p`` from being
    # parsed as an (invalid) Python string escape.
    _whitelist = r"[\p{N}\p{L}\p{M}" + re.escape("".join(punctuations)) + "]+"
    # Default (empty) replacement table. Instances receive a copy of it.
    _dictionary = {}
    # Matches a single punctuation mark; compiled once instead of per call.
    _punctuation_pattern = re.compile("([" + re.escape("".join(punctuations)) + "])")

    def __init__(
            self,
            whitelist: Optional[str] = None,
            dictionary: Optional[Dict[str, str]] = None,
    ) -> None:
        """Create a normalizer.

        Args:
            whitelist (str, optional): Pattern (``regex`` syntax) describing
                the character runs to keep. Falls back to the class default
                when empty or not a string.
            dictionary (Dict[str, str], optional): Substrings to replace,
                mapped to their replacements. Falls back to an empty table
                when empty or not a dict.
        """
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        if dictionary and isinstance(dictionary, dict):
            self.dictionary = dictionary
        else:
            # Copy the class default so that mutating one instance's table
            # does not leak into every other instance.
            self.dictionary = dict(self._dictionary)

    def chars_to_map(self, sentence: str) -> str:
        """Maps every character, words, and phrase into a proper one.

        Args:
            sentence (str): A piece of text.

        Returns:
            str: The text with every dictionary key replaced by its value.
            When the dictionary is empty the input is returned untouched.
        """
        if not self.dictionary:
            return sentence

        # Regex alternation takes the first alternative that matches, so the
        # longest keys go first; otherwise a short key would pre-empt a longer
        # key that starts with it (e.g. "a" winning over "ab").
        keys = sorted(self.dictionary, key=len, reverse=True)
        pattern = "|".join(map(re.escape, keys))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
            self,
            sentence: str,
    ) -> str:
        """Keeps specified characters from sentence

        Args:
            sentence (str): A piece of text.

        Returns:
            str: The whitelisted character runs joined by single spaces.

        Raises:
            Exception: Whatever ``regex.findall`` raised (for instance on a
                malformed whitelist); it is re-raised after the whitelist and
                the error have been printed.
        """
        try:
            tokenized = regex.findall(self.whitelist, sentence)
            return " ".join(tokenized)
        except Exception as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

    def text_level_normalizer(self, text: str) -> str:
        """A text level of normalization

        Surrounds every punctuation mark with spaces and strips the ends of
        the result.

        Args:
            text (str): A piece of text.

        Returns:
            str: The text with spaced-out punctuation.
        """
        text = self._punctuation_pattern.sub(r" \1 ", text)
        return text.strip()

    def __call__(
            self,
            text: str,
            do_lowercase: Optional[bool] = False
    ) -> str:
        """Normalization caller

        Args:
            text (str): A piece of text.
            do_lowercase (bool, optional): Lowercase the result when truthy.

        Returns:
            str: The normalized text.
        """
        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text)
        # Collapse the whitespace runs introduced by the previous steps.
        text = re.sub(r"\s+", " ", text)

        if do_lowercase:
            text = text.lower()

        return text
libs/utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import plotly.express as px
4
+
5
+
6
def plot_result(top_languages):
    """Draw the detected languages as a horizontal bar chart in Streamlit.

    Args:
        top_languages: Sized collection of mappings, each providing the
            ``"language"``, ``"code"`` and ``"score"`` entries. Scores are
            multiplied by 100 and shown with a percent sign.
    """
    top_k = len(top_languages)

    bar_labels = []
    raw_scores = []
    for entry in top_languages:
        bar_labels.append(f'{entry["language"]} ({entry["code"]})')
        raw_scores.append(entry["score"])
    percentages = np.array(raw_scores) * 100

    bar_options = {
        'x': percentages,
        'y': bar_labels,
        'orientation': 'h',
        'labels': {'x': 'Confidence', 'y': 'Language'},
        'text': percentages,
        # Head-room beyond 100 so the outside text labels are not clipped.
        'range_x': (0, 115),
        'title': f'Top Detections {top_k}',
        # One evenly spaced colour value per bar, in input order.
        'color': np.linspace(0, 1, len(percentages)),
        'color_continuous_scale': 'GnBu',
    }
    chart = px.bar(**bar_options)

    chart.update(layout_coloraxis_showscale=False)
    chart.update_traces(texttemplate='%{text:0.1f}%', textposition='outside')
    chart.update_layout(yaxis={'categoryorder': 'total ascending'})
    st.plotly_chart(chart, use_container_width=True)
meta.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
1
+ INFO = """
2
+ # Zabanshenas 🕵
3
+
4
+ A Transformer-based solution for identifying the most likely language of a written document/text. Zabanshenas is a Persian word that has two meanings:
5
+
6
+ - A person who studies linguistics.
7
+ - A way to identify the type of written language.
8
+ """.strip()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
1
+ streamlit
2
+ transformers
3
+ torch
4
+ regex
5
+ plotly