Spaces:
Runtime error
Runtime error
Update sync_streamlit_to_space.yml
Browse files- README.md +8 -24
- app.py +185 -0
- libs/__init__.py +0 -0
- libs/dummy.py +1179 -0
- libs/examples.py +40 -0
- libs/languages.py +237 -0
- libs/normalizer.py +86 -0
- libs/utils.py +25 -0
- meta.py +8 -0
- requirements.txt +5 -0
README.md
CHANGED
@@ -1,33 +1,17 @@
|
|
1 |
---
|
2 |
title: Zabanshenas
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
---
|
10 |
|
11 |
-
#
|
12 |
|
13 |
-
|
14 |
-
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
`colorFrom`: _string_
|
20 |
-
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
21 |
-
|
22 |
-
`colorTo`: _string_
|
23 |
-
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
24 |
-
|
25 |
-
`sdk`: _string_
|
26 |
-
Can be either `gradio` or `streamlit`
|
27 |
-
|
28 |
-
`app_file`: _string_
|
29 |
-
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
|
30 |
-
Path is relative to the root of the repository.
|
31 |
-
|
32 |
-
`pinned`: _boolean_
|
33 |
-
Whether the Space stays on top of your list.
|
|
|
1 |
---
|
2 |
title: Zabanshenas
|
3 |
+
emoji: 🕵
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: blue
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
---
|
10 |
|
11 |
+
# Zabanshenas
|
12 |
|
13 |
+
A Transformer-based solution for identifying the most likely language of a written document/text.
|
14 |
+
**Zabanshenas** is a Persian word that has two meanings:
|
15 |
|
16 |
+
- A person who studies linguistics.
|
17 |
+
- A way to identify the type of written language.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
from typing import Any, Dict, Optional
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
|
7 |
+
from transformers import AutoTokenizer
|
8 |
+
from transformers import AutoModelForSequenceClassification
|
9 |
+
|
10 |
+
from libs.normalizer import Normalizer
|
11 |
+
from libs.languages import languages
|
12 |
+
from libs.examples import EXAMPLES
|
13 |
+
from libs.dummy import outputs as dummy_outputs
|
14 |
+
from libs.utils import plot_result
|
15 |
+
|
16 |
+
import meta
|
17 |
+
|
18 |
+
|
19 |
+
class Zabanshenas:
    """Language detector built on a sequence-classification transformer.

    Call :meth:`load` once after construction, then :meth:`detect` to score
    one or more texts against every label known to the model.

    NOTE: ``debug`` is hard-wired to ``True`` in ``__init__``. While it is
    set, :meth:`load` skips the tokenizer/model download and :meth:`detect`
    returns the canned ``dummy_outputs`` regardless of its input.
    """

    def __init__(
        self,
        model_name_or_path: str = "m3hrdadfi/zabanshenas-roberta-base-mix",
        by_gpu: bool = False
    ) -> None:
        """Store configuration; no model or tokenizer is loaded here.

        Args:
            model_name_or_path: Identifier or path handed to ``from_pretrained``.
            by_gpu: Use the ``cuda`` device when true, ``cpu`` otherwise.
        """
        self.debug = True
        self.dummy_outputs = dummy_outputs
        self.device = torch.device("cpu" if not by_gpu else "cuda")
        self.model_name_or_path = model_name_or_path

        # Populated by load().
        self.tokenizer = None
        self.model = None
        self.normalizer = None
        self.languages = None

        self.framework = "pt"  # passed to the tokenizer as ``return_tensors``
        self.max_length = 512  # upper bound applied to any requested max_length
        self.top_k = 5

    def load(self) -> None:
        """Instantiate the normalizer and language table, plus the model unless debugging."""
        if not self.debug:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name_or_path
            ).to(self.device)

        self.normalizer = Normalizer()
        self.languages = languages

    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.

        Non-tensor values are passed through unchanged.
        """
        return {
            name: tensor.to(self.device) if isinstance(tensor, torch.Tensor) else tensor
            for name, tensor in inputs.items()
        }

    def _parse_and_tokenize(
        self,
        inputs,
        do_normalization: bool = True,
        max_length: int = 512,
        padding: bool = True,
        add_special_tokens: bool = True,
        truncation: bool = True,
    ):
        """
        Optionally normalize the texts, then tokenize them as one batch.

        Args:
            inputs: Iterable of texts.
            do_normalization: Run ``self.normalizer`` over each text first.
            max_length: Requested length, capped at ``self.max_length``.
            padding: Forwarded to the tokenizer.
            add_special_tokens: Forwarded to the tokenizer.
            truncation: Forwarded to the tokenizer.

        Returns:
            Whatever the tokenizer returns for the batch.
        """
        # The flag used to be accepted but ignored (texts were always normalized).
        if do_normalization:
            inputs = [self.normalizer(item) for item in inputs]

        max_length = min(max_length, self.max_length)
        return self.tokenizer(
            inputs,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            return_tensors=self.framework,
            padding=padding,
            truncation=truncation,
        )

    def _forward(
        self,
        inputs,
        return_tensors: bool = True
    ):
        """Run the model without gradients and return its first output on the CPU.

        Args:
            inputs: Mapping of keyword arguments for the model call.
            return_tensors: Return a tensor when true, a NumPy array otherwise.
        """
        with torch.no_grad():
            inputs = self.ensure_tensor_on_device(**inputs)
            predictions = self.model(**inputs)[0].cpu()

        if return_tensors:
            return predictions
        return predictions.numpy()

    @staticmethod
    def _softmax(logits):
        """Return the softmax of ``logits`` over the last axis.

        The row maximum is subtracted before exponentiation so large logits
        cannot overflow to ``inf`` (which would turn the result into ``nan``).
        """
        shifted = logits - np.max(logits, axis=-1, keepdims=True)
        exponentials = np.exp(shifted)
        return exponentials / exponentials.sum(axis=-1, keepdims=True)

    def detect(
        self,
        texts,
        max_length: int = 128,
        do_normalization: bool = True
    ):
        """Score ``texts`` against every label of the model.

        Args:
            texts: A single text or a list of texts.
            max_length: Tokenizer length limit (capped at ``self.max_length``).
            do_normalization: Normalize each text before tokenizing.

        Returns:
            One list per input text holding ``{"language", "code", "score"}``
            dicts sorted by descending score. In debug mode the canned
            ``dummy_outputs`` are returned instead.
        """
        if self.debug:
            return self.dummy_outputs

        texts = [texts] if not isinstance(texts, list) else texts
        inputs = self._parse_and_tokenize(texts, do_normalization=do_normalization, max_length=max_length)
        outputs = self._forward(inputs, return_tensors=False)
        scores = self._softmax(outputs)

        id2label = self.model.config.id2label
        results = [
            [
                {
                    "language": self.languages.get(id2label[i], None),
                    "code": id2label[i],
                    "score": score.item()
                } for i, score in enumerate(item)
            ] for item in scores
        ]
        return [sorted(result, key=lambda kv: kv["score"], reverse=True) for result in results]
|
121 |
+
|
122 |
+
|
123 |
+
@st.cache(allow_output_mutation=True)
def load_language_detector():
    """Create a ``Zabanshenas`` detector, load it, and return it.

    The function is wrapped in ``st.cache(allow_output_mutation=True)``.
    """
    language_detector = Zabanshenas()
    language_detector.load()
    return language_detector
|
128 |
+
|
129 |
+
|
130 |
+
def main():
    """Render the Zabanshenas page and run language detection on request.

    Shows the project info next to an example picker and a text area. When
    the button is pressed, the text is checked for a minimum word count and
    the detector's top ``detector.top_k`` results are plotted.
    """
    st.set_page_config(
        page_title="Zabanshenas",
        page_icon="🕵",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    detector = load_language_detector()

    # Prefer the stable ``st.columns``; fall back to the beta name on
    # Streamlit versions that only ship ``st.beta_columns``.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2 = make_columns([6, 4])
    with col2:
        st.markdown(meta.INFO, unsafe_allow_html=True)

    with col1:
        prompts = list(EXAMPLES.keys()) + ["Custom"]
        prompt = st.selectbox(
            'Examples (select from this list)',
            prompts,
            index=0
        )

        # "Custom" starts from an empty box; any other choice pre-fills its example.
        prompt_box = "" if prompt == "Custom" else EXAMPLES[prompt]

        text = st.text_area(
            'Insert your text: ',
            detector.normalizer(prompt_box),
            height=200
        )
        text = detector.normalizer(text)
        entered_text = st.empty()  # placeholder for the validation message

    detect_language = st.button('Detect Language !')

    st.markdown(
        "<hr />",
        unsafe_allow_html=True
    )
    if detect_language:
        words = text.split()
        with st.spinner("Detecting..."):
            # The message asks for at least three words, so exactly three is
            # accepted (the previous check silently required four).
            if len(words) < 3:
                entered_text.markdown(
                    "Insert your text (at least three words)"
                )
            else:
                # NOTE(review): max_length is a token limit but receives the
                # word count here, which may truncate the input - confirm intent.
                top_languages = detector.detect(text, max_length=min(len(words), detector.max_length))
                top_languages = top_languages[0][:detector.top_k]
                plot_result(top_languages)


if __name__ == '__main__':
    main()
|
libs/__init__.py
ADDED
File without changes
|
libs/dummy.py
ADDED
@@ -0,0 +1,1179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
outputs = [
|
2 |
+
[
|
3 |
+
{
|
4 |
+
"language": "Persian",
|
5 |
+
"code": "fas",
|
6 |
+
"score": 0.6105580925941467
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"language": "Gilaki",
|
10 |
+
"code": "glk",
|
11 |
+
"score": 0.29982829093933105
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"language": "Northern Luri",
|
15 |
+
"code": "lrc",
|
16 |
+
"score": 0.04840774089097977
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"language": "Mazanderani",
|
20 |
+
"code": "mzn",
|
21 |
+
"score": 0.030142733827233315
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"language": "South Azerbaijani",
|
25 |
+
"code": "azb",
|
26 |
+
"score": 0.005220199003815651
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"language": "Urdu",
|
30 |
+
"code": "urd",
|
31 |
+
"score": 0.0019745035097002983
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"language": "Pushto",
|
35 |
+
"code": "pus",
|
36 |
+
"score": 0.0015690263826400042
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"language": "Western Panjabi",
|
40 |
+
"code": "pnb",
|
41 |
+
"score": 0.0005721596535295248
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"language": "Central Kurdish",
|
45 |
+
"code": "ckb",
|
46 |
+
"score": 0.00025537016335874796
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"language": "Sindhi",
|
50 |
+
"code": "snd",
|
51 |
+
"score": 0.0001820324978325516
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"language": "Egyptian Arabic",
|
55 |
+
"code": "arz",
|
56 |
+
"score": 0.0001247940381290391
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"language": "Arabic",
|
60 |
+
"code": "ara",
|
61 |
+
"score": 7.754910620860755e-05
|
62 |
+
},
|
63 |
+
{
|
64 |
+
"language": "Korean",
|
65 |
+
"code": "kor",
|
66 |
+
"score": 5.718228203477338e-05
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"language": "Fiji Hindi",
|
70 |
+
"code": "hif",
|
71 |
+
"score": 3.5903740354115143e-05
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"language": "Uighur",
|
75 |
+
"code": "uig",
|
76 |
+
"score": 3.5565532016335055e-05
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"language": "Maori",
|
80 |
+
"code": "mri",
|
81 |
+
"score": 2.1078320060041733e-05
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"language": "Literary Chinese",
|
85 |
+
"code": "lzh",
|
86 |
+
"score": 2.09943773370469e-05
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"language": "Navajo",
|
90 |
+
"code": "nav",
|
91 |
+
"score": 1.8877935872296803e-05
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"language": "Mongolian",
|
95 |
+
"code": "mon",
|
96 |
+
"score": 1.783044899639208e-05
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"language": "Basque",
|
100 |
+
"code": "eus",
|
101 |
+
"score": 1.2980432074982673e-05
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"language": "Moksha",
|
105 |
+
"code": "mdf",
|
106 |
+
"score": 1.2325609532126691e-05
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"language": "Tongan",
|
110 |
+
"code": "ton",
|
111 |
+
"score": 1.1610675755946431e-05
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"language": "Min Dong",
|
115 |
+
"code": "cdo",
|
116 |
+
"score": 1.1508132956805639e-05
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"language": "Sinhala",
|
120 |
+
"code": "sin",
|
121 |
+
"score": 1.0617596672091167e-05
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"language": "Venetian",
|
125 |
+
"code": "vec",
|
126 |
+
"score": 1.0375520105299074e-05
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"language": "Western Mari",
|
130 |
+
"code": "mrj",
|
131 |
+
"score": 1.0316403859178536e-05
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"language": "Malayalam",
|
135 |
+
"code": "mal",
|
136 |
+
"score": 1.0265099263051525e-05
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"language": "Interlingua",
|
140 |
+
"code": "ina",
|
141 |
+
"score": 1.0040446795755997e-05
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"language": "Tatar",
|
145 |
+
"code": "tat",
|
146 |
+
"score": 9.836200661084149e-06
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"language": "Cantonese",
|
150 |
+
"code": "zh-yue",
|
151 |
+
"score": 9.80662207439309e-06
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"language": "Wu Chinese",
|
155 |
+
"code": "wuu",
|
156 |
+
"score": 9.661145668360405e-06
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"language": "Igbo",
|
160 |
+
"code": "ibo",
|
161 |
+
"score": 9.207592484017368e-06
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"language": "Waray",
|
165 |
+
"code": "war",
|
166 |
+
"score": 8.970115231932141e-06
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"language": "Yiddish",
|
170 |
+
"code": "yid",
|
171 |
+
"score": 8.926748705562204e-06
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"language": "Udmurt",
|
175 |
+
"code": "udm",
|
176 |
+
"score": 8.702583727426827e-06
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"language": "Dhivehi",
|
180 |
+
"code": "div",
|
181 |
+
"score": 8.36203707876848e-06
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"language": "Newari",
|
185 |
+
"code": "new",
|
186 |
+
"score": 8.140945283230394e-06
|
187 |
+
},
|
188 |
+
{
|
189 |
+
"language": "Karachay-Balkar",
|
190 |
+
"code": "krc",
|
191 |
+
"score": 8.123539373627864e-06
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"language": "Lojban",
|
195 |
+
"code": "jbo",
|
196 |
+
"score": 8.114019692584407e-06
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"language": "Sanskrit",
|
200 |
+
"code": "san",
|
201 |
+
"score": 8.087784408417065e-06
|
202 |
+
},
|
203 |
+
{
|
204 |
+
"language": "Luganda",
|
205 |
+
"code": "lug",
|
206 |
+
"score": 8.023569534998387e-06
|
207 |
+
},
|
208 |
+
{
|
209 |
+
"language": "Maithili",
|
210 |
+
"code": "mai",
|
211 |
+
"score": 7.723083399469033e-06
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"language": "Kirghiz",
|
215 |
+
"code": "kir",
|
216 |
+
"score": 7.715119863860309e-06
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"language": "Standard Chinese",
|
220 |
+
"code": "zho",
|
221 |
+
"score": 7.5126054071006365e-06
|
222 |
+
},
|
223 |
+
{
|
224 |
+
"language": "Amharic",
|
225 |
+
"code": "amh",
|
226 |
+
"score": 7.451813871739432e-06
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"language": "Chechen",
|
230 |
+
"code": "che",
|
231 |
+
"score": 7.444541097356705e-06
|
232 |
+
},
|
233 |
+
{
|
234 |
+
"language": "Gujarati",
|
235 |
+
"code": "guj",
|
236 |
+
"score": 7.395997727144277e-06
|
237 |
+
},
|
238 |
+
{
|
239 |
+
"language": "Tibetan",
|
240 |
+
"code": "bod",
|
241 |
+
"score": 7.390805421891855e-06
|
242 |
+
},
|
243 |
+
{
|
244 |
+
"language": "Komi",
|
245 |
+
"code": "kom",
|
246 |
+
"score": 7.373077551164897e-06
|
247 |
+
},
|
248 |
+
{
|
249 |
+
"language": "Lao",
|
250 |
+
"code": "lao",
|
251 |
+
"score": 7.351867679972202e-06
|
252 |
+
},
|
253 |
+
{
|
254 |
+
"language": "Wolof",
|
255 |
+
"code": "wol",
|
256 |
+
"score": 7.305452982109273e-06
|
257 |
+
},
|
258 |
+
{
|
259 |
+
"language": "Silesian",
|
260 |
+
"code": "szl",
|
261 |
+
"score": 7.301976893359097e-06
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"language": "Northern Sotho",
|
265 |
+
"code": "nso",
|
266 |
+
"score": 7.2927336987049785e-06
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"language": "Armenian",
|
270 |
+
"code": "hye",
|
271 |
+
"score": 7.243447726068553e-06
|
272 |
+
},
|
273 |
+
{
|
274 |
+
"language": "Arpitan",
|
275 |
+
"code": "frp",
|
276 |
+
"score": 7.137540251278551e-06
|
277 |
+
},
|
278 |
+
{
|
279 |
+
"language": "Bishnupriya",
|
280 |
+
"code": "bpy",
|
281 |
+
"score": 7.062033091642661e-06
|
282 |
+
},
|
283 |
+
{
|
284 |
+
"language": "Azerbaijani",
|
285 |
+
"code": "aze",
|
286 |
+
"score": 6.906778253323864e-06
|
287 |
+
},
|
288 |
+
{
|
289 |
+
"language": "Tajik",
|
290 |
+
"code": "tgk",
|
291 |
+
"score": 6.730050699843559e-06
|
292 |
+
},
|
293 |
+
{
|
294 |
+
"language": "Old English ",
|
295 |
+
"code": "ang",
|
296 |
+
"score": 6.6442084971640725e-06
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"language": "Marathi",
|
300 |
+
"code": "mar",
|
301 |
+
"score": 6.63194168737391e-06
|
302 |
+
},
|
303 |
+
{
|
304 |
+
"language": "Kurdish",
|
305 |
+
"code": "kur",
|
306 |
+
"score": 6.615779057028703e-06
|
307 |
+
},
|
308 |
+
{
|
309 |
+
"language": "Lithuanian",
|
310 |
+
"code": "lit",
|
311 |
+
"score": 6.561998816323467e-06
|
312 |
+
},
|
313 |
+
{
|
314 |
+
"language": "Russian",
|
315 |
+
"code": "rus",
|
316 |
+
"score": 6.4370215113740414e-06
|
317 |
+
},
|
318 |
+
{
|
319 |
+
"language": "Tulu",
|
320 |
+
"code": "tcy",
|
321 |
+
"score": 6.370255960064242e-06
|
322 |
+
},
|
323 |
+
{
|
324 |
+
"language": "Extremaduran",
|
325 |
+
"code": "ext",
|
326 |
+
"score": 6.3398160818906035e-06
|
327 |
+
},
|
328 |
+
{
|
329 |
+
"language": "Aymara",
|
330 |
+
"code": "aym",
|
331 |
+
"score": 6.288398708420573e-06
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"language": "Lower Sorbian",
|
335 |
+
"code": "dsb",
|
336 |
+
"score": 6.209619641595054e-06
|
337 |
+
},
|
338 |
+
{
|
339 |
+
"language": "Classical Nahuatl",
|
340 |
+
"code": "nci",
|
341 |
+
"score": 5.954705557087436e-06
|
342 |
+
},
|
343 |
+
{
|
344 |
+
"language": "Polish",
|
345 |
+
"code": "pol",
|
346 |
+
"score": 5.952156243438367e-06
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"language": "Cebuano",
|
350 |
+
"code": "ceb",
|
351 |
+
"score": 5.911888820264721e-06
|
352 |
+
},
|
353 |
+
{
|
354 |
+
"language": "Hakka Chinese",
|
355 |
+
"code": "hak",
|
356 |
+
"score": 5.756284735980444e-06
|
357 |
+
},
|
358 |
+
{
|
359 |
+
"language": "Georgian",
|
360 |
+
"code": "kat",
|
361 |
+
"score": 5.656391749653267e-06
|
362 |
+
},
|
363 |
+
{
|
364 |
+
"language": "Mingrelian",
|
365 |
+
"code": "xmf",
|
366 |
+
"score": 5.57373004994588e-06
|
367 |
+
},
|
368 |
+
{
|
369 |
+
"language": "Telugu",
|
370 |
+
"code": "tel",
|
371 |
+
"score": 5.5334053286060225e-06
|
372 |
+
},
|
373 |
+
{
|
374 |
+
"language": "Doteli",
|
375 |
+
"code": "dty",
|
376 |
+
"score": 5.510717073775595e-06
|
377 |
+
},
|
378 |
+
{
|
379 |
+
"language": "Portuguese",
|
380 |
+
"code": "por",
|
381 |
+
"score": 5.50901131646242e-06
|
382 |
+
},
|
383 |
+
{
|
384 |
+
"language": "Komi-Permyak",
|
385 |
+
"code": "koi",
|
386 |
+
"score": 5.447328476293478e-06
|
387 |
+
},
|
388 |
+
{
|
389 |
+
"language": "Eastern Mari",
|
390 |
+
"code": "mhr",
|
391 |
+
"score": 5.414771294454113e-06
|
392 |
+
},
|
393 |
+
{
|
394 |
+
"language": "Lezghian",
|
395 |
+
"code": "lez",
|
396 |
+
"score": 5.2741329454875086e-06
|
397 |
+
},
|
398 |
+
{
|
399 |
+
"language": "Nepali (macrolanguage)",
|
400 |
+
"code": "nep",
|
401 |
+
"score": 5.273408532957546e-06
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"language": "Samogitian",
|
405 |
+
"code": "sgs",
|
406 |
+
"score": 5.207636149862083e-06
|
407 |
+
},
|
408 |
+
{
|
409 |
+
"language": "Bhojpuri",
|
410 |
+
"code": "bho",
|
411 |
+
"score": 5.19551804245566e-06
|
412 |
+
},
|
413 |
+
{
|
414 |
+
"language": "Occitan",
|
415 |
+
"code": "oci",
|
416 |
+
"score": 5.172901182959322e-06
|
417 |
+
},
|
418 |
+
{
|
419 |
+
"language": "Western Frisian",
|
420 |
+
"code": "fry",
|
421 |
+
"score": 5.066170615464216e-06
|
422 |
+
},
|
423 |
+
{
|
424 |
+
"language": "Vlaams",
|
425 |
+
"code": "vls",
|
426 |
+
"score": 5.014707312511746e-06
|
427 |
+
},
|
428 |
+
{
|
429 |
+
"language": "Japanese",
|
430 |
+
"code": "jpn",
|
431 |
+
"score": 4.986791282135528e-06
|
432 |
+
},
|
433 |
+
{
|
434 |
+
"language": "V\u00f5ro",
|
435 |
+
"code": "vro",
|
436 |
+
"score": 4.9785726332629565e-06
|
437 |
+
},
|
438 |
+
{
|
439 |
+
"language": "Rusyn",
|
440 |
+
"code": "rue",
|
441 |
+
"score": 4.937043286190601e-06
|
442 |
+
},
|
443 |
+
{
|
444 |
+
"language": "Hindi",
|
445 |
+
"code": "hin",
|
446 |
+
"score": 4.9325194595439825e-06
|
447 |
+
},
|
448 |
+
{
|
449 |
+
"language": "Sicilian",
|
450 |
+
"code": "scn",
|
451 |
+
"score": 4.8434171731059905e-06
|
452 |
+
},
|
453 |
+
{
|
454 |
+
"language": "Somali",
|
455 |
+
"code": "som",
|
456 |
+
"score": 4.722482117358595e-06
|
457 |
+
},
|
458 |
+
{
|
459 |
+
"language": "Galician",
|
460 |
+
"code": "glg",
|
461 |
+
"score": 4.664954758482054e-06
|
462 |
+
},
|
463 |
+
{
|
464 |
+
"language": "Kazakh",
|
465 |
+
"code": "kaz",
|
466 |
+
"score": 4.485120825847844e-06
|
467 |
+
},
|
468 |
+
{
|
469 |
+
"language": "Kannada",
|
470 |
+
"code": "kan",
|
471 |
+
"score": 4.438274572748924e-06
|
472 |
+
},
|
473 |
+
{
|
474 |
+
"language": "Oromo",
|
475 |
+
"code": "orm",
|
476 |
+
"score": 4.422903202794259e-06
|
477 |
+
},
|
478 |
+
{
|
479 |
+
"language": "Albanian",
|
480 |
+
"code": "sqi",
|
481 |
+
"score": 4.410150268085999e-06
|
482 |
+
},
|
483 |
+
{
|
484 |
+
"language": "Minangkabau",
|
485 |
+
"code": "min",
|
486 |
+
"score": 4.407007509144023e-06
|
487 |
+
},
|
488 |
+
{
|
489 |
+
"language": "Finnish",
|
490 |
+
"code": "fin",
|
491 |
+
"score": 4.374884611024754e-06
|
492 |
+
},
|
493 |
+
{
|
494 |
+
"language": "Ossetian",
|
495 |
+
"code": "oss",
|
496 |
+
"score": 4.322507265897002e-06
|
497 |
+
},
|
498 |
+
{
|
499 |
+
"language": "Volap\u00fck",
|
500 |
+
"code": "vol",
|
501 |
+
"score": 4.30220188718522e-06
|
502 |
+
},
|
503 |
+
{
|
504 |
+
"language": "Min Nan Chinese",
|
505 |
+
"code": "nan",
|
506 |
+
"score": 4.2357942220405675e-06
|
507 |
+
},
|
508 |
+
{
|
509 |
+
"language": "Bashkir",
|
510 |
+
"code": "bak",
|
511 |
+
"score": 4.212616204313235e-06
|
512 |
+
},
|
513 |
+
{
|
514 |
+
"language": "Ligurian",
|
515 |
+
"code": "lij",
|
516 |
+
"score": 4.1821313061518595e-06
|
517 |
+
},
|
518 |
+
{
|
519 |
+
"language": "Welsh",
|
520 |
+
"code": "cym",
|
521 |
+
"score": 4.174029982095817e-06
|
522 |
+
},
|
523 |
+
{
|
524 |
+
"language": "Slovene",
|
525 |
+
"code": "slv",
|
526 |
+
"score": 4.172954504610971e-06
|
527 |
+
},
|
528 |
+
{
|
529 |
+
"language": "Dimli",
|
530 |
+
"code": "diq",
|
531 |
+
"score": 4.078176516486565e-06
|
532 |
+
},
|
533 |
+
{
|
534 |
+
"language": "Chuvash",
|
535 |
+
"code": "chv",
|
536 |
+
"score": 4.048466053063748e-06
|
537 |
+
},
|
538 |
+
{
|
539 |
+
"language": "Panjabi",
|
540 |
+
"code": "pan",
|
541 |
+
"score": 3.940522674383828e-06
|
542 |
+
},
|
543 |
+
{
|
544 |
+
"language": "Cornish",
|
545 |
+
"code": "cor",
|
546 |
+
"score": 3.940297119697789e-06
|
547 |
+
},
|
548 |
+
{
|
549 |
+
"language": "West Low German",
|
550 |
+
"code": "nds-nl",
|
551 |
+
"score": 3.926987574232044e-06
|
552 |
+
},
|
553 |
+
{
|
554 |
+
"language": "Cherokee",
|
555 |
+
"code": "chr",
|
556 |
+
"score": 3.9112833292165305e-06
|
557 |
+
},
|
558 |
+
{
|
559 |
+
"language": "Ido",
|
560 |
+
"code": "ido",
|
561 |
+
"score": 3.892145286954474e-06
|
562 |
+
},
|
563 |
+
{
|
564 |
+
"language": "Friulian",
|
565 |
+
"code": "fur",
|
566 |
+
"score": 3.869370175380027e-06
|
567 |
+
},
|
568 |
+
{
|
569 |
+
"language": "Ukrainian",
|
570 |
+
"code": "ukr",
|
571 |
+
"score": 3.7814761526533403e-06
|
572 |
+
},
|
573 |
+
{
|
574 |
+
"language": "Vietnamese",
|
575 |
+
"code": "vie",
|
576 |
+
"score": 3.7795757634739857e-06
|
577 |
+
},
|
578 |
+
{
|
579 |
+
"language": "Emilian",
|
580 |
+
"code": "egl",
|
581 |
+
"score": 3.7286854421836324e-06
|
582 |
+
},
|
583 |
+
{
|
584 |
+
"language": "Hungarian",
|
585 |
+
"code": "hun",
|
586 |
+
"score": 3.706084498844575e-06
|
587 |
+
},
|
588 |
+
{
|
589 |
+
"language": "Haitian Creole",
|
590 |
+
"code": "hat",
|
591 |
+
"score": 3.6860656109638512e-06
|
592 |
+
},
|
593 |
+
{
|
594 |
+
"language": "Jamaican Patois",
|
595 |
+
"code": "jam",
|
596 |
+
"score": 3.6750652725459076e-06
|
597 |
+
},
|
598 |
+
{
|
599 |
+
"language": "Turkmen",
|
600 |
+
"code": "tuk",
|
601 |
+
"score": 3.6414037367649144e-06
|
602 |
+
},
|
603 |
+
{
|
604 |
+
"language": "Gagauz",
|
605 |
+
"code": "gag",
|
606 |
+
"score": 3.6310443647380453e-06
|
607 |
+
},
|
608 |
+
{
|
609 |
+
"language": "Yakut",
|
610 |
+
"code": "sah",
|
611 |
+
"score": 3.611620968513307e-06
|
612 |
+
},
|
613 |
+
{
|
614 |
+
"language": "Breton",
|
615 |
+
"code": "bre",
|
616 |
+
"score": 3.5204120649723336e-06
|
617 |
+
},
|
618 |
+
{
|
619 |
+
"language": "Afrikaans",
|
620 |
+
"code": "afr",
|
621 |
+
"score": 3.5164177916158224e-06
|
622 |
+
},
|
623 |
+
{
|
624 |
+
"language": "Assamese",
|
625 |
+
"code": "asm",
|
626 |
+
"score": 3.5076063795713708e-06
|
627 |
+
},
|
628 |
+
{
|
629 |
+
"language": "Crimean Tatar",
|
630 |
+
"code": "crh",
|
631 |
+
"score": 3.4974791560671292e-06
|
632 |
+
},
|
633 |
+
{
|
634 |
+
"language": "Tswana",
|
635 |
+
"code": "tsn",
|
636 |
+
"score": 3.4639840578165604e-06
|
637 |
+
},
|
638 |
+
{
|
639 |
+
"language": "Malagasy",
|
640 |
+
"code": "mlg",
|
641 |
+
"score": 3.4424308523739455e-06
|
642 |
+
},
|
643 |
+
{
|
644 |
+
"language": "Tamil",
|
645 |
+
"code": "tam",
|
646 |
+
"score": 3.433554866205668e-06
|
647 |
+
},
|
648 |
+
{
|
649 |
+
"language": "Belarusian (Taraschkewiza)",
|
650 |
+
"code": "be-tarask",
|
651 |
+
"score": 3.4065565159835387e-06
|
652 |
+
},
|
653 |
+
{
|
654 |
+
"language": "Scottish Gaelic",
|
655 |
+
"code": "gla",
|
656 |
+
"score": 3.383374632903724e-06
|
657 |
+
},
|
658 |
+
{
|
659 |
+
"language": "Latin",
|
660 |
+
"code": "lat",
|
661 |
+
"score": 3.299320724181598e-06
|
662 |
+
},
|
663 |
+
{
|
664 |
+
"language": "Chavacano",
|
665 |
+
"code": "cbk",
|
666 |
+
"score": 3.277132236689795e-06
|
667 |
+
},
|
668 |
+
{
|
669 |
+
"language": "Tarantino dialect",
|
670 |
+
"code": "roa-tara",
|
671 |
+
"score": 3.2704483601264656e-06
|
672 |
+
},
|
673 |
+
{
|
674 |
+
"language": "Modern Greek",
|
675 |
+
"code": "ell",
|
676 |
+
"score": 3.2669522624928504e-06
|
677 |
+
},
|
678 |
+
{
|
679 |
+
"language": "Ladino",
|
680 |
+
"code": "lad",
|
681 |
+
"score": 3.1890219815977616e-06
|
682 |
+
},
|
683 |
+
{
|
684 |
+
"language": "Latgalian",
|
685 |
+
"code": "ltg",
|
686 |
+
"score": 3.1830948046263075e-06
|
687 |
+
},
|
688 |
+
{
|
689 |
+
"language": "Pampanga",
|
690 |
+
"code": "pam",
|
691 |
+
"score": 3.1460281206818763e-06
|
692 |
+
},
|
693 |
+
{
|
694 |
+
"language": "Tagalog",
|
695 |
+
"code": "tgl",
|
696 |
+
"score": 3.100457433902193e-06
|
697 |
+
},
|
698 |
+
{
|
699 |
+
"language": "Hebrew",
|
700 |
+
"code": "heb",
|
701 |
+
"score": 3.0715009415871464e-06
|
702 |
+
},
|
703 |
+
{
|
704 |
+
"language": "Serbo-Croatian",
|
705 |
+
"code": "hbs",
|
706 |
+
"score": 3.050950908800587e-06
|
707 |
+
},
|
708 |
+
{
|
709 |
+
"language": "Achinese",
|
710 |
+
"code": "ace",
|
711 |
+
"score": 3.0138855890982086e-06
|
712 |
+
},
|
713 |
+
{
|
714 |
+
"language": "Italian",
|
715 |
+
"code": "ita",
|
716 |
+
"score": 3.003329993589432e-06
|
717 |
+
},
|
718 |
+
{
|
719 |
+
"language": "English",
|
720 |
+
"code": "eng",
|
721 |
+
"score": 2.97778979074792e-06
|
722 |
+
},
|
723 |
+
{
|
724 |
+
"language": "Burmese",
|
725 |
+
"code": "mya",
|
726 |
+
"score": 2.9546490623033606e-06
|
727 |
+
},
|
728 |
+
{
|
729 |
+
"language": "Spanish",
|
730 |
+
"code": "spa",
|
731 |
+
"score": 2.9272057417983888e-06
|
732 |
+
},
|
733 |
+
{
|
734 |
+
"language": "Papiamento",
|
735 |
+
"code": "pap",
|
736 |
+
"score": 2.8780641514458694e-06
|
737 |
+
},
|
738 |
+
{
|
739 |
+
"language": "Sardinian",
|
740 |
+
"code": "srd",
|
741 |
+
"score": 2.866505383281037e-06
|
742 |
+
},
|
743 |
+
{
|
744 |
+
"language": "Esperanto",
|
745 |
+
"code": "epo",
|
746 |
+
"score": 2.848199301297427e-06
|
747 |
+
},
|
748 |
+
{
|
749 |
+
"language": "Serbian",
|
750 |
+
"code": "srp",
|
751 |
+
"score": 2.7479175059852423e-06
|
752 |
+
},
|
753 |
+
{
|
754 |
+
"language": "Zeeuws",
|
755 |
+
"code": "zea",
|
756 |
+
"score": 2.7430314730736427e-06
|
757 |
+
},
|
758 |
+
{
|
759 |
+
"language": "Czech",
|
760 |
+
"code": "ces",
|
761 |
+
"score": 2.7409500944486354e-06
|
762 |
+
},
|
763 |
+
{
|
764 |
+
"language": "Bengali",
|
765 |
+
"code": "ben",
|
766 |
+
"score": 2.6958239232044434e-06
|
767 |
+
},
|
768 |
+
{
|
769 |
+
"language": "Erzya",
|
770 |
+
"code": "myv",
|
771 |
+
"score": 2.6273187359038275e-06
|
772 |
+
},
|
773 |
+
{
|
774 |
+
"language": "Croatian",
|
775 |
+
"code": "hrv",
|
776 |
+
"score": 2.6178654479735997e-06
|
777 |
+
},
|
778 |
+
{
|
779 |
+
"language": "Buryat",
|
780 |
+
"code": "bxr",
|
781 |
+
"score": 2.60430465459649e-06
|
782 |
+
},
|
783 |
+
{
|
784 |
+
"language": "Swahili (macrolanguage)",
|
785 |
+
"code": "swa",
|
786 |
+
"score": 2.6016373340098653e-06
|
787 |
+
},
|
788 |
+
{
|
789 |
+
"language": "Pangasinan",
|
790 |
+
"code": "pag",
|
791 |
+
"score": 2.60037768384791e-06
|
792 |
+
},
|
793 |
+
{
|
794 |
+
"language": "Xhosa",
|
795 |
+
"code": "xho",
|
796 |
+
"score": 2.580123918960453e-06
|
797 |
+
},
|
798 |
+
{
|
799 |
+
"language": "Bosnian",
|
800 |
+
"code": "bos",
|
801 |
+
"score": 2.5763115445442963e-06
|
802 |
+
},
|
803 |
+
{
|
804 |
+
"language": "Low German",
|
805 |
+
"code": "nds",
|
806 |
+
"score": 2.5743340756889665e-06
|
807 |
+
},
|
808 |
+
{
|
809 |
+
"language": "Kinyarwanda",
|
810 |
+
"code": "kin",
|
811 |
+
"score": 2.568235458966228e-06
|
812 |
+
},
|
813 |
+
{
|
814 |
+
"language": "Aromanian",
|
815 |
+
"code": "rup",
|
816 |
+
"score": 2.520287125662435e-06
|
817 |
+
},
|
818 |
+
{
|
819 |
+
"language": "Aragonese",
|
820 |
+
"code": "arg",
|
821 |
+
"score": 2.4836215288814856e-06
|
822 |
+
},
|
823 |
+
{
|
824 |
+
"language": "Tetum",
|
825 |
+
"code": "tet",
|
826 |
+
"score": 2.396502168267034e-06
|
827 |
+
},
|
828 |
+
{
|
829 |
+
"language": "Quechua",
|
830 |
+
"code": "que",
|
831 |
+
"score": 2.3799134396540467e-06
|
832 |
+
},
|
833 |
+
{
|
834 |
+
"language": "Livvi-Karelian",
|
835 |
+
"code": "olo",
|
836 |
+
"score": 2.3709426386631094e-06
|
837 |
+
},
|
838 |
+
{
|
839 |
+
"language": "Kashubian",
|
840 |
+
"code": "csb",
|
841 |
+
"score": 2.358733354412834e-06
|
842 |
+
},
|
843 |
+
{
|
844 |
+
"language": "Avar",
|
845 |
+
"code": "ava",
|
846 |
+
"score": 2.330698407604359e-06
|
847 |
+
},
|
848 |
+
{
|
849 |
+
"language": "Hausa",
|
850 |
+
"code": "hau",
|
851 |
+
"score": 2.286114295202424e-06
|
852 |
+
},
|
853 |
+
{
|
854 |
+
"language": "Ripuarisch",
|
855 |
+
"code": "ksh",
|
856 |
+
"score": 2.254129412904149e-06
|
857 |
+
},
|
858 |
+
{
|
859 |
+
"language": "Bulgarian",
|
860 |
+
"code": "bul",
|
861 |
+
"score": 2.2492179141408997e-06
|
862 |
+
},
|
863 |
+
{
|
864 |
+
"language": "Oriya",
|
865 |
+
"code": "ori",
|
866 |
+
"score": 2.1661755909008207e-06
|
867 |
+
},
|
868 |
+
{
|
869 |
+
"language": "Interlingue",
|
870 |
+
"code": "ile",
|
871 |
+
"score": 2.059975486190524e-06
|
872 |
+
},
|
873 |
+
{
|
874 |
+
"language": "Guarani",
|
875 |
+
"code": "grn",
|
876 |
+
"score": 2.024690957114217e-06
|
877 |
+
},
|
878 |
+
{
|
879 |
+
"language": "Banjar",
|
880 |
+
"code": "bjn",
|
881 |
+
"score": 2.0237362150510307e-06
|
882 |
+
},
|
883 |
+
{
|
884 |
+
"language": "Thai",
|
885 |
+
"code": "tha",
|
886 |
+
"score": 2.01868806470884e-06
|
887 |
+
},
|
888 |
+
{
|
889 |
+
"language": "Dutch",
|
890 |
+
"code": "nld",
|
891 |
+
"score": 1.9297158360132016e-06
|
892 |
+
},
|
893 |
+
{
|
894 |
+
"language": "Kabyle",
|
895 |
+
"code": "kab",
|
896 |
+
"score": 1.9132662600895856e-06
|
897 |
+
},
|
898 |
+
{
|
899 |
+
"language": "Palatine German",
|
900 |
+
"code": "pfl",
|
901 |
+
"score": 1.9122355752188014e-06
|
902 |
+
},
|
903 |
+
{
|
904 |
+
"language": "Javanese",
|
905 |
+
"code": "jav",
|
906 |
+
"score": 1.8900879013017402e-06
|
907 |
+
},
|
908 |
+
{
|
909 |
+
"language": "Banyumasan",
|
910 |
+
"code": "map-bms",
|
911 |
+
"score": 1.8552185565567925e-06
|
912 |
+
},
|
913 |
+
{
|
914 |
+
"language": "Faroese",
|
915 |
+
"code": "fao",
|
916 |
+
"score": 1.8414674514133367e-06
|
917 |
+
},
|
918 |
+
{
|
919 |
+
"language": "Scots",
|
920 |
+
"code": "sco",
|
921 |
+
"score": 1.818199393710529e-06
|
922 |
+
},
|
923 |
+
{
|
924 |
+
"language": "Central Khmer",
|
925 |
+
"code": "khm",
|
926 |
+
"score": 1.7993022538576042e-06
|
927 |
+
},
|
928 |
+
{
|
929 |
+
"language": "Slovak",
|
930 |
+
"code": "slk",
|
931 |
+
"score": 1.7988603531193803e-06
|
932 |
+
},
|
933 |
+
{
|
934 |
+
"language": "Belarusian",
|
935 |
+
"code": "bel",
|
936 |
+
"score": 1.782583581189101e-06
|
937 |
+
},
|
938 |
+
{
|
939 |
+
"language": "Swedish",
|
940 |
+
"code": "swe",
|
941 |
+
"score": 1.7702136574371252e-06
|
942 |
+
},
|
943 |
+
{
|
944 |
+
"language": "Saterfriesisch",
|
945 |
+
"code": "stq",
|
946 |
+
"score": 1.7663436437942437e-06
|
947 |
+
},
|
948 |
+
{
|
949 |
+
"language": "Latvian",
|
950 |
+
"code": "lav",
|
951 |
+
"score": 1.7178032294395962e-06
|
952 |
+
},
|
953 |
+
{
|
954 |
+
"language": "Konkani",
|
955 |
+
"code": "kok",
|
956 |
+
"score": 1.690383783170546e-06
|
957 |
+
},
|
958 |
+
{
|
959 |
+
"language": "Tuvan",
|
960 |
+
"code": "tyv",
|
961 |
+
"score": 1.672853159107035e-06
|
962 |
+
},
|
963 |
+
{
|
964 |
+
"language": "Walloon",
|
965 |
+
"code": "wln",
|
966 |
+
"score": 1.6722132158975e-06
|
967 |
+
},
|
968 |
+
{
|
969 |
+
"language": "Sranan",
|
970 |
+
"code": "srn",
|
971 |
+
"score": 1.646132773203135e-06
|
972 |
+
},
|
973 |
+
{
|
974 |
+
"language": "Picard",
|
975 |
+
"code": "pcd",
|
976 |
+
"score": 1.6385885146519286e-06
|
977 |
+
},
|
978 |
+
{
|
979 |
+
"language": "Limburgan",
|
980 |
+
"code": "lim",
|
981 |
+
"score": 1.6372666777897393e-06
|
982 |
+
},
|
983 |
+
{
|
984 |
+
"language": "French",
|
985 |
+
"code": "fra",
|
986 |
+
"score": 1.6239549722740776e-06
|
987 |
+
},
|
988 |
+
{
|
989 |
+
"language": "Icelandic",
|
990 |
+
"code": "isl",
|
991 |
+
"score": 1.5904075780781568e-06
|
992 |
+
},
|
993 |
+
{
|
994 |
+
"language": "Irish",
|
995 |
+
"code": "gle",
|
996 |
+
"score": 1.5750525790281245e-06
|
997 |
+
},
|
998 |
+
{
|
999 |
+
"language": "Corsican",
|
1000 |
+
"code": "cos",
|
1001 |
+
"score": 1.570832523611898e-06
|
1002 |
+
},
|
1003 |
+
{
|
1004 |
+
"language": "Alemannic German",
|
1005 |
+
"code": "als",
|
1006 |
+
"score": 1.5651218063794659e-06
|
1007 |
+
},
|
1008 |
+
{
|
1009 |
+
"language": "German",
|
1010 |
+
"code": "deu",
|
1011 |
+
"score": 1.5594737305946182e-06
|
1012 |
+
},
|
1013 |
+
{
|
1014 |
+
"language": "Upper Sorbian",
|
1015 |
+
"code": "hsb",
|
1016 |
+
"score": 1.5125158370210556e-06
|
1017 |
+
},
|
1018 |
+
{
|
1019 |
+
"language": "Romanian",
|
1020 |
+
"code": "ron",
|
1021 |
+
"score": 1.5119784393391456e-06
|
1022 |
+
},
|
1023 |
+
{
|
1024 |
+
"language": "Manx",
|
1025 |
+
"code": "glv",
|
1026 |
+
"score": 1.5035052456369158e-06
|
1027 |
+
},
|
1028 |
+
{
|
1029 |
+
"language": "Lingala",
|
1030 |
+
"code": "lin",
|
1031 |
+
"score": 1.493238073635439e-06
|
1032 |
+
},
|
1033 |
+
{
|
1034 |
+
"language": "Malay",
|
1035 |
+
"code": "msa",
|
1036 |
+
"score": 1.4067626352698426e-06
|
1037 |
+
},
|
1038 |
+
{
|
1039 |
+
"language": "Maltese",
|
1040 |
+
"code": "mlt",
|
1041 |
+
"score": 1.370485165352875e-06
|
1042 |
+
},
|
1043 |
+
{
|
1044 |
+
"language": "Luxembourgish",
|
1045 |
+
"code": "ltz",
|
1046 |
+
"score": 1.3397349221122568e-06
|
1047 |
+
},
|
1048 |
+
{
|
1049 |
+
"language": "Estonian",
|
1050 |
+
"code": "est",
|
1051 |
+
"score": 1.3280839539220324e-06
|
1052 |
+
},
|
1053 |
+
{
|
1054 |
+
"language": "Kabardian",
|
1055 |
+
"code": "kbd",
|
1056 |
+
"score": 1.3062604011793155e-06
|
1057 |
+
},
|
1058 |
+
{
|
1059 |
+
"language": "Macedonian",
|
1060 |
+
"code": "mkd",
|
1061 |
+
"score": 1.2802570381609257e-06
|
1062 |
+
},
|
1063 |
+
{
|
1064 |
+
"language": "Pennsylvania German",
|
1065 |
+
"code": "pdc",
|
1066 |
+
"score": 1.2550040082714986e-06
|
1067 |
+
},
|
1068 |
+
{
|
1069 |
+
"language": "Sundanese",
|
1070 |
+
"code": "sun",
|
1071 |
+
"score": 1.1068191270169336e-06
|
1072 |
+
},
|
1073 |
+
{
|
1074 |
+
"language": "Iloko",
|
1075 |
+
"code": "ilo",
|
1076 |
+
"score": 1.0791690101541462e-06
|
1077 |
+
},
|
1078 |
+
{
|
1079 |
+
"language": "Karakalpak",
|
1080 |
+
"code": "kaa",
|
1081 |
+
"score": 1.0603262126096524e-06
|
1082 |
+
},
|
1083 |
+
{
|
1084 |
+
"language": "Norwegian Nynorsk",
|
1085 |
+
"code": "nno",
|
1086 |
+
"score": 1.0554679192864569e-06
|
1087 |
+
},
|
1088 |
+
{
|
1089 |
+
"language": "Yoruba",
|
1090 |
+
"code": "yor",
|
1091 |
+
"score": 1.046297711582156e-06
|
1092 |
+
},
|
1093 |
+
{
|
1094 |
+
"language": "Neapolitan",
|
1095 |
+
"code": "nap",
|
1096 |
+
"score": 1.0279602520313347e-06
|
1097 |
+
},
|
1098 |
+
{
|
1099 |
+
"language": "Danish",
|
1100 |
+
"code": "dan",
|
1101 |
+
"score": 1.0038916116172913e-06
|
1102 |
+
},
|
1103 |
+
{
|
1104 |
+
"language": "Indonesian",
|
1105 |
+
"code": "ind",
|
1106 |
+
"score": 9.83746303973021e-07
|
1107 |
+
},
|
1108 |
+
{
|
1109 |
+
"language": "Mirandese",
|
1110 |
+
"code": "mwl",
|
1111 |
+
"score": 8.806521236692788e-07
|
1112 |
+
},
|
1113 |
+
{
|
1114 |
+
"language": "Catalan",
|
1115 |
+
"code": "cat",
|
1116 |
+
"score": 8.687447348165733e-07
|
1117 |
+
},
|
1118 |
+
{
|
1119 |
+
"language": "Turkish",
|
1120 |
+
"code": "tur",
|
1121 |
+
"score": 8.384120064874878e-07
|
1122 |
+
},
|
1123 |
+
{
|
1124 |
+
"language": "Veps",
|
1125 |
+
"code": "vep",
|
1126 |
+
"score": 7.812500371073838e-07
|
1127 |
+
},
|
1128 |
+
{
|
1129 |
+
"language": "Bokm\u00e5l",
|
1130 |
+
"code": "nob",
|
1131 |
+
"score": 7.427178161378833e-07
|
1132 |
+
},
|
1133 |
+
{
|
1134 |
+
"language": "Shona",
|
1135 |
+
"code": "sna",
|
1136 |
+
"score": 6.660703775196453e-07
|
1137 |
+
},
|
1138 |
+
{
|
1139 |
+
"language": "Bavarian",
|
1140 |
+
"code": "bar",
|
1141 |
+
"score": 6.222485353646334e-07
|
1142 |
+
},
|
1143 |
+
{
|
1144 |
+
"language": "Uzbek",
|
1145 |
+
"code": "uzb",
|
1146 |
+
"score": 6.021850822435226e-07
|
1147 |
+
},
|
1148 |
+
{
|
1149 |
+
"language": "Central Bikol",
|
1150 |
+
"code": "bcl",
|
1151 |
+
"score": 5.77034370508045e-07
|
1152 |
+
},
|
1153 |
+
{
|
1154 |
+
"language": "Asturian",
|
1155 |
+
"code": "ast",
|
1156 |
+
"score": 5.743918336520437e-07
|
1157 |
+
},
|
1158 |
+
{
|
1159 |
+
"language": "Lombard",
|
1160 |
+
"code": "lmo",
|
1161 |
+
"score": 4.6301857992148143e-07
|
1162 |
+
},
|
1163 |
+
{
|
1164 |
+
"language": "Romansh",
|
1165 |
+
"code": "roh",
|
1166 |
+
"score": 4.5534079617937095e-07
|
1167 |
+
},
|
1168 |
+
{
|
1169 |
+
"language": "Narom",
|
1170 |
+
"code": "nrm",
|
1171 |
+
"score": 3.6611126574825903e-07
|
1172 |
+
},
|
1173 |
+
{
|
1174 |
+
"language": "Northern Sami",
|
1175 |
+
"code": "sme",
|
1176 |
+
"score": 1.0723972820869676e-07
|
1177 |
+
}
|
1178 |
+
]
|
1179 |
+
]
|
libs/examples.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
EXAMPLES = {
|
2 |
+
'Example 1 - Paragraph - Wu Chinese': '里维拉军政府 ( 西班牙文 Directorio militar de Primo de Rivera 是20世纪20年代 西班牙军事人物大里维拉立在西班牙国王阿方索十三世搭西班牙天主教个支持下建立个威权主义政府 迭个政府自称超出传统个政治党派 一切为国王搭西班牙个稳定服务 伊拉个支持者是爱国同盟 弗过 因为遇到经济大萧条 威权政府立在1930年垮台 之后西班牙拿国王废脱 成立西班牙第二共和国',
|
3 |
+
'Example 2 - Paragraph - Scots': 'Polyandry ( frae Greek : πολυ - poly - , " " mony " " and ἀνήρ anēr , " " man " " ) involves mairiage that includes mair nor twa pairtners an can faw unner the broader category o polyamory . Mair speceefically , it is a form o polygamy , whaur a woman takes twa or mair husbands at the same time . Polyandry is contrasted wi polygyny , involvin ane male an twa or mair females . If a mairiage involves a plural nummer o " " husbands an wives " " pairteecipants o each gender , then it can be cried polyamory , group or conjynt mairiage . In its broadest uise , polyandry refers tae sexual relations wi multiple males within or withoot mairiage .',
|
4 |
+
'Example 3 - Paragraph - Sinhala': 'ට්රූඑක්ස් ක් රියාවලියට විකල්පයක් ලෙස මැලොන්ඩයමයිඩ් ( malondiamide ) යොදා ගන්නා නිස්සාරණ ක් රමයක් හදුන්වා දී ඇත . ඩයමෙක්ස් ( DIAMEX ; DIAMide Extraction ) ක් රියාවලියේ වාසියක් වන්නේ කාබන් , හයිඩ් රජන් , නයිට් රජන් හා ඔක්සිජන් හැර වෙනත් මූල ද් රව් ය අඩංගු කාබනික අපද් රව් ය උත්පාදනය වීම වැළකීමයි . මෙම අපද් රව් ය අම්ල වැසිවලට උරදෙන ආම්ලික වායු නොසෑදෙන සේ දහනය කළ හැකිය . ඩයමෙක්ස් ක් රමය ප් රංශ CEA මගින් යුරෝපයේ භාවිතයට ගනී . ක් රියාවලිය ගැන දැනට පවතින දැනුමෙන් කාර්මික කම්හල් ඉදි කිරීමට තරම් මෙම ක් රියාවලිය පරිණත වී ඇත . මෙම ක් රියාවලිය ද භාවිතා කරන්නේ ද් රව් යතා යාන්ත් රණයක්ය .',
|
5 |
+
'Example 4 - Paragraph - Asturian': "Presupuestos públicos pa la igualdá y el desenvolvimientu sustentable para con ello tresformar l ' actual modelu de desenvolvimientu atendiendo les causes estructurales que xeneren y reproducen desigualdaes por ello participa nel ambito internacional y rexonal nel siguimientu de los Oxetivos de Desenvolvimientu del Mileniu ( ODS ) y agora na axenda 2030 .",
|
6 |
+
'Example 5 - Paragraph - Swahili (macrolanguage)': 'Kuna nyota nyingi katika eneo la kundinyota hii lakini zote si angavu sana . Nyota angavu zaidi ni Beta Aquarii ambayo ni nyota jitu kubwa njano mwenye uangavu unaoonekana wa 2 . 9 .',
|
7 |
+
'Example 6 - Paragraph - Czech': 'CASSE , Gilbert ; CUNDALL , Peter ; TULLY , Anthony . IJN ARGENTINA MARU : Tabular Record of Movement [ online ] . combinedfleet . com , 2015 - 09 - 13 , [ cit . 2015 - 09 - 13 ] . Dostupné online . ( anglicky )',
|
8 |
+
'Example 7 - Paragraph - Lingala': 'Mayanzi ekutanaka mingi mingi na Afrika , na amérika mpé na bisika ya moyi makasi . Liyanzi elingaka bisika ya zélo mpé ya salité mpo na ko kota na nzoto ya nyama mosusu to na nzoto ya moto .',
|
9 |
+
'Example 8 - Paragraph - Thai': 'โดยที่ an > 0 สำหรับทุก n แต่ละพจน์ของอนุกรมจะมีเครื่องหมายบวกและลบสลับกัน เช่นเดียวกับอนุกรมอื่นๆ อนุกรมสลับจะลู่เข้าก็ต่อเมื่อลำดับของผลบวกจำกัดพจน์ลู่เข้า',
|
10 |
+
'Example 9 - Paragraph - Waray': 'An Orphnus rufithorax in uska species han Coleoptera nga ginhulagway ni Benderitter hadton 1914 . An Orphnus rufithorax in nahilalakip ha genus nga Orphnus , ngan familia nga Orphnidae . Waray hini subspecies nga nakalista .',
|
11 |
+
'Example 10 - Paragraph - Gagauz': 'Ama buna bakmadaan , masmediya milionnarca insannarın ölmesinnän korkudêr . Sansın büün medişina XIX - cu üzyılın uurunda bulunarmış gibi . Düünnedä panika başlamaya yakın .',
|
12 |
+
'Example 11 - Paragraph - Western Panjabi': 'پینڈولم دی فزکس سب توں پہلا گلیلیو نیں 1602 دی چ سمجی سی تے اس ویلے توں 1930 تک پینڈولم نوں ویلے دے سب توں ٹھیک حساب لئی ورتیا جاندا ��ی پینڈولم نوں ویلا نپن دے علاوہ زمین دی کچھ نپن تے بھونچال دے زور نپن لئی وی ورتیا جاندا سی',
|
13 |
+
'Example 12 - Paragraph - West Low German': 'Hoog Buurlo is n gehucht in de gemeente Apeldoorne , in de Nederlandse provinsie Gelderland . t Ligt ten westen van de stad Apeldoorne en iets ten oosten van Radio Kootwiek .',
|
14 |
+
'Example 13 - Paragraph - Quechua': '| 5 ñiqin pachakwata | 6 ñiqin pachakwata | 7 ñiqin pachakwata | | 510 watakuna | 520 watakuna | 530 watakuna | 540 watakuna | 550 watakuna | 560 watakuna | 570 watakuna | | | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | |',
|
15 |
+
'Example 14 - Paragraph - Serbo-Croatian': 'Sočanica je naselje u opštini Leposavić na Kosovu i Metohiji . Površina katastarske opštine Sočanica gde je atar naselja iznosi 1 . 929 ha . Sedište je mesne zajednice Sočanica . Naselje Sočanica poslednjih godina prerasta iz seoskog naselja u varošicu , nalazi se 5 km južno od Leposavića sa desne strane reke Ibar . Srednja nadmorska visina naselja iznosi 636m . U pisanim izvorima selo se prvi put pominje 1315 . godine , u povelji srpskog kralja Stefana Milutina manastiru Banjskoj .',
|
16 |
+
'Example 15 - Paragraph - Tongan': 'ʻOku pehē ʻe Niel Gunson naʻe ʻafifio ʻa e Tuʻi Tonga ʻi he taimi ko ia ʻi Manuʻa pea naʻa nau hoko ki he Tuʻi Manuʻa . Ko ia ai ʻoku hala ha tuʻi ʻi Tongatapu , pea naʻe kamata ha holongā tuʻi foʻou .',
|
17 |
+
'Example 16 - Paragraph - Rusyn': 'Прыпять - є єднов з найдовшых рік на Україні і Европы . Тече на теріторії Білорусії і Україны . Довжка рікы є 775 км . Коло міста Чорнобыль ся вливать до водозбіру рікы Днїпр і так там кінчыть свою путь .',
|
18 |
+
'Example 17 - Paragraph - Ladino': 'Ciruelos de Cervera es un puevlo de la Provinsia de Burgos en la junta de Kastiya i Leon en Espanya . Tiene una povlasion de 111 avitants i una ekstension de 37 , 874 km² ( 2015 ) .',
|
19 |
+
'Example 18 - Paragraph - Bosnian': 'Po posljednjem službenom popisu stanovništva iz 1991 . godine , općina Hadžići ( u to vrijeme jedna od 5 prigradskih općina Grada Sarajeva ) imala je 24 . 200 stanovnika , raspoređenih u 62 naselja .',
|
20 |
+
'Example 19 - Paragraph - Chuvash': 'Мăн Агыйдел ( Агыйдел ) Раççей территоринчи юханшыв . Вологда облаçĕ , Киров облаçĕ , Коми Республики территорипе юхать . Шарженг юханшывăн сылтăм çыранĕпе 143 км вăрринчен юханшыва юхса кĕрет . Юханшыв тăршшĕ 10 км .',
|
21 |
+
'Example 20 - Paragraph - Dhivehi': 'ގުރުދާ ބަލި ބޮޑުވަމުންދާވަރަކަށް ލޭގައި ޖަމާވަމުންދާ ބޭކާރު މާއްދާތައް ހަށިގަނޑުން ބޭރުކުރުމަށް ހަށިގަނޑު ނުކުޅެދެއެވެ . ނަތީޖާއެއްގެ ގޮތުގައި މިމާއްދާތައް ގިނަވެ ވިހަވާ މިންވަރަށް އިތުރުވެއެވެ . ލޭގެ Pްރެޝަރ އިތުރުވެފައިވާނަމަ މިމައްސަލަ އިތުރަށް ގޯސްވެއެވެ . ގުރުދާ ބަލިން ރައްކާތެރިވެ ވީހާވެސް ކުރިން ސިއްހީ ފަރުވާ ފެށުމަށްޓަކައި ބަލީގެ ކުރީކޮޅުގައި ބަލި ފާހަގަކުރުމަށް މަސައްކަތް ކުރުމަކީ މުހިއްމު ކަމެކެވެ .',
|
22 |
+
'Example 21 - Sentence - Western Panjabi': 'یکی از این آبخورگاه ها در ضلع شرقی صحن و در مقابل مقبره راجه قرار داشته و بهره هند ( طایفه ای از اسماعیلیان ) آن را نوسازی کرده بودند و در جوار آن نیز دو درخت میوه و یک درخت سدر بوده است .',
|
23 |
+
'Example 22 - Sentence - Tamil': 'மேற்குறிப்பிட்ட கட்சிகளைத் தவிர முஸ்லிம் லீக் , இந்திய கம்யூனிஸ்ட் கட்சி , ஃபார்வார்டு ப்ளாக் , சி .',
|
24 |
+
'Example 23 - Sentence - Basque': 'Hala ere , garai horretan ELAk ez zituen langile etorkinak onartzen , afiliatzeko lehen lau abizenetatik bat gutxienez euskal jatorrikoa izatea eskatzen baitzuen oraindik .',
|
25 |
+
'Example 24 - Sentence - Livvi-Karelian': 'Niilöis lapset lujendetah omua tervehytty , harjavutah vedeh .',
|
26 |
+
'Example 25 - Sentence - Eastern Mari': '3 : Тÿнямбал да руш классике , рушлаш кусарыме сборник - влак .',
|
27 |
+
'Example 26 - Sentence - Breton': "Brudet eo bet e Breizh abalamour d ' e enebiezh ruz ouzh Diwan hag ouzh ar brezhoneg .",
|
28 |
+
'Example 27 - Sentence - Bosnian': 'Sa 52 godine vratio se u Veneciju i ponudio svoje usluge svojim donedavnim progoniteljima za koje je radio kao špijun i za to su ga plaćali .',
|
29 |
+
'Example 28 - Sentence - Uzbek': 'diapazoni N .', 'Example 29 - Sentence - Esperanto': 'Loa !',
|
30 |
+
'Example 30 - Sentence - Lezghian': 'Кабир - Казмаляр ) Дагъустан республикадин Мегьарамдхуьруьн районда авай , « КьепIир Къазмайрин » хуьруьнсоветдик акатзавай хуьр .',
|
31 |
+
'Example 31 - Sentence - Norwegian Nynorsk': 'Morfaren var elles ein velkjend filosof og samfunnskritikar , Aleksandr Radisjtsjev .',
|
32 |
+
'Example 32 - Sentence - Papiamento': 'Esaki tabata inaceptabel pa e politiconan di Curaçao .',
|
33 |
+
'Example 33 - Sentence - Azerbaijani': 'Müharibədən evə sağ - salamat dönən Xındı Məmməd öz gözəl sənətini davam etdirmişdir .',
|
34 |
+
'Example 34 - Sentence - Volapük': 'Lödanadensit äbinon mö mens 460 , 2 a km² .',
|
35 |
+
'Example 35 - Sentence - Konkani': '13 No creature is concealed from him , but everything is naked and exposed to the eyes of him to whom we must render an account',
|
36 |
+
'Example 36 - Sentence - Latgalian': '1990 godā solu reorganizej kai pogostu .',
|
37 |
+
'Example 37 - Sentence - Swedish': 'Inga underarter finns listade i Catalogue of Life .',
|
38 |
+
'Example 38 - Sentence - Tuvan': 'Ол ам оралакчы сайыт апарган кижи чүге дуза кадып шыдавас деп бодап , эжинге бүзүрел Серге - Байырны ооӊ - биле ужуражылгаже эккээр .',
|
39 |
+
'Example 39 - Sentence - Malagasy': "200 no isan ' ny kisoa .",
|
40 |
+
'Example 40 - Sentence - English': 'The convention followed after a request by the Bulgarian government on 24 September asking for a ceasefire .'}
|
libs/languages.py
ADDED
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
languages = {
|
2 |
+
"ace": "Achinese",
|
3 |
+
"afr": "Afrikaans",
|
4 |
+
"als": "Alemannic German",
|
5 |
+
"amh": "Amharic",
|
6 |
+
"ang": "Old English ",
|
7 |
+
"ara": "Arabic",
|
8 |
+
"arg": "Aragonese",
|
9 |
+
"arz": "Egyptian Arabic",
|
10 |
+
"asm": "Assamese",
|
11 |
+
"ast": "Asturian",
|
12 |
+
"ava": "Avar",
|
13 |
+
"aym": "Aymara",
|
14 |
+
"azb": "South Azerbaijani",
|
15 |
+
"aze": "Azerbaijani",
|
16 |
+
"bak": "Bashkir",
|
17 |
+
"bar": "Bavarian",
|
18 |
+
"bcl": "Central Bikol",
|
19 |
+
"be-tarask": "Belarusian (Taraschkewiza)",
|
20 |
+
"bel": "Belarusian",
|
21 |
+
"ben": "Bengali",
|
22 |
+
"bho": "Bhojpuri",
|
23 |
+
"bjn": "Banjar",
|
24 |
+
"bod": "Tibetan",
|
25 |
+
"bos": "Bosnian",
|
26 |
+
"bpy": "Bishnupriya",
|
27 |
+
"bre": "Breton",
|
28 |
+
"bul": "Bulgarian",
|
29 |
+
"bxr": "Buryat",
|
30 |
+
"cat": "Catalan",
|
31 |
+
"cbk": "Chavacano",
|
32 |
+
"cdo": "Min Dong",
|
33 |
+
"ceb": "Cebuano",
|
34 |
+
"ces": "Czech",
|
35 |
+
"che": "Chechen",
|
36 |
+
"chr": "Cherokee",
|
37 |
+
"chv": "Chuvash",
|
38 |
+
"ckb": "Central Kurdish",
|
39 |
+
"cor": "Cornish",
|
40 |
+
"cos": "Corsican",
|
41 |
+
"crh": "Crimean Tatar",
|
42 |
+
"csb": "Kashubian",
|
43 |
+
"cym": "Welsh",
|
44 |
+
"dan": "Danish",
|
45 |
+
"deu": "German",
|
46 |
+
"diq": "Dimli",
|
47 |
+
"div": "Dhivehi",
|
48 |
+
"dsb": "Lower Sorbian",
|
49 |
+
"dty": "Doteli",
|
50 |
+
"egl": "Emilian",
|
51 |
+
"ell": "Modern Greek",
|
52 |
+
"eng": "English",
|
53 |
+
"epo": "Esperanto",
|
54 |
+
"est": "Estonian",
|
55 |
+
"eus": "Basque",
|
56 |
+
"ext": "Extremaduran",
|
57 |
+
"fao": "Faroese",
|
58 |
+
"fas": "Persian",
|
59 |
+
"fin": "Finnish",
|
60 |
+
"fra": "French",
|
61 |
+
"frp": "Arpitan",
|
62 |
+
"fry": "Western Frisian",
|
63 |
+
"fur": "Friulian",
|
64 |
+
"gag": "Gagauz",
|
65 |
+
"gla": "Scottish Gaelic",
|
66 |
+
"gle": "Irish",
|
67 |
+
"glg": "Galician",
|
68 |
+
"glk": "Gilaki",
|
69 |
+
"glv": "Manx",
|
70 |
+
"grn": "Guarani",
|
71 |
+
"guj": "Gujarati",
|
72 |
+
"hak": "Hakka Chinese",
|
73 |
+
"hat": "Haitian Creole",
|
74 |
+
"hau": "Hausa",
|
75 |
+
"hbs": "Serbo-Croatian",
|
76 |
+
"heb": "Hebrew",
|
77 |
+
"hif": "Fiji Hindi",
|
78 |
+
"hin": "Hindi",
|
79 |
+
"hrv": "Croatian",
|
80 |
+
"hsb": "Upper Sorbian",
|
81 |
+
"hun": "Hungarian",
|
82 |
+
"hye": "Armenian",
|
83 |
+
"ibo": "Igbo",
|
84 |
+
"ido": "Ido",
|
85 |
+
"ile": "Interlingue",
|
86 |
+
"ilo": "Iloko",
|
87 |
+
"ina": "Interlingua",
|
88 |
+
"ind": "Indonesian",
|
89 |
+
"isl": "Icelandic",
|
90 |
+
"ita": "Italian",
|
91 |
+
"jam": "Jamaican Patois",
|
92 |
+
"jav": "Javanese",
|
93 |
+
"jbo": "Lojban",
|
94 |
+
"jpn": "Japanese",
|
95 |
+
"kaa": "Karakalpak",
|
96 |
+
"kab": "Kabyle",
|
97 |
+
"kan": "Kannada",
|
98 |
+
"kat": "Georgian",
|
99 |
+
"kaz": "Kazakh",
|
100 |
+
"kbd": "Kabardian",
|
101 |
+
"khm": "Central Khmer",
|
102 |
+
"kin": "Kinyarwanda",
|
103 |
+
"kir": "Kirghiz",
|
104 |
+
"koi": "Komi-Permyak",
|
105 |
+
"kok": "Konkani",
|
106 |
+
"kom": "Komi",
|
107 |
+
"kor": "Korean",
|
108 |
+
"krc": "Karachay-Balkar",
|
109 |
+
"ksh": "Ripuarisch",
|
110 |
+
"kur": "Kurdish",
|
111 |
+
"lad": "Ladino",
|
112 |
+
"lao": "Lao",
|
113 |
+
"lat": "Latin",
|
114 |
+
"lav": "Latvian",
|
115 |
+
"lez": "Lezghian",
|
116 |
+
"lij": "Ligurian",
|
117 |
+
"lim": "Limburgan",
|
118 |
+
"lin": "Lingala",
|
119 |
+
"lit": "Lithuanian",
|
120 |
+
"lmo": "Lombard",
|
121 |
+
"lrc": "Northern Luri",
|
122 |
+
"ltg": "Latgalian",
|
123 |
+
"ltz": "Luxembourgish",
|
124 |
+
"lug": "Luganda",
|
125 |
+
"lzh": "Literary Chinese",
|
126 |
+
"mai": "Maithili",
|
127 |
+
"mal": "Malayalam",
|
128 |
+
"map-bms": "Banyumasan",
|
129 |
+
"mar": "Marathi",
|
130 |
+
"mdf": "Moksha",
|
131 |
+
"mhr": "Eastern Mari",
|
132 |
+
"min": "Minangkabau",
|
133 |
+
"mkd": "Macedonian",
|
134 |
+
"mlg": "Malagasy",
|
135 |
+
"mlt": "Maltese",
|
136 |
+
"mon": "Mongolian",
|
137 |
+
"mri": "Maori",
|
138 |
+
"mrj": "Western Mari",
|
139 |
+
"msa": "Malay",
|
140 |
+
"mwl": "Mirandese",
|
141 |
+
"mya": "Burmese",
|
142 |
+
"myv": "Erzya",
|
143 |
+
"mzn": "Mazanderani",
|
144 |
+
"nan": "Min Nan Chinese",
|
145 |
+
"nap": "Neapolitan",
|
146 |
+
"nav": "Navajo",
|
147 |
+
"nci": "Classical Nahuatl",
|
148 |
+
"nds": "Low German",
|
149 |
+
"nds-nl": "West Low German",
|
150 |
+
"nep": "Nepali (macrolanguage)",
|
151 |
+
"new": "Newari",
|
152 |
+
"nld": "Dutch",
|
153 |
+
"nno": "Norwegian Nynorsk",
|
154 |
+
"nob": "Bokmål",
|
155 |
+
"nrm": "Narom",
|
156 |
+
"nso": "Northern Sotho",
|
157 |
+
"oci": "Occitan",
|
158 |
+
"olo": "Livvi-Karelian",
|
159 |
+
"ori": "Oriya",
|
160 |
+
"orm": "Oromo",
|
161 |
+
"oss": "Ossetian",
|
162 |
+
"pag": "Pangasinan",
|
163 |
+
"pam": "Pampanga",
|
164 |
+
"pan": "Panjabi",
|
165 |
+
"pap": "Papiamento",
|
166 |
+
"pcd": "Picard",
|
167 |
+
"pdc": "Pennsylvania German",
|
168 |
+
"pfl": "Palatine German",
|
169 |
+
"pnb": "Western Panjabi",
|
170 |
+
"pol": "Polish",
|
171 |
+
"por": "Portuguese",
|
172 |
+
"pus": "Pushto",
|
173 |
+
"que": "Quechua",
|
174 |
+
"roa-tara": "Tarantino dialect",
|
175 |
+
"roh": "Romansh",
|
176 |
+
"ron": "Romanian",
|
177 |
+
"rue": "Rusyn",
|
178 |
+
"rup": "Aromanian",
|
179 |
+
"rus": "Russian",
|
180 |
+
"sah": "Yakut",
|
181 |
+
"san": "Sanskrit",
|
182 |
+
"scn": "Sicilian",
|
183 |
+
"sco": "Scots",
|
184 |
+
"sgs": "Samogitian",
|
185 |
+
"sin": "Sinhala",
|
186 |
+
"slk": "Slovak",
|
187 |
+
"slv": "Slovene",
|
188 |
+
"sme": "Northern Sami",
|
189 |
+
"sna": "Shona",
|
190 |
+
"snd": "Sindhi",
|
191 |
+
"som": "Somali",
|
192 |
+
"spa": "Spanish",
|
193 |
+
"sqi": "Albanian",
|
194 |
+
"srd": "Sardinian",
|
195 |
+
"srn": "Sranan",
|
196 |
+
"srp": "Serbian",
|
197 |
+
"stq": "Saterfriesisch",
|
198 |
+
"sun": "Sundanese",
|
199 |
+
"swa": "Swahili (macrolanguage)",
|
200 |
+
"swe": "Swedish",
|
201 |
+
"szl": "Silesian",
|
202 |
+
"tam": "Tamil",
|
203 |
+
"tat": "Tatar",
|
204 |
+
"tcy": "Tulu",
|
205 |
+
"tel": "Telugu",
|
206 |
+
"tet": "Tetum",
|
207 |
+
"tgk": "Tajik",
|
208 |
+
"tgl": "Tagalog",
|
209 |
+
"tha": "Thai",
|
210 |
+
"ton": "Tongan",
|
211 |
+
"tsn": "Tswana",
|
212 |
+
"tuk": "Turkmen",
|
213 |
+
"tur": "Turkish",
|
214 |
+
"tyv": "Tuvan",
|
215 |
+
"udm": "Udmurt",
|
216 |
+
"uig": "Uighur",
|
217 |
+
"ukr": "Ukrainian",
|
218 |
+
"urd": "Urdu",
|
219 |
+
"uzb": "Uzbek",
|
220 |
+
"vec": "Venetian",
|
221 |
+
"vep": "Veps",
|
222 |
+
"vie": "Vietnamese",
|
223 |
+
"vls": "Vlaams",
|
224 |
+
"vol": "Volapük",
|
225 |
+
"vro": "Võro",
|
226 |
+
"war": "Waray",
|
227 |
+
"wln": "Walloon",
|
228 |
+
"wol": "Wolof",
|
229 |
+
"wuu": "Wu Chinese",
|
230 |
+
"xho": "Xhosa",
|
231 |
+
"xmf": "Mingrelian",
|
232 |
+
"yid": "Yiddish",
|
233 |
+
"yor": "Yoruba",
|
234 |
+
"zea": "Zeeuws",
|
235 |
+
"zh-yue": "Cantonese",
|
236 |
+
"zho": "Standard Chinese",
|
237 |
+
}
|
libs/normalizer.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import regex
|
3 |
+
import sys
|
4 |
+
import textwrap
|
5 |
+
from typing import Any, Dict, Optional
|
6 |
+
|
7 |
+
# Punctuation marks that are kept by the default whitelist and that
# ``Normalizer.text_level_normalizer`` surrounds with spaces.
punctuations = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
    '`', '{', '|', '}', '~', '»', '«', '“', '”', "-",
]


class Normalizer:
    """A general normalizer for every language.

    Calling an instance runs, in order: dictionary-based replacement
    (``chars_to_map``), whitelist filtering (``chars_to_preserve``),
    punctuation spacing (``text_level_normalizer``), whitespace collapsing
    and optional lowercasing.

    Args:
        whitelist: Pattern (``regex`` module syntax) describing the runs of
            characters to keep. Falls back to the class default when it is
            empty or not a string.
        dictionary: Mapping of substrings to their replacements. Falls back
            to the (empty) class default when it is empty or not a dict.
    """

    # Escaped once at class-creation time so it can be embedded in a
    # character class by both patterns below.
    _escaped_punctuations = re.escape("".join(punctuations))

    # \p{N}, \p{L} and \p{M} are Unicode property classes (numbers, letters,
    # marks); they need the third-party ``regex`` module, and a raw string so
    # that "\p" is not parsed as an (invalid) string escape.
    _whitelist = r"[\p{N}\p{L}\p{M}" + _escaped_punctuations + r"]+"

    # Captures one punctuation mark so that it can be re-emitted with padding.
    _punctuation_pattern = r"([" + _escaped_punctuations + r"])"

    _dictionary = {}

    def __init__(
        self,
        whitelist: Optional[str] = None,
        dictionary: Optional[Dict[str, str]] = None,
    ) -> None:
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        self.dictionary = dictionary if dictionary and isinstance(dictionary, dict) else self._dictionary

    def chars_to_map(self, sentence: str) -> str:
        """Maps every character, words, and phrase into a proper one.

        Args:
            sentence (str): A piece of text.

        Returns:
            The text with every occurrence of a dictionary key replaced by
            its value. When the dictionary is empty, ``sentence`` is returned
            unchanged.
        """
        if not self.dictionary:
            return sentence

        # Alternation picks the first branch that matches, so longer keys
        # must come first; otherwise a key such as "a" would always win over
        # "ab" and the longer mapping could never be applied.
        keys = sorted(self.dictionary, key=len, reverse=True)
        pattern = "|".join(map(re.escape, keys))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
        self,
        sentence: str,
    ) -> str:
        """Keeps specified characters from sentence

        Args:
            sentence (str): A piece of text.

        Returns:
            The runs of whitelisted characters found in ``sentence``, joined
            by single spaces.

        Raises:
            regex.error: If ``self.whitelist`` is not a valid pattern. The
                offending pattern is printed before the error is re-raised.
        """
        try:
            tokenized = regex.findall(self.whitelist, sentence)
        except regex.error as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

        return " ".join(tokenized)

    def text_level_normalizer(self, text: str) -> str:
        """A text level of normalization

        Surrounds every punctuation mark with spaces and strips the result.
        """
        text = regex.sub(self._punctuation_pattern, r" \1 ", text)
        return text.strip()

    def __call__(
        self,
        text: str,
        do_lowercase: Optional[bool] = False
    ) -> str:
        """Normalization caller

        Args:
            text: The text to normalize.
            do_lowercase: When truthy, the result is lowercased.

        Returns:
            The normalized text, e.g. ``"Hello,world!"`` becomes
            ``"Hello , world !"`` with the default settings.
        """
        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text)
        # Punctuation padding can leave runs of spaces behind; collapse them.
        text = re.sub(r"\s+", " ", text)

        if do_lowercase:
            text = text.lower()

        return text
|
libs/utils.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import numpy as np
|
3 |
+
import plotly.express as px
|
4 |
+
|
5 |
+
|
6 |
+
def plot_result(top_languages):
    """Draw the detected languages as a horizontal bar chart in Streamlit.

    Args:
        top_languages: Sequence of dicts, each providing the "language",
            "code" and "score" keys.
    """
    bar_labels = []
    raw_scores = []
    for entry in top_languages:
        bar_labels.append(f'{entry["language"]} ({entry["code"]})')
        raw_scores.append(entry["score"])

    # Scores are shown as percentages.
    # NOTE(review): presumably each score is a probability in [0, 1] — confirm.
    percentages = np.array(raw_scores)
    percentages *= 100

    how_many = len(top_languages)
    # One evenly spaced colour value per bar, taken from the 'GnBu' scale.
    shades = np.linspace(0, 1, len(percentages))

    figure = px.bar(
        x=percentages,
        y=bar_labels,
        text=percentages,
        color=shades,
        color_continuous_scale='GnBu',
        orientation='h',
        range_x=(0, 115),
        labels={'x': 'Confidence', 'y': 'Language'},
        title=f'Top Detections {how_many}',
    )

    # Hide the colour bar, print the value next to each bar and order the
    # categories by their total.
    figure.update(layout_coloraxis_showscale=False)
    figure.update_traces(texttemplate='%{text:0.1f}%', textposition='outside')
    figure.update_layout(yaxis={'categoryorder': 'total ascending'})

    st.plotly_chart(figure, use_container_width=True)
|
meta.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
INFO = """
|
2 |
+
# Zabanshenas 🕵
|
3 |
+
|
4 |
+
A Transformer-based solution for identifying the most likely language of a written document/text. Zabanshenas is a Persian word that has two meanings:
|
5 |
+
|
6 |
+
- A person who studies linguistics.
|
7 |
+
- A way to identify the type of written language.
|
8 |
+
""".strip()
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
transformers
|
3 |
+
torch
|
4 |
+
regex
|
5 |
+
plotly
|