Spaces:
Sleeping
Sleeping
update
Browse files- language_identification.md +18 -0
- main.py +12 -4
language_identification.md
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Language Identification
|
2 |
+
|
3 |
+
### langid
|
4 |
+
|
5 |
+
langid 识别 97 种语言。
|
6 |
+
https://github.com/saffsd/langid.py
|
7 |
+
|
8 |
+
原理:
|
9 |
+
```text
|
10 |
+
https://github.com/saffsd/langid.py/tree/master/langid/train
|
11 |
+
|
12 |
+
1. 分词.
|
13 |
+
2. 计算 `字符ngram` 或 `词ngram` 特征.
|
14 |
+
3. 计算 item 的文档频率.
|
15 |
+
4. 计算 IG weights 信息增益权重, 提取重要特征.
|
16 |
+
5. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
|
17 |
+
|
18 |
+
```
|
main.py
CHANGED
@@ -20,6 +20,11 @@ from project_settings import project_path, temp_directory
|
|
20 |
|
21 |
def get_args():
|
22 |
parser = argparse.ArgumentParser()
|
|
|
|
|
|
|
|
|
|
|
23 |
parser.add_argument(
|
24 |
"--lang_id_examples_file",
|
25 |
default=(project_path / "lang_id_examples.json").as_posix(),
|
@@ -47,12 +52,13 @@ def main():
|
|
47 |
args = get_args()
|
48 |
|
49 |
brief_description = """
|
50 |
-
|
51 |
-
|
52 |
-
langid 识别 97 种语言。
|
53 |
-
https://github.com/saffsd/langid.py
|
54 |
"""
|
55 |
|
|
|
|
|
|
|
|
|
56 |
# examples
|
57 |
with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
|
58 |
lang_id_examples = json.load(f)
|
@@ -101,6 +107,8 @@ def main():
|
|
101 |
outputs=[lang_id_label, lang_id_prob],
|
102 |
)
|
103 |
|
|
|
|
|
104 |
blocks.queue().launch(
|
105 |
share=False if platform.system() == "Windows" else False,
|
106 |
server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
|
|
|
20 |
|
21 |
def get_args():
|
22 |
parser = argparse.ArgumentParser()
|
23 |
+
parser.add_argument(
|
24 |
+
"--language_identification_md_file",
|
25 |
+
default=(project_path / "language_identification.md").as_posix(),
|
26 |
+
type=str
|
27 |
+
)
|
28 |
parser.add_argument(
|
29 |
"--lang_id_examples_file",
|
30 |
default=(project_path / "lang_id_examples.json").as_posix(),
|
|
|
52 |
args = get_args()
|
53 |
|
54 |
brief_description = """
|
55 |
+
### Language Identification
|
|
|
|
|
|
|
56 |
"""
|
57 |
|
58 |
+
# description
|
59 |
+
with open(args.language_identification_md_file, "r", encoding="utf-8") as f:
|
60 |
+
description = f.read()
|
61 |
+
|
62 |
# examples
|
63 |
with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
|
64 |
lang_id_examples = json.load(f)
|
|
|
107 |
outputs=[lang_id_label, lang_id_prob],
|
108 |
)
|
109 |
|
110 |
+
gr.Markdown(value=description)
|
111 |
+
|
112 |
blocks.queue().launch(
|
113 |
share=False if platform.system() == "Windows" else False,
|
114 |
server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
|