HoneyTian committed on
Commit
820797e
·
1 Parent(s): cb97e31
Files changed (2) hide show
  1. language_identification.md +18 -0
  2. main.py +12 -4
language_identification.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Language Identification
2
+
3
+ ### langid
4
+
5
+ langid 识别 97 种语言。
6
+ https://github.com/saffsd/langid.py
7
+
8
+ 原理:
9
+ ```text
10
+ https://github.com/saffsd/langid.py/tree/master/langid/train
11
+
12
+ 1. 分词.
13
+ 2. 计算 `字符ngram` 或 `词ngram` 特征.
14
+ 3. 计算 item 的文档频率.
15
+ 4. 计算 IG weights 信息增益权重, 提取重要特征.
16
+ 5. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
17
+
18
+ ```
main.py CHANGED
@@ -20,6 +20,11 @@ from project_settings import project_path, temp_directory
20
 
21
  def get_args():
22
  parser = argparse.ArgumentParser()
 
 
 
 
 
23
  parser.add_argument(
24
  "--lang_id_examples_file",
25
  default=(project_path / "lang_id_examples.json").as_posix(),
@@ -47,12 +52,13 @@ def main():
47
  args = get_args()
48
 
49
  brief_description = """
50
- ## Language Identification
51
-
52
- langid 识别 97 种语言。
53
- https://github.com/saffsd/langid.py
54
  """
55
 
 
 
 
 
56
  # examples
57
  with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
58
  lang_id_examples = json.load(f)
@@ -101,6 +107,8 @@ def main():
101
  outputs=[lang_id_label, lang_id_prob],
102
  )
103
 
 
 
104
  blocks.queue().launch(
105
  share=False if platform.system() == "Windows" else False,
106
  server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
 
20
 
21
  def get_args():
22
  parser = argparse.ArgumentParser()
23
+ parser.add_argument(
24
+ "--language_identification_md_file",
25
+ default=(project_path / "language_identification.md").as_posix(),
26
+ type=str
27
+ )
28
  parser.add_argument(
29
  "--lang_id_examples_file",
30
  default=(project_path / "lang_id_examples.json").as_posix(),
 
52
  args = get_args()
53
 
54
  brief_description = """
55
+ ### Language Identification
 
 
 
56
  """
57
 
58
+ # description
59
+ with open(args.language_identification_md_file, "r", encoding="utf-8") as f:
60
+ description = f.read()
61
+
62
  # examples
63
  with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
64
  lang_id_examples = json.load(f)
 
107
  outputs=[lang_id_label, lang_id_prob],
108
  )
109
 
110
+ gr.Markdown(value=description)
111
+
112
  blocks.queue().launch(
113
  share=False if platform.system() == "Windows" else False,
114
  server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",