root committed on
Commit
7a919c0
1 Parent(s): a1a765b

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. README.md +165 -12
  3. app.py +640 -0
  4. config-bak.ini +63 -0
  5. config.ini +63 -0
  6. deepdoc/README.md +122 -0
  7. deepdoc/README_zh.md +116 -0
  8. deepdoc/__init__.py +6 -0
  9. deepdoc/__pycache__/__init__.cpython-310.pyc +0 -0
  10. deepdoc/parser/__init__.py +8 -0
  11. deepdoc/parser/__pycache__/__init__.cpython-310.pyc +0 -0
  12. deepdoc/parser/__pycache__/docx_parser.cpython-310.pyc +0 -0
  13. deepdoc/parser/__pycache__/excel_parser.cpython-310.pyc +0 -0
  14. deepdoc/parser/__pycache__/pdf_parser.cpython-310.pyc +0 -0
  15. deepdoc/parser/__pycache__/ppt_parser.cpython-310.pyc +0 -0
  16. deepdoc/parser/docx_parser.py +119 -0
  17. deepdoc/parser/excel_parser.py +101 -0
  18. deepdoc/parser/pdf_parser.py +1161 -0
  19. deepdoc/parser/ppt_parser.py +60 -0
  20. deepdoc/parser/readme_parse.md +27 -0
  21. deepdoc/parser/readpdf.py +62 -0
  22. deepdoc/parser/resume/__init__.py +52 -0
  23. deepdoc/parser/resume/entities/__init__.py +0 -0
  24. deepdoc/parser/resume/entities/corporations.py +80 -0
  25. deepdoc/parser/resume/entities/degrees.py +24 -0
  26. deepdoc/parser/resume/entities/industries.py +692 -0
  27. deepdoc/parser/resume/entities/regions.py +762 -0
  28. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  29. deepdoc/parser/resume/entities/res/corp_baike_len.csv +0 -0
  30. deepdoc/parser/resume/entities/res/corp_tag.json +0 -0
  31. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  32. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  33. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  34. deepdoc/parser/resume/entities/res/schools.csv +0 -0
  35. deepdoc/parser/resume/entities/schools.py +62 -0
  36. deepdoc/parser/resume/step_one.py +174 -0
  37. deepdoc/parser/resume/step_two.py +580 -0
  38. deepdoc/utils/__init__.py +0 -0
  39. deepdoc/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  40. deepdoc/utils/__pycache__/file_utils.cpython-310.pyc +0 -0
  41. deepdoc/utils/__pycache__/log_utils.cpython-310.pyc +0 -0
  42. deepdoc/utils/__pycache__/rag_tokenizer.cpython-310.pyc +0 -0
  43. deepdoc/utils/file_utils.py +216 -0
  44. deepdoc/utils/log_utils.py +313 -0
  45. deepdoc/utils/rag_tokenizer.py +423 -0
  46. deepdoc/vision/__init__.py +48 -0
  47. deepdoc/vision/__pycache__/__init__.cpython-310.pyc +0 -0
  48. deepdoc/vision/__pycache__/layout_recognizer.cpython-310.pyc +0 -0
  49. deepdoc/vision/__pycache__/ocr.cpython-310.pyc +0 -0
  50. deepdoc/vision/__pycache__/operators.cpython-310.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.trie filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,165 @@
1
- ---
2
- title: ReadReview
3
- emoji: 📊
4
- colorFrom: red
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 4.32.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MedicalReviewAgent 不想看文献
2
+ ## 项目概述
3
+ - 整一个帮我写综述的Agent,希望他能完成文献内容的收集,文本分类和总结,科学事实对比,撰写综述等功能
4
+ - 计划用到RAG, function calling等技术
5
+ - 还在不断摸索中,欢迎大佬指导!
6
+
7
+ ## 流程图
8
+ 基本上就是在上海AI Lab的茴香豆上面改的。这里主要讲解使用流程,架构和茴香豆一样:[茴香豆架构](https://github.com/InternLM/HuixiangDou/blob/main/docs/architecture_zh.md)
9
+ ### 文献库和知识库构建
10
+ ![image](https://github.com/jabberwockyang/MedicalReviewAgent/assets/52541128/d70a2ec1-7a20-4b5b-a91c-bf649f657319)
11
+
12
+ ### 人机合作写文章
13
+ <img width="847" alt="image" src="https://github.com/jabberwockyang/MedicalReviewAgent/assets/52541128/fc394d8b-1668-4349-9adc-1c4c0a7e0a8b">
14
+
15
+
16
+ ## 功能
17
+
18
+ 1. **模型服务配置**
19
+ - **远程模型选择**:允许用户选择使用远程大模型或本地模型。提供多种大模型提供商选择,如kimi、deepseek、zhipuai、gpt。
20
+
21
+ 2. **文献查找+数据库生成**
22
+ - **文献查找**:
23
+ - 用户可以输入感兴趣的关键词,设置查找数量,并进行PubMed PMC文献查找。
24
+ - 支持用户上传已有的PDF文献文件,可处理复杂PDF结构。
25
+ - **文献库管理**:
26
+ - 支持删除现有文献库。
27
+ - 提供文献库概况的实时更新。
28
+ - **数据库生成**:
29
+ - 用户可以设置块大小用于构建数据库
30
+ - 用户可以设置聚类数量用于文本聚类
31
+ - **数据库管理**:
32
+ - 支持生成新的数据库,删除现有数据库,并查看数据库概况。
33
+
34
+ 3. **写综述**
35
+ - **抽样标注文章聚类**:
36
+ - 用户可以选择特定的块大小和聚类数,设置抽样标注比例,并开始标注过程。
37
+ - **获取灵感**:
38
+ - 基于标注的文章聚类,大模型提供灵感,帮助用户生成综述所需的问题框架。
39
+ - **综述生成**:
40
+ - 用户可以输入想写的内容或主题,点击生成综述按钮,系统将自动生成综述文本并提供参考文献。
41
+
42
+ ## 亮点
43
+
44
+ 1. **高效的文献查找和管理**
45
+ - 通过关键词快速查找相关文献,支持上传已有PDF文献,方便文献库的构建和管理。
46
+
47
+ 2. **灵活的数据库生成**
48
+ - 提供灵活的数据库生成参数设置,支持多次生成和更新数据库,保证数据的及时性和准确性。
49
+
50
+ 3. **智能的综述生成**
51
+ - 基于先进的大模型技术,提供自动化的文章聚类标注和灵感生成功能,帮助用户快速生成高质量的综述文本。
52
+
53
+ 4. **用户友好界面**
54
+ - 直观的操作界面和详细的使用指导,让用户能够轻松上手和使用各项功能。
55
+
56
+ 5. **远程和本地模型支持**
57
+ - 支持多种大模型提供商的选择,满足不同用户的需求,无论是本地模型还是远程大模型,都能灵活配置和使用。
58
+
59
+ ## 安装运行
60
+ 新建conda环境
61
+
62
+ ```bash
63
+ conda create --name ReviewAgent python=3.10.14
64
+ conda activate ReviewAgent
65
+ ```
66
+ 拉取github仓库
67
+
68
+ ```bash
69
+ git clone https://github.com/jabberwockyang/MedicalReviewAgent.git
70
+ cd MedicalReviewAgent
71
+ pip install -r requirements.txt
72
+ ```
73
+ huggingface-cli下载模型
74
+
75
+ ```bash
76
+ cd /root && mkdir models
77
+ cd /root/models
78
+ # login required
79
+ huggingface-cli download Qwen/Qwen1.5-7B-Chat --local-dir /root/models/Qwen1.5-7B-Chat
80
+ huggingface-cli download maidalun1020/bce-embedding-base_v1 --local-dir /root/models/bce-embedding-base_v1
81
+ huggingface-cli download maidalun1020/bce-reranker-base_v1 --local-dir /root/models/bce-reranker-base_v1
82
+ ```
83
+ 启动服务
84
+
85
+ ```bash
86
+ conda activate ReviewAgent
87
+ cd MedicalReviewAgent
88
+ python3 app.py
89
+ ```
90
+ gradio在本地7860端口运行
91
+
92
+ ## 技术要点
93
+
94
+ 基于茴香豆加了几个功能
95
+
96
+ 1. 本地和远程模型灵活配置
97
+ - 本地模型是qwen1.5 7b chat
98
+ - 用户也可以在前端界面填写自己的API,本地远程模型任意切换
99
+
100
+ 3. 文献搜索和文本清洗
101
+ - 用户键入文献检索关键词,自动从PubMed公开数据库上搜索并下载文献全文
102
+ - xml到txt的文本清洗,去除reference 等无关信息
103
+
104
+ 4. 基于[Ragflow](https://github.com/infiniflow/ragflow/blob/main/README_zh.md)的deepdoc库的PDF识别
105
+ - 输出文献中的文字和表格,其中文字存储为 txt, 表格存储为图片,json, html三个格式
106
+ - 目前工作流中仅利用文字,表格数据的利用开发中
107
+
108
+ 5. chunk size可调的数据库生成
109
+ - default 1024 [ref](https://www.llamaindex.ai/blog/evaluating-the-ideal-chunk-size-for-a-rag-system-using-llamaindex-6207e5d3fec5)
110
+
111
+ 6. 嵌入kmeans聚类
112
+ - 基于Faiss库
113
+ - k可调
114
+
115
+ 7. 基于LLM的聚类内容标注
116
+ - 为节省算力,可以抽样标注
117
+ - 标注后本地储存避免重复标注
118
+
119
+ 8. 基于LLM的子问题生成
120
+ - 聚类标注内容作为context,生成对应的子问题
121
+
122
+ 9. 基于LLM的综述生成
123
+ - 输入可以是用户自己的问题,也可以参考之前llm生成的子问题
124
+ - 为了比较同一个科学问题的不同来源的观点,修改了一部分茴香豆的Retriever逻辑
125
+ - retriever 优化 [ref](https://medium.aiplanet.com/evaluating-naive-rag-and-advanced-rag-pipeline-using-langchain-v-0-1-0-and-ragas-17d24e74e5cf)
126
+ - 返回topk =10的分段
127
+ - 由于LLM捞针能力在头尾两端较靠谱,用langchain_community.document_transformers.LongContextReorder 将相关性较高的文本分布在头尾两端
128
+
129
+ 10. gradio前端
130
+
131
+ <div style="display: flex;">
132
+ <img src="https://github.com/jabberwockyang/MedicalReviewAgent/assets/52541128/83a1cefe-ebe6-499a-9ca7-214e90089815" style="width: 30%;" />
133
+ <img src="https://github.com/jabberwockyang/MedicalReviewAgent/assets/52541128/c369aeaa-6749-4d56-b71a-d62ff1cb780f" style="width: 30%;" />
134
+ <img src="https://github.com/jabberwockyang/MedicalReviewAgent/assets/52541128/27f1d440-2b79-4cbb-9b15-a5e2fa037d33" style="width: 30%;" />
135
+ </div>
136
+ <div style="display: flex;">
137
+ <img src="https://github.com/jabberwockyang/MedicalReviewAgent/assets/52541128/db2443ff-b6a2-4c35-83e6-21e478c39eba" style="width: 30%;" />
138
+ <img src="https://github.com/jabberwockyang/MedicalReviewAgent/assets/52541128/77496f38-f1e6-4919-a439-c06b4fd52aab" style="width: 30%;" />
139
+ </div>
140
+
141
+
142
+ ## TODO
143
+ 1. 自然语言到文献搜索参数的function calling功能
144
+ - 比如:
145
+ - 输入:帮我搜索近五年特应性皮炎相关的孟德尔随机化文章,不要综述
146
+ - 输出:
147
+ ```json
148
+ {"keywords":["atopic dermatitis","mendelian randomisation"],
149
+ "min-year":2019,
150
+ "max-year":2024,
151
+ "include-type":null,
152
+ "exclude-type":"review"
153
+ }
154
+ ```
155
+ 2. 摸索适用于不同需求的chunk size和 k值
156
+ - 比方说用来找某个实验方法用多大浓度的试剂,和总结某研究领域的前沿进展所用到的chunksize应该是不一样的吧🤔
157
+ - 后者的大小其实取决于前者叭🤔
158
+ 3. 表格数据利用
159
+
160
+ ## 感谢
161
+ 1. [茴香豆](https://github.com/InternLM/HuixiangDou)
162
+ 2. [E-utilities](https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=PMCID)
163
+ 3. [Ragflow](https://github.com/infiniflow/ragflow/blob/main/README_zh.md)
164
+ 4. [Advanced RAG pipeline](https://medium.aiplanet.com/evaluating-naive-rag-and-advanced-rag-pipeline-using-langchain-v-0-1-0-and-ragas-17d24e74e5cf)
165
+
app.py ADDED
@@ -0,0 +1,640 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import time
4
+ import os
5
+ import glob
6
+ import random
7
+ import shutil
8
+ from enum import Enum
9
+ from threading import Thread
10
+ from multiprocessing import Process, Value
11
+
12
+ import gradio as gr
13
+ import pytoml
14
+ from loguru import logger
15
+
16
+ from huixiangdou.service import Worker, llm_serve, ArticleRetrieval, CacheRetriever, FeatureStore, FileOperation
17
+
18
class PARAM_CODE(Enum):
    """Integer status codes: SUCCESS (0), FAILED (1) and ERROR (2)."""

    SUCCESS = 0
    FAILED = 1
    ERROR = 2
23
+
24
def parse_args():
    """Parse the command-line options used to start the app.

    Returns:
        argparse.Namespace with the attributes ``work_dir``, ``repo_dir``,
        ``config_path`` and ``standalone``.
    """
    parser = argparse.ArgumentParser(description='Worker.')
    parser.add_argument('--work_dir',
                        type=str,
                        default='workdir',
                        help='Working directory.')
    parser.add_argument('--repo_dir',
                        type=str,
                        default='repodir',
                        help='Repository directory.')
    parser.add_argument(
        '--config_path',
        default='config.ini',
        type=str,
        help='Worker configuration path. Default value is config.ini')
    # The flag used to be `store_true` with default=True, so it could never be
    # turned off. BooleanOptionalAction keeps `--standalone` working and adds
    # `--no-standalone` to disable the auto-deployed LLM service.
    parser.add_argument('--standalone',
                        action=argparse.BooleanOptionalAction,
                        default=True,
                        help='Auto deploy required Hybrid LLM Service.')
    return parser.parse_args()
46
+
47
def update_remote_buttons(remote):
    """Show or hide the remote-API widgets to follow the "use remote model" checkbox.

    Args:
        remote: truthy when the remote-model checkbox is ticked.

    Returns:
        A list of four component updates, in this order: help link
        (Markdown), provider dropdown, API-key textbox, model dropdown.
        All four are visible and interactive only when ``remote`` is truthy.
    """
    enabled = bool(remote)
    return [
        # The link target must not be wrapped in quotes: Markdown would treat
        # the quotes as part of the URL and the link would not resolve.
        gr.Markdown("[如何配置API](https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md)",
                    visible=enabled),
        gr.Dropdown(["kimi", "deepseek", "zhipuai", 'gpt'],
                    label="选择大模型提供商",
                    interactive=enabled, visible=enabled),
        gr.Textbox(label="您的API", lines=1,
                   interactive=enabled, visible=enabled),
        gr.Dropdown([], label="选择模型",
                    interactive=enabled, visible=enabled),
    ]
72
+
73
def udate_model_dropdown(remote_company):
    """Return a model dropdown populated for the selected provider.

    Args:
        remote_company: provider key ('kimi', 'deepseek', 'zhipuai' or 'gpt').
            ``None`` or an unknown value yields an empty choice list instead
            of raising ``KeyError``.

    Returns:
        A ``gr.Dropdown`` update carrying the provider's model names.
    """
    model_choices = {
        'kimi': ['moonshot-v1-128k'],
        'deepseek': ['deepseek-chat'],
        'zhipuai': ['glm-4'],
        'gpt': ['gpt-4-32k-0613', 'gpt-3.5-turbo'],
    }
    return gr.Dropdown(choices=model_choices.get(remote_company, []))
81
+
82
def update_remote_config(remote_ornot, remote_company=None, api=None, model=None):
    """Persist the local/remote LLM choice into the config file at CONFIG_PATH.

    Args:
        remote_ornot: truthy to enable the remote LLM, falsy for the local one.
        remote_company: provider key stored as ``llm.server.remote_type``.
        api: API key stored as ``llm.server.remote_api_key``.
        model: model name stored as ``llm.server.remote_llm_model``.

    Returns:
        A ``gr.Button`` update whose caption confirms the save.

    Raises:
        ValueError: remote mode was requested but provider, key or model is
            missing. Empty strings count as missing.
    """
    with open(CONFIG_PATH, encoding='utf8') as f:
        config = pytoml.load(f)

    llm = config['llm']
    if remote_ornot:
        # Truthiness also rejects '' so a blank field is treated as missing.
        if not remote_company or not api or not model:
            raise ValueError('remote_company, api, model not provided')
        llm['enable_local'] = 0
        llm['enable_remote'] = 1
        llm['server']['remote_type'] = remote_company
        llm['server']['remote_api_key'] = api
        llm['server']['remote_llm_model'] = model
    else:
        llm['enable_local'] = 1
        llm['enable_remote'] = 0

    # Write with the same encoding used for reading; the config holds
    # non-ASCII text, which breaks under a non-UTF-8 locale default.
    with open(CONFIG_PATH, 'w', encoding='utf8') as f:
        pytoml.dump(config, f)
    return gr.Button("配置已保存")
100
+
101
+
102
def get_ready(query: str, chunksize=None, k=None):
    """Load the config and prepare what a given workflow step needs.

    Args:
        query: one of 'repo_work', 'annotation', 'inspiration', 'summarize'.
        chunksize: chunk size selecting ``<work_dir>/chunksize_<chunksize>``;
            required for every query except 'repo_work'.
        k: cluster count selecting ``cluster_features_<k>``; required for
            'annotation' and 'inspiration'.

    Returns:
        Depends on ``query``:
        - 'repo_work':   ``(repodir, workdir, config)``
        - 'annotation':  ``(clusterdir, samples, assistant, theme)`` where
          ``samples`` is the parsed ``samples.json``
        - 'inspiration': ``(clusterdir, annoresult, assistant, theme)`` where
          ``annoresult`` is the list of records in ``annotation.jsonl``
        - 'summarize':   ``(assistant, theme)``

    Raises:
        ValueError: ``query`` is not recognized or a required parameter is
            missing.
    """
    with open(CONFIG_PATH, encoding='utf8') as f:
        config = pytoml.load(f)
    workdir = config['feature_store']['work_dir']
    repodir = config['feature_store']['repo_dir']

    if query == 'repo_work':  # no need to return assistant
        return repodir, workdir, config

    # The theme is best-effort: it stays empty when the repo config is
    # missing, unreadable or has no usable 'keywords' entry.
    theme = ''
    try:
        with open(os.path.join(repodir, 'config.json'), 'r') as f:
            repo_config = json.load(f)
        theme = ' '.join(repo_config['keywords'])
    except (OSError, ValueError, KeyError, TypeError) as err:
        logger.debug(f'no theme loaded from repo config: {err!r}')

    if query not in ('annotation', 'inspiration', 'summarize'):
        raise ValueError('query not recognized')

    if query == 'summarize':  # no need for params k
        if not chunksize:
            raise ValueError('chunksize not provided')
    elif not chunksize or not k:
        raise ValueError('chunksize or k not provided')

    chunkdir = os.path.join(workdir, f'chunksize_{chunksize}')
    assistant = Worker(work_dir=chunkdir, config_path=CONFIG_PATH, language='en')
    if query == 'summarize':
        return assistant, theme

    clusterdir = os.path.join(chunkdir, 'cluster_features', f'cluster_features_{k}')
    if query == 'annotation':
        with open(os.path.join(clusterdir, 'samples.json'), 'r') as f:
            samples = json.load(f)
        return clusterdir, samples, assistant, theme

    # query == 'inspiration': one JSON object per line; blank lines are skipped.
    with open(os.path.join(clusterdir, 'annotation.jsonl'), 'r') as f:
        annoresult = [json.loads(line) for line in f if line.strip()]
    return clusterdir, annoresult, assistant, theme
154
+
155
def update_repo_info():
    """Collect summary figures about the article repository.

    Returns:
        ``(keywords, length, retmax, number_of_pdf)``. The first three come
        from ``<repo_dir>/config.json`` and are ``None`` when that file is
        absent; all four are ``None`` when the repo directory does not exist.
    """
    with open(CONFIG_PATH, encoding='utf8') as f:
        config = pytoml.load(f)
    repodir = config['feature_store']['repo_dir']

    if not os.path.exists(repodir):
        return None, None, None, None

    pdf_count = len(glob.glob(os.path.join(repodir, '*.pdf')))
    meta_path = os.path.join(repodir, 'config.json')
    if not os.path.exists(meta_path):
        return None, None, None, pdf_count

    with open(meta_path, 'r') as f:
        meta = json.load(f)
    return meta['keywords'], meta['len'], meta['retmax'], pdf_count
176
+
177
def upload_file(files):
    """Copy uploaded files into the article repository directory.

    Args:
        files: uploaded file objects; each exposes its path as ``.name``.

    Returns:
        The same ``files`` value, unchanged.
    """
    repodir, _, _ = get_ready('repo_work')
    if not os.path.exists(repodir):
        os.makedirs(repodir)

    for uploaded in files:
        source_path = uploaded.name
        shutil.copy(source_path,
                    os.path.join(repodir, os.path.basename(source_path)))
    return files
189
+
190
def generate_articles_repo(keywords: str, retmax: int):
    """Search for articles by keyword and build the article repository.

    Args:
        keywords: newline-separated keywords; blank lines are ignored.
        retmax: passed to ``ArticleRetrieval`` as ``retmax``.

    Returns:
        The repository-summary textbox update produced by ``update_repo()``.
    """
    # Drop empty entries so a trailing newline or blank line in the textbox
    # does not become an empty search keyword.
    keys = [k.strip() for k in keywords.split('\n') if k.strip()]
    repodir, _, _ = get_ready('repo_work')

    articelfinder = ArticleRetrieval(keywords=keys,
                                     repo_dir=repodir,
                                     retmax=retmax)
    articelfinder.initiallize()
    return update_repo()
199
def delete_articles_repo():
    """Delete the article repository and the work directory, if they exist.

    Returns:
        A textbox update stating that the repo and its databases were deleted.
    """
    repodir, workdir, _ = get_ready('repo_work')
    # Repo first, then the work directory.
    for target in (repodir, workdir):
        if os.path.exists(target):
            shutil.rmtree(target)

    return gr.Textbox(label="文献库概况", lines=3,
                      value='文献库和相关数据库已删除',
                      visible=True)
210
+
211
def update_repo():
    """Build the repository-summary textbox from ``update_repo_info()``.

    Returns:
        A textbox update describing the searched and uploaded articles, or a
        notice that no repository exists yet.
    """
    keywords, article_count, fetch_limit, pdf_count = update_repo_info()
    if keywords:
        newinfo = f"搜索得到文献:\n 关键词:{keywords}\n 文献数量:{article_count}\n 获取上限:{fetch_limit}\n\n上传文献:\n 数量:{pdf_count}"
    elif pdf_count:
        newinfo = f'搜索得到文献:无\n上传文献:\n 数量:{pdf_count}'
    else:
        newinfo = '目前还没有文献库'

    return gr.Textbox(label="文献库概况", lines=1,
                      value=newinfo,
                      visible=True)
224
+
225
def update_database_info():
    """List the databases found under the configured work directory.

    Scans ``<work_dir>/chunksize_*`` and, inside each,
    ``cluster_features/cluster_features_*``. The number after the last
    underscore in each directory name is used as the chunk size or cluster
    count.

    Returns:
        ``(new_options, jsonobj)``: ``new_options`` is a list of
        ``"chunksize:<c>, k:<k>"`` strings and ``jsonobj`` maps each chunk
        size (int) to its list of cluster counts (ints). Both are ordered
        numerically.
    """
    with open(CONFIG_PATH, encoding='utf8') as f:
        config = pytoml.load(f)
    workdir = config['feature_store']['work_dir']

    def numeric_suffix(path):
        return int(path.split('_')[-1])

    # Sort by the numeric suffix: a plain string sort would put
    # 'chunksize_1024' before 'chunksize_512'.
    chunkdirs = sorted(glob.glob(os.path.join(workdir, 'chunksize_*')),
                       key=numeric_suffix)
    jsonobj = {}
    for chunkdir in chunkdirs:
        k_dirs = glob.glob(
            os.path.join(chunkdir, 'cluster_features', 'cluster_features_*'))
        jsonobj[numeric_suffix(chunkdir)] = sorted(
            numeric_suffix(k_dir) for k_dir in k_dirs)

    new_options = [f"chunksize:{chunksize}, k:{k}"
                   for chunksize, list_of_k in jsonobj.items()
                   for k in list_of_k]
    return new_options, jsonobj
244
+
245
+
246
def generate_database(chunksize: int, nclusters: str | list[str]):
    """Build the feature store for one chunk size and a set of cluster counts.

    Args:
        chunksize: chunk size passed to ``FeatureStore``.
        nclusters: one cluster count or a list of them, as strings; each is
            converted to ``int``.

    Returns:
        A textbox update listing the available databases, or a prompt to
        create the article repository first when it does not exist.
    """
    repodir, workdir, _ = get_ready('repo_work')
    if not os.path.exists(repodir):
        # Same label as the other returns so the summary box keeps its title.
        return gr.Textbox(label="数据库概况", value='请先生成文献库', visible=True)

    # A bare string such as "20" must not be iterated character by character.
    if isinstance(nclusters, str):
        nclusters = [nclusters]
    nclusters = [int(i) for i in nclusters]

    # The overwrite/delete policy for the repo and the databases is undecided.
    # In principle the repo is generated once, so regenerating it should
    # delete the previous repo and databases. Databases can be generated
    # repeatedly from one repo and are not deleted here; there is no
    # compute-saving logic yet, so results are recomputed and overwritten.
    # Different chunksize/nclusters combinations live in separate folders and
    # do not overwrite each other, hence workdir is deliberately not removed.
    cache = CacheRetriever(config_path=CONFIG_PATH)
    fs_init = FeatureStore(embeddings=cache.embeddings,
                           reranker=cache.reranker,
                           chunk_size=chunksize,
                           n_clusters=nclusters,
                           config_path=CONFIG_PATH)

    # walk all files in repo dir
    file_opr = FileOperation()
    files = file_opr.scan_dir(repo_dir=repodir)
    fs_init.initialize(files=files, work_dir=workdir, file_opr=file_opr)
    file_opr.summarize(files)
    del fs_init
    cache.pop('default')

    texts, _ = update_database_info()
    return gr.Textbox(label="数据库概况", value='\n'.join(texts), visible=True)
275
+
276
def delete_database():
    """Remove the whole work directory, if it exists.

    Returns:
        A textbox update stating that the database was deleted.
    """
    workdir = get_ready('repo_work')[1]
    if os.path.exists(workdir):
        shutil.rmtree(workdir)

    return gr.Textbox(label="数据库概况", lines=3, value='数据库已删除', visible=True)
282
def update_database_textbox():
    """Return the database-summary textbox reflecting the current work dir.

    Returns:
        A textbox update listing one database per line, or a notice that no
        database exists yet.
    """
    texts, _ = update_database_info()
    summary = '\n'.join(texts) if texts != [] else '目前还没有数据库'
    return gr.Textbox(label="数据库概况", value=summary, visible=True)
288
+
289
def update_chunksize_dropdown():
    """Return a dropdown update listing the chunk sizes that have a database."""
    _, jsonobj = update_database_info()
    # Pass a concrete list rather than a live dict_keys view.
    return gr.Dropdown(choices=list(jsonobj))
292
+
293
def update_ncluster_dropdown(chunksize: int):
    """Return a dropdown update with the cluster counts built for ``chunksize``.

    Args:
        chunksize: chunk size chosen in the UI. ``None`` or a value with no
            database yields an empty choice list instead of raising
            ``KeyError``.
    """
    _, jsonobj = update_database_info()
    return gr.Dropdown(choices=jsonobj.get(chunksize, []))
297
+
298
def annotation(n, chunksize: int, nclusters: int, remote_ornot: bool):
    """Use the LLM to annotate a random sample of clusters.

    Args:
        n: fraction of clusters to annotate.
        chunksize: chunk size of the database to use.
        nclusters: cluster count of the database to use.
        remote_ornot: truthy to use the 'remote' backend, else 'local'.

    Returns:
        The annotations produced in this call, joined by blank lines. Each
        record is also appended to ``annotation.jsonl`` in the cluster
        directory as soon as it is produced.
    """
    query = 'annotation'
    backend = 'remote' if remote_ornot else 'local'

    clusterdir, samples, assistant, theme = get_ready('annotation', chunksize, nclusters)
    # random.sample needs a sequence; a dict keys view raises TypeError on
    # Python 3.11+.
    cluster_ids = list(samples)
    n = round(n * len(cluster_ids))

    new_obj_list = []
    for cluster_no in random.sample(cluster_ids, n):
        # Only the first 10 samples of a cluster are shown to the model.
        chunk = '\n'.join(samples[cluster_no]['samples'][:10])

        code, reply, cluster_no = assistant.annotate_cluster(
            theme=theme,
            cluster_no=cluster_no,
            chunk=chunk,
            history=[],
            groupname='',
            backend=backend)
        references = f"cluster_no: {cluster_no}"
        new_obj = {
            'cluster_no': cluster_no,
            'chunk': chunk,
            'annotation': reply
        }
        new_obj_list.append(new_obj)
        logger.info(f'{code}, {query}, {reply}, {references}')

        # Append per cluster so finished work survives a later failure.
        with open(os.path.join(clusterdir, 'annotation.jsonl'), 'a') as f:
            json.dump(new_obj, f, ensure_ascii=False)
            f.write('\n')

    return '\n\n'.join(obj['annotation'] for obj in new_obj_list)
336
+
337
+
338
def inspiration(annotation: str, chunksize: int, nclusters: int, remote_ornot: bool):
    """Ask the LLM for review sub-questions based on cluster annotations.

    Args:
        annotation: text of the annotation box. When non-blank, only stored
            annotations equal to one of its stripped lines are used; when
            ``None`` or blank, all stored annotations are candidates.
        chunksize: chunk size of the database to use.
        nclusters: cluster count of the database to use.
        remote_ornot: truthy to use the 'remote' backend, else 'local'.

    Returns:
        The distinct replies produced in this call, joined by blank lines.
        Each reply is also appended to ``inspiration.jsonl`` and
        ``inspiration.txt`` in the cluster directory.
    """
    query = 'inspiration'
    backend = 'remote' if remote_ornot else 'local'

    clusterdir, annoresult, assistant, theme = get_ready('inspiration', chunksize, nclusters)
    new_obj_list = []

    # An empty textbox arrives as '' rather than None; treat it as
    # "no restriction" instead of filtering every cluster out.
    if annotation is not None and annotation.strip():
        wanted = {txt.strip() for txt in annotation.split('\n')}
        # NOTE(review): matching is line-based, so a stored annotation that
        # spans several lines cannot match here -- confirm that is intended.
        annoresult = [obj for obj in annoresult if obj['annotation'] in wanted]

    # At most five annotations are sampled per call.
    for index in random.sample(range(len(annoresult)), min(5, len(annoresult))):
        cluster_no = annoresult[index]['cluster_no']
        chunks = annoresult[index]['annotation']

        code, reply = assistant.getinspiration(
            theme=theme,
            annotations=chunks,
            history=[],
            groupname='', backend=backend)
        new_obj = {
            'inspiration': reply,
            'cluster_no': cluster_no
        }
        new_obj_list.append(new_obj)
        logger.info(f'{code}, {query}, {cluster_no},{reply}')

        with open(os.path.join(clusterdir, 'inspiration.jsonl'), 'a') as f:
            json.dump(new_obj, f, ensure_ascii=False)
            # One object per line; without this the file is not valid JSONL.
            f.write('\n')
        with open(os.path.join(clusterdir, 'inspiration.txt'), 'a') as f:
            f.write(f'{reply}\n')

    # dict.fromkeys removes duplicates while keeping a stable order.
    return '\n\n'.join(dict.fromkeys(obj['inspiration'] for obj in new_obj_list))
373
+
374
+
375
def getpmcurls(references):
    """Map reference names to URLs.

    Names starting with 'PMC' become PubMed Central article URLs, with any
    '.txt' removed from the name; every other name is returned unchanged.

    Args:
        references: iterable of reference name strings.

    Returns:
        A list of URLs or names, in the same order as ``references``.
    """
    return [
        f"https://www.ncbi.nlm.nih.gov/pmc/articles/{ref.replace('.txt', '')}/"
        if ref.startswith('PMC') else ref
        for ref in references
    ]
385
+
386
def summarize_text(query, chunksize: int, remote_ornot: bool):
    """Generate review text for ``query`` and list its references as links.

    Args:
        query: what the user wants written.
        chunksize: chunk size of the database to use.
        remote_ornot: truthy to use the 'remote' backend, else 'local'.

    Returns:
        ``(reply, markdown)``: the generated text and a ``gr.Markdown``
        update holding one ``[reference](url)`` link per line.
    """
    backend = 'remote' if remote_ornot else 'local'

    assistant, _ = get_ready('summarize', chunksize=chunksize, k=None)
    code, reply, references = assistant.generate(query=query,
                                                 history=[],
                                                 groupname='',
                                                 backend=backend)
    logger.info(f'{code}, {query}, {reply}, {references}')

    links = [f'[{ref}]({url})'
             for ref, url in zip(references, getpmcurls(references))]
    return reply, gr.Markdown(label="参考文献", value='\n'.join(links))
401
+
402
def main_interface():
    """Build the Gradio UI, wire its callbacks and launch the web server.

    The UI has three tabs: model-service configuration, article search plus
    database generation, and review writing. The function ends by calling
    ``demo.launch`` bound to 0.0.0.0.

    Markdown links are written as ``[text](url)``; quoting the URL makes the
    quotes part of the link target and breaks the link.
    """
    with gr.Blocks() as demo:
        with gr.Row():
            gr.Markdown(
                """
                # 医学文献综述助手 (又名 不想看文献)
                """
            )

        with gr.Tab("模型服务配置"):
            gr.Markdown("""
            #### 配置模型服务 🛠️

            1. **是否使用远程大模型**
            - 勾选此项,如果你想使用远程的大模型服务。
            - 如果不勾选,将默认使用本地模型服务。

            2. **API配置**
            - 配置大模型提供商和API,确保模型服务能够正常运行。
            - 提供商选择:kimi、deepseek、zhipuai、gpt。
            - 输入您的API密钥和选择对应模型。
            - 点击“保存配置”按钮以保存您的设置。

            📝 **备注**:请参考[如何使用](https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md)获取更多信息。

            """)

            remote_ornot = gr.Checkbox(label="是否使用远程大模型")
            with gr.Accordion("API配置", open=True):
                apimd = gr.Markdown("[如何配置API](https://github.com/jabberwockyang/MedicalReviewAgent/blob/main/README.md)", visible=False)
                remote_company = gr.Dropdown(["kimi", "deepseek", "zhipuai", 'gpt'],
                                             label="选择大模型提供商", interactive=False, visible=False)
                api = gr.Textbox(label="您的API", lines=1, interactive=False, visible=False)
                model = gr.Dropdown([], label="选择模型", interactive=False, visible=False)

            confirm_button = gr.Button("保存配置")

            # The checkbox toggles the API widgets; the provider choice
            # refreshes the model list; the button persists the settings.
            remote_ornot.change(update_remote_buttons, inputs=[remote_ornot], outputs=[apimd, remote_company, api, model])
            remote_company.change(udate_model_dropdown, inputs=[remote_company], outputs=[model])
            confirm_button.click(update_remote_config, inputs=[remote_ornot, remote_company, api, model], outputs=[confirm_button])

        with gr.Tab("文献查找+数据库生成"):
            gr.Markdown("""
            #### 查找文献 📚

            1. **输入关键词批量PubMed PMC文献**
            - 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
            - 设置查找数量(0-1000)。
            - 点击“搜索PubMed PMC”按钮进行文献查找。

            2. **上传PDF**
            - 通过“上传PDF”按钮上传您已有的PDF文献文件。

            3. **更新文献库情况 删除文献库**
            - 点击“更新文献库情况”按钮,查看当前文献库的概况。
            - 如果需要重置或删除现有文献库,点击“删除文献库”按钮。


            #### 生成数据库 🗂️

            1. **设置数据库构建参数 生成数据库**
            - 选择块大小(Chunk Size)和聚类数(Number of Clusters)。
            - 提供选项用于选择合适的块大小和聚类数。
            - 点击“生成数据库”按钮开始数据库生成过程。

            2. **更新数据库情况 删除数据库**
            - 点击“更新数据库情况”按钮,查看当前数据库的概况。
            - 点击“删除数据库”按钮移除现有数据库。

            📝 **备注**:请参考[如何选择数据库构建参数](https://github.com/jabberwockyang/MedicalReviewAgent/tree/main)获取更多信息。
            """)
            with gr.Row(equal_height=True):
                with gr.Column(scale=1):
                    input_keys = gr.Textbox(label="感兴趣的关键词",
                                            lines=5)
                    retmax = gr.Slider(
                        minimum=0,
                        maximum=1000,
                        value=500,
                        interactive=True,
                        label="查多少",
                    )
                    generate_repo_button = gr.Button("搜索PubMed PMC")
                with gr.Column(scale=2):
                    file_output = gr.File(scale=2)
                    upload_button = gr.UploadButton("上传PDF",
                                                    file_types=[".pdf", ".csv", ".doc"],
                                                    file_count="multiple", scale=0)

            with gr.Row(equal_height=True):
                with gr.Column(scale=0):
                    delete_repo_button = gr.Button("删除文献库")
                    update_repo_button = gr.Button("更新文献库情况")
                with gr.Column(scale=2):
                    repo_summary = gr.Textbox(label='文献库概况', value="目前还没有文献库")

            generate_repo_button.click(generate_articles_repo,
                                       inputs=[input_keys, retmax],
                                       outputs=[repo_summary])

            delete_repo_button.click(delete_articles_repo, inputs=None,
                                     outputs=repo_summary)
            update_repo_button.click(update_repo, inputs=None,
                                     outputs=repo_summary)
            upload_button.upload(upload_file, upload_button, file_output)

            with gr.Accordion("数据库构建参数", open=True):
                gr.Markdown("[如何选择数据库构建参数](https://github.com/jabberwockyang/MedicalReviewAgent/tree/main)")
                chunksize = gr.Slider(label="Chunk Size",
                                      info='How long you want the chunk to be?',
                                      minimum=128, maximum=4096, value=1024, step=1,
                                      interactive=True)
                ncluster = gr.CheckboxGroup(["10", "20", "50", '100', '200', '500', '1000'],
                                            # default=["20", "50", '100'],
                                            label="Number of Clusters",
                                            info="How many Clusters you want to generate")

            with gr.Row():
                gene_database_button = gr.Button("生成数据库")
                delete_database_button = gr.Button("删除数据库")
                update_database_button = gr.Button("更新数据库情况")

            database_summary = gr.Textbox(label="数据库概况", lines=1, value="目前还没有数据库")

            gene_database_button.click(generate_database, inputs=[chunksize, ncluster],
                                       outputs=database_summary)

            update_database_button.click(update_database_textbox, inputs=None,
                                         outputs=[database_summary])

            delete_database_button.click(delete_database, inputs=None,
                                         outputs=database_summary)

        with gr.Tab("写综述"):
            gr.Markdown("""
            #### 写综述 ✍️

            1. **更新数据库情况**
            - 点击“更新数据库情况”按钮,确保使用最新的数据库信息。

            2. **选择块大小和聚类数**
            - 从下拉菜单中选择合适的块大小和聚类数。

            3. **抽样标注文章聚类**
            - 设置抽样标注比例(0-1)。
            - 点击“抽样标注文章聚类”按钮开始标注过程。

            4. **获取灵感**
            - 如果不知道写什么,点击“获取灵感”按钮。
            - 系统将基于标注的文章聚类提供相应的综述子问题。

            5. **写综述**
            - 输入您想写的内容或主题。
            - 点击“写综述”按钮,生成综述文本。

            6. **查看生成结果**
            - 生成的综述文本将显示在“看看”文本框中。
            - 参考文献将显示在“参考文献”框中。

            📝 **备注**:可以尝试不同的参数进行标注和灵感获取,有助于提高综述的质量和相关性。
            """)

            with gr.Accordion("聚类标注相关参数", open=True):
                with gr.Row():
                    update_options = gr.Button("更新数据库情况", scale=0)
                    # From here on `chunksize` is this tab's dropdown. The
                    # slider of the previous tab was already bound to its
                    # callback above, so reusing the name does not affect it.
                    chunksize = gr.Dropdown([], label="选择块大小", scale=0)
                    nclusters = gr.Dropdown([], label="选择聚类数", scale=0)
                    ntoread = gr.Slider(
                        minimum=0, maximum=1, value=0.5,
                        interactive=True,
                        label="抽样标注比例",
                    )

            annotation_button = gr.Button("抽样标注文章聚类")
            annotation_output = gr.Textbox(label="文章聚类标注/片段摘要",
                                           lines=5,
                                           interactive=True,
                                           show_copy_button=True)
            inspiration_button = gr.Button("获取灵感")
            inspiration_output = gr.Textbox(label="灵光一现",
                                            lines=5,
                                            show_copy_button=True)

            query = gr.Textbox(label="想写什么")

            write_button = gr.Button("写综述")
            output_text = gr.Textbox(label="看看", lines=10)
            output_references = gr.Markdown(label="参考文献")

            update_options.click(update_chunksize_dropdown,
                                 outputs=[chunksize])

            chunksize.change(update_ncluster_dropdown,
                             inputs=[chunksize],
                             outputs=[nclusters])

            annotation_button.click(annotation,
                                    inputs=[ntoread, chunksize, nclusters, remote_ornot],
                                    outputs=[annotation_output])

            inspiration_button.click(inspiration,
                                     inputs=[annotation_output, chunksize, nclusters, remote_ornot],
                                     outputs=[inspiration_output])

            write_button.click(summarize_text,
                               inputs=[query, chunksize, remote_ornot],
                               outputs=[output_text, output_references])

    demo.launch(share=False, server_name='0.0.0.0', debug=True, show_error=True, allowed_paths=['img_0.jpg'])
615
+
616
+ # start service
617
if __name__ == '__main__':
    args = parse_args()
    # Every start resets the active config from the pristine backup copy.
    shutil.copy('config-bak.ini', args.config_path)  # yyj
    CONFIG_PATH = args.config_path

    if args.standalone is True:
        # Run the hybrid LLM server in a child process. It reports its status
        # through the shared integer: 0 = still starting, 1 = ready, any
        # other value = failed.
        server_ready = Value('i', 0)
        server_process = Process(target=llm_serve,
                                 args=(args.config_path, server_ready))
        server_process.start()
        while True:
            if server_ready.value == 0:
                # Do not wait forever if the child died before reporting.
                if not server_process.is_alive() and server_ready.value == 0:
                    logger.error('local LLM server process exited before it was ready, quit.')
                    raise Exception('local LLM path')
                logger.info('waiting for server to be ready..')
                time.sleep(3)
            elif server_ready.value == 1:
                break
            else:
                logger.error('start local LLM server failed, quit.')
                raise Exception('local LLM path')
        logger.info('Hybrid LLM Server start.')

    main_interface()
config-bak.ini ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [feature_store]
2
+ reject_throttle = 0
3
+ embedding_model_path = "/root/models/bce-embedding-base_v1"
4
+ reranker_model_path = "/root/models/bce-reranker-base_v1"
5
+ repo_dir = "repodir"
6
+ work_dir = "workdir"
7
+ n_clusters = [20, 50]
8
+ chunk_size = 1024
9
+
10
+ [web_search]
11
+ x_api_key = "${YOUR-API-KEY}"
12
+ domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.com", "stackoverflow.com", "juejin.cn", "zhuanlan.zhihu.com", "www.cnblogs.com"]
13
+ save_dir = "logs/web_search_result"
14
+
15
+ [llm]
16
+ enable_local = 1
17
+ enable_remote = 1
18
+ client_url = "http://127.0.0.1:8888/inference"
19
+
20
+ [llm.server]
21
+ local_llm_path = "/root/models/Qwen1.5-7B-Chat"
22
+ local_llm_max_text_length = 32000
23
+ local_llm_bind_port = 8888
24
+ remote_type = ""
25
+ remote_api_key = ""
26
+ remote_llm_max_text_length = 32000
27
+ remote_llm_model = ""
28
+ rpm = 500
29
+
30
+ [worker]
31
+ enable_sg_search = 0
32
+ save_path = "logs/work.txt"
33
+
34
+ [worker.time]
35
+ start = "00:00:00"
36
+ end = "23:59:59"
37
+ has_weekday = 1
38
+
39
+ [sg_search]
40
+ binary_src_path = "/usr/local/bin/src"
41
+ src_access_token = "${YOUR-SRC-ACCESS-TOKEN}"
42
+
43
+ [sg_search.opencompass]
44
+ github_repo_id = "open-compass/opencompass"
45
+ introduction = "用于评测大型语言模型(LLM). 它提供了完整的开源可复现的评测框架,支持大语言模型、多模态模型的一站式评测,基于分布式技术,对大参数量模型亦能实现高效评测。评测方向汇总为知识、语言、理解、推理、考试五大能力维度,整合集纳了超过70个评测数据集,合计提供了超过40万个模型评测问题,并提供长文本、安全、代码3类大模型特色技术能力评测。"
46
+
47
+ [sg_search.lmdeploy]
48
+ github_repo_id = "internlm/lmdeploy"
49
+ introduction = "lmdeploy 是一个用于压缩、部署和服务 LLM(Large Language Model)的工具包。是一个服务端场景下,transformer 结构 LLM 部署工具,支持 GPU 服务端部署,速度有保障,支持 Tensor Parallel,多并发优化,功能全面,包括模型转换、缓存历史会话的 cache feature 等. 它还提供了 WebUI、命令行和 gRPC 客户端接入。"
50
+
51
+ [frontend]
52
+ type = "none"
53
+ webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"
54
+ message_process_policy = "immediate"
55
+
56
+ [frontend.lark_group]
57
+ app_id = "cli_a53a34dcb778500e"
58
+ app_secret = "2ajhg1ixSvlNm1bJkH4tJhPfTCsGGHT1"
59
+ encrypt_key = "abc"
60
+ verification_token = "def"
61
+
62
+ [frontend.wechat_personal]
63
+ bind_port = 9527
config.ini ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [feature_store]
2
+ reject_throttle = 0
3
+ embedding_model_path = "/root/models/bce-embedding-base_v1"
4
+ reranker_model_path = "/root/models/bce-reranker-base_v1"
5
+ repo_dir = "repodir"
6
+ work_dir = "workdir"
7
+ n_clusters = [20, 50]
8
+ chunk_size = 1024
9
+
10
+ [web_search]
11
+ x_api_key = "${YOUR-API-KEY}"
12
+ domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.com", "stackoverflow.com", "juejin.cn", "zhuanlan.zhihu.com", "www.cnblogs.com"]
13
+ save_dir = "logs/web_search_result"
14
+
15
+ [llm]
16
+ enable_local = 1
17
+ enable_remote = 1
18
+ client_url = "http://127.0.0.1:8888/inference"
19
+
20
+ [llm.server]
21
+ local_llm_path = "/root/models/Qwen1.5-7B-Chat"
22
+ local_llm_max_text_length = 32000
23
+ local_llm_bind_port = 8888
24
+ remote_type = ""
25
+ remote_api_key = ""
26
+ remote_llm_max_text_length = 32000
27
+ remote_llm_model = ""
28
+ rpm = 500
29
+
30
+ [worker]
31
+ enable_sg_search = 0
32
+ save_path = "logs/work.txt"
33
+
34
+ [worker.time]
35
+ start = "00:00:00"
36
+ end = "23:59:59"
37
+ has_weekday = 1
38
+
39
+ [sg_search]
40
+ binary_src_path = "/usr/local/bin/src"
41
+ src_access_token = "${YOUR-SRC-ACCESS-TOKEN}"
42
+
43
+ [sg_search.opencompass]
44
+ github_repo_id = "open-compass/opencompass"
45
+ introduction = "用于评测大型语言模型(LLM). 它提供了完整的开源可复现的评测框架,支持大语言模型、多模态模型的一站式评测,基于分布式技术,对大参数量模型亦能实现高效评测。评测方向汇总为知识、语言、理解、推理、考试五大能力维度,整合集纳了超过70个评测数据集,合计提供了超过40万个模型评测问题,并提供长文本、安全、代码3类大模型特色技术能力评测。"
46
+
47
+ [sg_search.lmdeploy]
48
+ github_repo_id = "internlm/lmdeploy"
49
+ introduction = "lmdeploy 是一个用于压缩、部署和服务 LLM(Large Language Model)的工具包。是一个服务端场景下,transformer 结构 LLM 部署工具,支持 GPU 服务端部署,速度有保障,支持 Tensor Parallel,多并发优化,功能全面,包括模型转换、缓存历史会话的 cache feature 等. 它还提供了 WebUI、命令行和 gRPC 客户端接入。"
50
+
51
+ [frontend]
52
+ type = "none"
53
+ webhook_url = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxx"
54
+ message_process_policy = "immediate"
55
+
56
+ [frontend.lark_group]
57
+ app_id = "cli_a53a34dcb778500e"
58
+ app_secret = "2ajhg1ixSvlNm1bJkH4tJhPfTCsGGHT1"
59
+ encrypt_key = "abc"
60
+ verification_token = "def"
61
+
62
+ [frontend.wechat_personal]
63
+ bind_port = 9527
deepdoc/README.md ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ English | [简体中文](./README_zh.md)
2
+
3
+ # *Deep*Doc
4
+
5
+ - [1. Introduction](#1)
6
+ - [2. Vision](#2)
7
+ - [3. Parser](#3)
8
+
9
+ <a name="1"></a>
10
+ ## 1. Introduction
11
+
12
+ With a bunch of documents from various domains with various formats and along with diverse retrieval requirements,
13
+ an accurate analysis becomes a very challenging task. *Deep*Doc was born for that purpose.
14
+ There are 2 parts in *Deep*Doc so far: vision and parser.
15
+ You can run the following test programs if you're interested in our results of OCR, layout recognition and TSR.
16
+ ```bash
17
+ python deepdoc/vision/t_ocr.py -h
18
+ usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
19
+
20
+ options:
21
+ -h, --help show this help message and exit
22
+ --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
23
+ --output_dir OUTPUT_DIR
24
+ Directory where to store the output images. Default: './ocr_outputs'
25
+ ```
26
+ ```bash
27
+ python deepdoc/vision/t_recognizer.py -h
28
+ usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
29
+
30
+ options:
31
+ -h, --help show this help message and exit
32
+ --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
33
+ --output_dir OUTPUT_DIR
34
+ Directory where to store the output images. Default: './layouts_outputs'
35
+ --threshold THRESHOLD
36
+ A threshold to filter out detections. Default: 0.5
37
+ --mode {layout,tsr} Task mode: layout recognition or table structure recognition
38
+ ```
39
+
40
+ Our models are served on HuggingFace. If you have trouble downloading HuggingFace models, this might help!!
41
+ ```bash
42
+ export HF_ENDPOINT=https://hf-mirror.com
43
+ ```
44
+
45
+ <a name="2"></a>
46
+ ## 2. Vision
47
+
48
+ We use visual information to solve problems, as human beings do.
49
+ - OCR. Since a lot of documents are presented as images, or can at least be converted to images,
50
+ OCR is a very essential and fundamental or even universal solution for text extraction.
51
+ ```bash
52
+ python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
53
+ ```
54
+ The input can be a directory of images or PDFs, or a single image or PDF file.
55
+ You can look into the folder 'path_to_store_result', which contains images that show the positions of the results
56
+ and txt files that contain the OCR text.
57
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
58
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
59
+ </div>
60
+
61
+ - Layout recognition. Documents from different domains may have various layouts,
62
+ like, newspaper, magazine, book and résumé are distinct in terms of layout.
63
+ Only when the machine has an accurate layout analysis can it decide whether these text parts are successive or not,
64
+ or this part needs Table Structure Recognition(TSR) to process, or this part is a figure and described with this caption.
65
+ We have 10 basic layout components which cover most cases:
66
+ - Text
67
+ - Title
68
+ - Figure
69
+ - Figure caption
70
+ - Table
71
+ - Table caption
72
+ - Header
73
+ - Footer
74
+ - Reference
75
+ - Equation
76
+
77
+ Have a try on the following command to see the layout detection results.
78
+ ```bash
79
+ python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
80
+ ```
81
+ The input can be a directory of images or PDFs, or a single image or PDF file.
82
+ You can look into the folder 'path_to_store_result', which contains images that show the detection results, as follows:
83
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
84
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
85
+ </div>
86
+
87
+ - Table Structure Recognition(TSR). Data table is a frequently used structure to present data including numbers or text.
88
+ The structure of a table might be very complex, with hierarchical headers, spanning cells and projected row headers.
89
+ Along with TSR, we also reassemble the content into sentences which could be well comprehended by LLM.
90
+ We have five labels for TSR task:
91
+ - Column
92
+ - Row
93
+ - Column header
94
+ - Projected row header
95
+ - Spanning cell
96
+
97
+ Try the following command to see the table structure recognition results.
98
+ ```bash
99
+ python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
100
+ ```
101
+ The input can be a directory of images or PDFs, or a single image or PDF file.
102
+ You can look into the folder 'path_to_store_result', which contains both images and html pages that show the detection results, as follows:
103
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
104
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
105
+ </div>
106
+
107
+ <a name="3"></a>
108
+ ## 3. Parser
109
+
110
+ Four document formats (PDF, DOCX, EXCEL and PPT) each have a corresponding parser.
111
+ The most complex one is the PDF parser, because of PDF's flexibility. The output of the PDF parser includes:
112
+ - Text chunks with their own positions in PDF(page number and rectangular positions).
113
+ - Tables with a cropped image from the PDF, and contents which have already been translated into natural language sentences.
114
+ - Figures with caption and text in the figures.
115
+
116
+ ### Résumé
117
+
118
+ The résumé is a very complicated kind of document. A résumé which is composed of unstructured text
119
+ with various layouts could be resolved into structured data composed of nearly a hundred fields.
120
+ We haven't open-sourced the parser yet; for now we only open the processing method that runs after the parsing procedure.
121
+
122
+
deepdoc/README_zh.md ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [English](./README.md) | 简体中文
2
+
3
+ # *Deep*Doc
4
+
5
+ - [*Deep*Doc](#deepdoc)
6
+ - [1. 介绍](#1-介绍)
7
+ - [2. 视觉处理](#2-视觉处理)
8
+ - [3. 解析器](#3-解析器)
9
+ - [简历](#简历)
10
+
11
+ <a name="1"></a>
12
+ ## 1. 介绍
13
+
14
+ 对于来自不同领域、具有不同格式和不同检索要求的大量文档,准确的分析成为一项极具挑战性的任务。*Deep*Doc 就是为了这个目的而诞生的。到目前为止,*Deep*Doc 中有两个组成部分:视觉处理和解析器。如果您对我们的OCR、布局识别和TSR结果感兴趣,您可以运行下面的测试程序。
15
+
16
+ ```bash
17
+ python deepdoc/vision/t_ocr.py -h
18
+ usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
19
+
20
+ options:
21
+ -h, --help show this help message and exit
22
+ --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
23
+ --output_dir OUTPUT_DIR
24
+ Directory where to store the output images. Default: './ocr_outputs'
25
+ ```
26
+
27
+ ```bash
28
+ python deepdoc/vision/t_recognizer.py -h
29
+ usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
30
+
31
+ options:
32
+ -h, --help show this help message and exit
33
+ --inputs INPUTS Directory where to store images or PDFs, or a file path to a single image or PDF
34
+ --output_dir OUTPUT_DIR
35
+ Directory where to store the output images. Default: './layouts_outputs'
36
+ --threshold THRESHOLD
37
+ A threshold to filter out detections. Default: 0.5
38
+ --mode {layout,tsr} Task mode: layout recognition or table structure recognition
39
+ ```
40
+
41
+ HuggingFace为我们的模型提供服务。如果你在下载HuggingFace模型时遇到问题,这可能会有所帮助!!
42
+
43
+ ```bash
44
+ export HF_ENDPOINT=https://hf-mirror.com
45
+ ```
46
+
47
+ <a name="2"></a>
48
+ ## 2. 视觉处理
49
+
50
+ 作为人类,我们使用视觉信息来解决问题。
51
+
52
+ - **OCR(Optical Character Recognition,光学字符识别)**。由于许多文档都是以图像形式呈现的,或者至少能够转换为图像,因此OCR是文本提取的一个非常重要、基本,甚至通用的解决方案。
53
+
54
+ ```bash
55
+ python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
56
+ ```
57
+
58
+ 输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中有演示结果位置的图像,以及包含OCR文本的txt文件。
59
+
60
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
61
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
62
+ </div>
63
+
64
+ - 布局识别(Layout recognition)。来自不同领域的文件可能有不同的布局,如报纸、杂志、书籍和简历在布局方面是不同的。只有当机器有准确的布局分析时,它才能决定这些文本部分是连续的还是不连续的,或者这个部分需要表结构识别(Table Structure Recognition,TSR)来处理,或者这个部件是一个图形并用这个标题来描述。我们有10个基本布局组件,涵盖了大多数情况:
65
+ - 文本
66
+ - 标题
67
+ - 配图
68
+ - 配图标题
69
+ - 表格
70
+ - 表格标题
71
+ - 页头
72
+ - 页尾
73
+ - 参考引用
74
+ - 公式
75
+
76
+ 请尝试以下命令以查看布局检测结果。
77
+
78
+ ```bash
79
+ python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
80
+ ```
81
+
82
+ 输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中有显示检测结果的图像,如下所示:
83
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
84
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
85
+ </div>
86
+
87
+ - **TSR(Table Structure Recognition,表结构识别)**。数据表是一种常用的结构,用于表示包括数字或文本在内的数据。表的结构可能非常复杂,比如层次结构标题、跨单元格和投影行标题。除了TSR,我们还将内容重新组合成LLM可以很好理解的句子。TSR任务有五个标签:
88
+ - 列
89
+ - 行
90
+ - 列标题
91
+ - 行标题
92
+ - 合并单元格
93
+
94
+ 请尝试以下命令以查看布局检测结果。
95
+
96
+ ```bash
97
+ python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
98
+ ```
99
+
100
+ 输入可以是图像或PDF的目录,或者单个图像、PDF文件。您可以查看文件夹 `path_to_store_result` ,其中包含图像和html页面,这些页面展示了以下检测结果:
101
+
102
+ <div align="center" style="margin-top:20px;margin-bottom:20px;">
103
+ <img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
104
+ </div>
105
+
106
+ <a name="3"></a>
107
+ ## 3. 解析器
108
+
109
+ PDF、DOCX、EXCEL和PPT四种文档格式都有相应的解析器。最复杂的是PDF解析器,因为PDF具有灵活性。PDF解析器的输出包括:
110
+ - 在PDF中有自己位置的文本块(页码和矩形位置)。
111
+ - 带有PDF裁剪图像的表格,以及已经翻译成自然语言句子的内容。
112
+ - 图中带标题和文字的图。
113
+
114
+ ### 简历
115
+
116
+ 简历是一种非常复杂的文件。一份由各种布局的非结构化文本组成的简历可以分解为由近百个字段组成的结构化数据。我们还没有打开解析器,因为我们在解析过程之后打开了处理方法。
deepdoc/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .parser import RAGFlowPdfParser
2
+ from .parser import PlainParser
3
+ from .parser import RAGFlowDocxParser
4
+ from .parser import RAGFlowExcelParser
5
+ from .parser import RAGFlowPptParser
6
+
deepdoc/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (338 Bytes). View file
 
deepdoc/parser/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ from .pdf_parser import RAGFlowPdfParser
4
+ from .pdf_parser import PlainParser
5
+ from .docx_parser import RAGFlowDocxParser
6
+ from .excel_parser import RAGFlowExcelParser
7
+ from .ppt_parser import RAGFlowPptParser
8
+
deepdoc/parser/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (388 Bytes). View file
 
deepdoc/parser/__pycache__/docx_parser.cpython-310.pyc ADDED
Binary file (4.75 kB). View file
 
deepdoc/parser/__pycache__/excel_parser.cpython-310.pyc ADDED
Binary file (2.64 kB). View file
 
deepdoc/parser/__pycache__/pdf_parser.cpython-310.pyc ADDED
Binary file (35.8 kB). View file
 
deepdoc/parser/__pycache__/ppt_parser.cpython-310.pyc ADDED
Binary file (2.11 kB). View file
 
deepdoc/parser/docx_parser.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from docx import Document
3
+ import re
4
+ import pandas as pd
5
+ from collections import Counter
6
+ from deepdoc.utils import rag_tokenizer
7
+ from io import BytesIO
8
+
9
+
10
class RAGFlowDocxParser:
    """Extract paragraphs and tables from a .docx document.

    Calling an instance returns ``(secs, tbls)``: ``secs`` is a list of
    ``(paragraph_text, style_name)`` tuples for the requested page range and
    ``tbls`` holds one textual rendering per table in the document.
    """

    def __extract_table_content(self, tb):
        """Turn a python-docx table into text via ``__compose_table_content``."""
        grid = [[c.text for c in row.cells] for row in tb.rows]
        return self.__compose_table_content(pd.DataFrame(grid))

    def __compose_table_content(self, df):
        """Render a table as "header: value" text.

        Args:
            df: DataFrame of cell texts; row 0 is always treated as a header.

        Returns:
            ``[]`` when the table has fewer than two rows. Otherwise one
            string per data row when the table has more than 3 columns, or a
            single newline-joined string wrapped in a list for narrower
            tables.
        """

        def blockType(b):
            # Coarse classification of one cell's text. The first matching
            # pattern wins, so the order of this list matters.
            patt = [
                ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^(20|19)[0-9]{2}年$", "Dt"),
                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
                ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^第*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
                # Was tagged "DT": the stray spelling split date-like cells
                # across two Counter buckets and could let "Nu" win wrongly.
                (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
                ("^[0-9.,+%/ -]+$", "Nu"),
                (r"^[0-9A-Z/\._~-]+$", "Ca"),
                (r"^[A-Z]*[a-z' -]+$", "En"),
                (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
                (r"^.{1}$", "Sg")
            ]
            for p, n in patt:
                if re.search(p, b):
                    return n
            # No pattern matched: fall back to the tokenizer.
            tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
            if len(tks) > 3:
                if len(tks) < 12:
                    return "Tx"
                else:
                    return "Lx"

            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"

            return "Ot"

        nrows = len(df)
        if nrows < 2:
            return []
        ncols = len(df.iloc[0, :])

        # Dominant cell type over all non-header rows.
        type_counts = Counter(blockType(str(df.iloc[i, j]))
                              for i in range(1, nrows) for j in range(ncols))
        max_type = max(type_counts.items(), key=lambda x: x[1])[0]

        hdrows = [0]  # the header does not necessarily appear only in the first line
        if max_type == "Nu":
            # In a mostly numeric table, any row whose dominant type is not
            # numeric is treated as an additional header row.
            for r in range(1, nrows):
                row_counts = Counter(blockType(str(df.iloc[r, j]))
                                     for j in range(ncols))
                row_type = max(row_counts.items(), key=lambda x: x[1])[0]
                if row_type != max_type:
                    hdrows.append(r)

        lines = []
        for i in range(1, nrows):
            if i in hdrows:
                continue
            # Offsets (negative) from this row back to each header row above it.
            hr = [r - i for r in hdrows if r - i < 0]
            # Keep only the contiguous run of header rows closest to this row.
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1

            headers = []
            for j in range(ncols):
                names = []
                for h in hr:
                    x = str(df.iloc[i + h, j]).strip()
                    if x in names:
                        continue
                    names.append(x)
                label = ",".join(names)
                if label:
                    label += ": "
                headers.append(label)

            cells = []
            for j in range(ncols):
                value = str(df.iloc[i, j])
                if not value:
                    continue
                cells.append(headers[j] + value)
            lines.append(";".join(cells))

        if ncols > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000):
        """Parse a document given as a path (``str``) or as raw bytes.

        Args:
            fnm: File path, or the document's binary content.
            from_page: First page (0-based) whose paragraphs are collected.
            to_page: Page index at which collection stops (exclusive).

        Returns:
            ``(secs, tbls)`` as described on the class. Tables are always
            taken from the whole document, regardless of the page range.
        """
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0
        secs = []
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            if from_page <= pn < to_page and p.text.strip():
                secs.append((p.text, p.style.name))
            # Pages are counted from page-break markers found in the run XML.
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
deepdoc/parser/excel_parser.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from openpyxl import load_workbook
3
+ import sys
4
+ from io import BytesIO
5
+
6
# Candidate encodings tried, in order, when the module defines no
# ``all_codecs`` list of its own.
_FALLBACK_CODECS = ("utf-8", "gb2312", "gbk", "gb18030", "big5", "utf_16")


def find_codec(blob):
    """Guess the name of a codec that can decode ``blob``.

    The original body referenced a global ``all_codecs`` that this module
    never defines, so every call raised NameError. A module-level
    ``all_codecs`` is still honoured when present; otherwise a small built-in
    candidate list is used.

    Args:
        blob: Raw bytes to probe.

    Returns:
        The first candidate codec that decodes either the first 1024 bytes or
        the whole blob; ``"utf-8"`` when no candidate works.
    """
    candidates = globals().get("all_codecs") or _FALLBACK_CODECS
    for c in candidates:
        # Cheap probe on the first 1 KiB.
        try:
            blob[:1024].decode(c)
            return c
        except Exception:
            # Best-effort detection: a failed probe just moves on.
            pass
        # The 1 KiB cut may have split a multi-byte character, so retry on
        # the complete blob before rejecting this codec.
        try:
            blob.decode(c)
            return c
        except Exception:
            pass

    return "utf-8"
21
+
22
+
23
class RAGFlowExcelParser:
    """Convert Excel workbooks into HTML table chunks or "header:value" lines."""

    @staticmethod
    def _open(fnm):
        """Load a workbook from a file path (``str``) or from raw bytes."""
        if isinstance(fnm, str):
            return load_workbook(fnm)
        return load_workbook(BytesIO(fnm))

    @staticmethod
    def _text(value):
        """Return a cell value as text, with ``None`` (empty cell) as ``""``."""
        return "" if value is None else str(value)

    def html(self, fnm, chunk_rows=256):
        """Render every sheet as one or more HTML ``<table>`` strings.

        Each table repeats the sheet's first row as ``<th>`` headers and holds
        at most ``chunk_rows`` data rows. A sheet with only a header row
        still yields one header-only table.

        Args:
            fnm: Workbook path, or the workbook's binary content.
            chunk_rows: Maximum number of data rows per emitted table.

        Returns:
            List of HTML table strings, in sheet order.
        """
        wb = self._open(fnm)

        tb_chunks = []
        for sheetname in wb.sheetnames:
            rows = list(wb[sheetname].rows)
            if not rows:
                continue

            # Empty header cells render as empty text, not the literal "None".
            tb_rows_0 = "<tr>" + "".join(
                "<th>" + self._text(t.value) + "</th>" for t in rows[0]) + "</tr>"

            data_rows = rows[1:]
            # Ceiling division. The previous formula produced an extra,
            # header-only table whenever the data-row count was an exact
            # multiple of chunk_rows.
            n_chunks = max(1, -(-len(data_rows) // chunk_rows))
            for chunk_i in range(n_chunks):
                parts = [f"<table><caption>{sheetname}</caption>", tb_rows_0]
                for r in data_rows[chunk_i * chunk_rows:(chunk_i + 1) * chunk_rows]:
                    parts.append("<tr>")
                    parts.extend(
                        "<td>" + self._text(c.value) + "</td>" for c in r)
                    parts.append("</tr>")
                parts.append("</table>\n")
                tb_chunks.append("".join(parts))

        return tb_chunks

    def __call__(self, fnm):
        """Flatten every data row into a ``"header:value; ..."`` line.

        Args:
            fnm: Workbook path, or the workbook's binary content.

        Returns:
            One string per data row across all sheets. Lines from sheets
            whose name does not contain "sheet" (case-insensitive) get a
            `` ——<sheetname>`` suffix.
        """
        wb = self._open(fnm)
        res = []
        for sheetname in wb.sheetnames:
            rows = list(wb[sheetname].rows)
            if not rows:
                continue
            ti = rows[0]
            suffix = "" if "sheet" in sheetname.lower() else " ——" + sheetname
            for r in rows[1:]:
                fields = []
                for i, c in enumerate(r):
                    # Skip only truly empty cells; 0 and False are real data
                    # that the old `not c.value` test discarded.
                    if c.value is None or c.value == "":
                        continue
                    t = self._text(ti[i].value) if i < len(ti) else ""
                    fields.append((t + ":" if t else "") + str(c.value))
                res.append("; ".join(fields) + suffix)
        return res

    @staticmethod
    def row_number(fnm, binary):
        """Count the rows in a spreadsheet-like file.

        Args:
            fnm: File name; only its extension is inspected.
            binary: The file's raw bytes.

        Returns:
            Total rows over all sheets for extensions containing "xls", the
            number of newline-separated lines for "csv"/"txt", and ``None``
            for any other extension.
        """
        ext = fnm.split(".")[-1].lower()
        if ext.find("xls") >= 0:
            wb = load_workbook(BytesIO(binary))
            return sum(len(list(wb[sheetname].rows)) for sheetname in wb.sheetnames)

        if ext in ["csv", "txt"]:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
            return len(txt.split("\n"))
97
+
98
+
99
if __name__ == "__main__":
    # Parse the workbook named on the command line; the result is discarded.
    RAGFlowExcelParser()(sys.argv[1])
deepdoc/parser/pdf_parser.py ADDED
@@ -0,0 +1,1161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import os
3
+ import random
4
+
5
+ import xgboost as xgb
6
+ from io import BytesIO
7
+ import torch
8
+ import re
9
+ import pdfplumber
10
+ import logging
11
+ from PIL import Image, ImageDraw
12
+ import numpy as np
13
+ from timeit import default_timer as timer
14
+ from PyPDF2 import PdfReader as pdf2_read
15
+
16
+ from deepdoc.utils.file_utils import get_project_base_directory
17
+ from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
18
+ from deepdoc.utils import rag_tokenizer
19
+ from copy import deepcopy
20
+ from huggingface_hub import snapshot_download
21
+
22
+ logging.getLogger("pdfminer").setLevel(logging.WARNING)
23
+
24
+
25
+ class RAGFlowPdfParser:
26
+ def __init__(self):
27
+ self.ocr = OCR()
28
+ if hasattr(self, "model_speciess"):
29
+ self.layouter = LayoutRecognizer("layout." + self.model_speciess)
30
+ else:
31
+ self.layouter = LayoutRecognizer("layout")
32
+ self.tbl_det = TableStructureRecognizer()
33
+
34
+ self.updown_cnt_mdl = xgb.Booster()
35
+ if torch.cuda.is_available():
36
+ self.updown_cnt_mdl.set_param({"device": "cuda"})
37
+ try:
38
+ model_dir = os.path.join(
39
+ get_project_base_directory(),
40
+ "rag/res/deepdoc")
41
+ self.updown_cnt_mdl.load_model(os.path.join(
42
+ model_dir, "updown_concat_xgb.model"))
43
+ except Exception as e:
44
+ model_dir = snapshot_download(
45
+ repo_id="InfiniFlow/text_concat_xgb_v1.0",
46
+ local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
47
+ local_dir_use_symlinks=False)
48
+ self.updown_cnt_mdl.load_model(os.path.join(
49
+ model_dir, "updown_concat_xgb.model"))
50
+
51
+ self.page_from = 0
52
+ """
53
+ If you have trouble downloading HuggingFace models, -_^ this might help!!
54
+
55
+ For Linux:
56
+ export HF_ENDPOINT=https://hf-mirror.com
57
+
58
+ For Windows:
59
+ Good luck
60
+ ^_-
61
+
62
+ """
63
+
64
+ def __char_width(self, c):
65
+ return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)
66
+
67
+ def __height(self, c):
68
+ return c["bottom"] - c["top"]
69
+
70
+ def _x_dis(self, a, b):
71
+ return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
72
+ abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
73
+
74
+ def _y_dis(
75
+ self, a, b):
76
+ return (
77
+ b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
78
+
79
+ def _match_proj(self, b):
80
+ proj_patt = [
81
+ r"第[零一二三四五六七八九十百]+章",
82
+ r"第[零一二三四五六七八九十百]+[条节]",
83
+ r"[零一二三四五六七八九十百]+[、是  ]",
84
+ r"[\((][零一二三四五六七八九十百]+[)\)]",
85
+ r"[\((][0-9]+[)\)]",
86
+ r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
87
+ r"[0-9]+\.[0-9.]+(、|\.[  ])",
88
+ r"[⚫•➢①② ]",
89
+ ]
90
+ return any([re.match(p, b["text"]) for p in proj_patt])
91
+
92
+ def _updown_concat_features(self, up, down):
93
+ w = max(self.__char_width(up), self.__char_width(down))
94
+ h = max(self.__height(up), self.__height(down))
95
+ y_dis = self._y_dis(up, down)
96
+ LEN = 6
97
+ tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
98
+ tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
99
+ tks_all = up["text"][-LEN:].strip() \
100
+ + (" " if re.match(r"[a-zA-Z0-9]+",
101
+ up["text"][-1] + down["text"][0]) else "") \
102
+ + down["text"][:LEN].strip()
103
+ tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
104
+ fea = [
105
+ up.get("R", -1) == down.get("R", -1),
106
+ y_dis / h,
107
+ down["page_number"] - up["page_number"],
108
+ up["layout_type"] == down["layout_type"],
109
+ up["layout_type"] == "text",
110
+ down["layout_type"] == "text",
111
+ up["layout_type"] == "table",
112
+ down["layout_type"] == "table",
113
+ True if re.search(
114
+ r"([。?!;!?;+))]|[a-z]\.)$",
115
+ up["text"]) else False,
116
+ True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
117
+ True if re.search(
118
+ r"(^.?[/,?;:\],。;:’”?!》】)-])",
119
+ down["text"]) else False,
120
+ True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
121
+ True if re.search(r"[,,][^。.]+$", up["text"]) else False,
122
+ True if re.search(r"[,,][^。.]+$", up["text"]) else False,
123
+ True if re.search(r"[\((][^\))]+$", up["text"])
124
+ and re.search(r"[\))]", down["text"]) else False,
125
+ self._match_proj(down),
126
+ True if re.match(r"[A-Z]", down["text"]) else False,
127
+ True if re.match(r"[A-Z]", up["text"][-1]) else False,
128
+ True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
129
+ True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
130
+ up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
131
+ ) > 1 and len(
132
+ down["text"].strip()) > 1 else False,
133
+ up["x0"] > down["x1"],
134
+ abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
135
+ self.__height(down)),
136
+ self._x_dis(up, down) / max(w, 0.000001),
137
+ (len(up["text"]) - len(down["text"])) /
138
+ max(len(up["text"]), len(down["text"])),
139
+ len(tks_all) - len(tks_up) - len(tks_down),
140
+ len(tks_down) - len(tks_up),
141
+ tks_down[-1] == tks_up[-1],
142
+ max(down["in_row"], up["in_row"]),
143
+ abs(down["in_row"] - up["in_row"]),
144
+ len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
145
+ len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
146
+ ]
147
+ return fea
148
+
149
+ @staticmethod
150
+ def sort_X_by_page(arr, threashold):
151
+ # sort using y1 first and then x1
152
+ arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
153
+ for i in range(len(arr) - 1):
154
+ for j in range(i, -1, -1):
155
+ # restore the order using th
156
+ if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
157
+ and arr[j + 1]["top"] < arr[j]["top"] \
158
+ and arr[j + 1]["page_number"] == arr[j]["page_number"]:
159
+ tmp = arr[j]
160
+ arr[j] = arr[j + 1]
161
+ arr[j + 1] = tmp
162
+ return arr
163
+
164
+ def _has_color(self, o):
165
+ if o.get("ncs", "") == "DeviceGray":
166
+ if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
167
+ o["non_stroking_color"][0] == 1:
168
+ if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
169
+ return False
170
+ return True
171
+
172
+ def _table_transformer_job(self, ZM):
173
+ logging.info("Table processing...")
174
+ imgs, pos = [], []
175
+ tbcnt = [0]
176
+ MARGIN = 10
177
+ self.tb_cpns = []
178
+ assert len(self.page_layout) == len(self.page_images)
179
+ for p, tbls in enumerate(self.page_layout): # for page
180
+ tbls = [f for f in tbls if f["type"] == "table"]
181
+ tbcnt.append(len(tbls))
182
+ if not tbls:
183
+ continue
184
+ for tb in tbls: # for table
185
+ left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
186
+ tb["x1"] + MARGIN, tb["bottom"] + MARGIN
187
+ left *= ZM
188
+ top *= ZM
189
+ right *= ZM
190
+ bott *= ZM
191
+ pos.append((left, top))
192
+ imgs.append(self.page_images[p].crop((left, top, right, bott)))
193
+
194
+ assert len(self.page_images) == len(tbcnt) - 1
195
+ if not imgs:
196
+ return
197
+ recos = self.tbl_det(imgs)
198
+ tbcnt = np.cumsum(tbcnt)
199
+ for i in range(len(tbcnt) - 1): # for page
200
+ pg = []
201
+ for j, tb_items in enumerate(
202
+ recos[tbcnt[i]: tbcnt[i + 1]]): # for table
203
+ poss = pos[tbcnt[i]: tbcnt[i + 1]]
204
+ for it in tb_items: # for table components
205
+ it["x0"] = (it["x0"] + poss[j][0])
206
+ it["x1"] = (it["x1"] + poss[j][0])
207
+ it["top"] = (it["top"] + poss[j][1])
208
+ it["bottom"] = (it["bottom"] + poss[j][1])
209
+ for n in ["x0", "x1", "top", "bottom"]:
210
+ it[n] /= ZM
211
+ it["top"] += self.page_cum_height[i]
212
+ it["bottom"] += self.page_cum_height[i]
213
+ it["pn"] = i
214
+ it["layoutno"] = j
215
+ pg.append(it)
216
+ self.tb_cpns.extend(pg)
217
+
218
+ def gather(kwd, fzy=10, ption=0.6):
219
+ eles = Recognizer.sort_Y_firstly(
220
+ [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
221
+ eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
222
+ return Recognizer.sort_Y_firstly(eles, 0)
223
+
224
+ # add R,H,C,SP tag to boxes within table layout
225
+ headers = gather(r".*header$")
226
+ rows = gather(r".* (row|header)")
227
+ spans = gather(r".*spanning")
228
+ clmns = sorted([r for r in self.tb_cpns if re.match(
229
+ r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
230
+ clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
231
+ for b in self.boxes:
232
+ if b.get("layout_type", "") != "table":
233
+ continue
234
+ ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
235
+ if ii is not None:
236
+ b["R"] = ii
237
+ b["R_top"] = rows[ii]["top"]
238
+ b["R_bott"] = rows[ii]["bottom"]
239
+
240
+ ii = Recognizer.find_overlapped_with_threashold(
241
+ b, headers, thr=0.3)
242
+ if ii is not None:
243
+ b["H_top"] = headers[ii]["top"]
244
+ b["H_bott"] = headers[ii]["bottom"]
245
+ b["H_left"] = headers[ii]["x0"]
246
+ b["H_right"] = headers[ii]["x1"]
247
+ b["H"] = ii
248
+
249
+ ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
250
+ if ii is not None:
251
+ b["C"] = ii
252
+ b["C_left"] = clmns[ii]["x0"]
253
+ b["C_right"] = clmns[ii]["x1"]
254
+
255
+ ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
256
+ if ii is not None:
257
+ b["H_top"] = spans[ii]["top"]
258
+ b["H_bott"] = spans[ii]["bottom"]
259
+ b["H_left"] = spans[ii]["x0"]
260
+ b["H_right"] = spans[ii]["x1"]
261
+ b["SP"] = ii
262
+
263
+ def __ocr(self, pagenum, img, chars, ZM=3):
264
+ bxs = self.ocr.detect(np.array(img))
265
+ if not bxs:
266
+ self.boxes.append([])
267
+ return
268
+ bxs = [(line[0], line[1][0]) for line in bxs]
269
+ bxs = Recognizer.sort_Y_firstly(
270
+ [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
271
+ "top": b[0][1] / ZM, "text": "", "txt": t,
272
+ "bottom": b[-1][1] / ZM,
273
+ "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
274
+ self.mean_height[-1] / 3
275
+ )
276
+
277
+ # merge chars in the same rect
278
+ for c in Recognizer.sort_X_firstly(
279
+ chars, self.mean_width[pagenum - 1] // 4):
280
+ ii = Recognizer.find_overlapped(c, bxs)
281
+ if ii is None:
282
+ self.lefted_chars.append(c)
283
+ continue
284
+ ch = c["bottom"] - c["top"]
285
+ bh = bxs[ii]["bottom"] - bxs[ii]["top"]
286
+ if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
287
+ self.lefted_chars.append(c)
288
+ continue
289
+ if c["text"] == " " and bxs[ii]["text"]:
290
+ if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
291
+ bxs[ii]["text"] += " "
292
+ else:
293
+ bxs[ii]["text"] += c["text"]
294
+
295
+ for b in bxs:
296
+ if not b["text"]:
297
+ left, right, top, bott = b["x0"] * ZM, b["x1"] * \
298
+ ZM, b["top"] * ZM, b["bottom"] * ZM
299
+ b["text"] = self.ocr.recognize(np.array(img),
300
+ np.array([[left, top], [right, top], [right, bott], [left, bott]],
301
+ dtype=np.float32))
302
+ del b["txt"]
303
+ bxs = [b for b in bxs if b["text"]]
304
+ if self.mean_height[-1] == 0:
305
+ self.mean_height[-1] = np.median([b["bottom"] - b["top"]
306
+ for b in bxs])
307
+ self.boxes.append(bxs)
308
+
309
+ def _layouts_rec(self, ZM, drop=True):
310
+ assert len(self.page_images) == len(self.boxes)
311
+ self.boxes, self.page_layout = self.layouter(
312
+ self.page_images, self.boxes, ZM, drop=drop)
313
+ # cumlative Y
314
+ for i in range(len(self.boxes)):
315
+ self.boxes[i]["top"] += \
316
+ self.page_cum_height[self.boxes[i]["page_number"] - 1]
317
+ self.boxes[i]["bottom"] += \
318
+ self.page_cum_height[self.boxes[i]["page_number"] - 1]
319
+
320
+ def _text_merge(self):
321
+ # merge adjusted boxes
322
+ bxs = self.boxes
323
+
324
+ def end_with(b, txt):
325
+ txt = txt.strip()
326
+ tt = b.get("text", "").strip()
327
+ return tt and tt.find(txt) == len(tt) - len(txt)
328
+
329
+ def start_with(b, txts):
330
+ tt = b.get("text", "").strip()
331
+ return tt and any([tt.find(t.strip()) == 0 for t in txts])
332
+
333
+ # horizontally merge adjacent box with the same layout
334
+ i = 0
335
+ while i < len(bxs) - 1:
336
+ b = bxs[i]
337
+ b_ = bxs[i + 1]
338
+ if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure",
339
+ "equation"]:
340
+ i += 1
341
+ continue
342
+ if abs(self._y_dis(b, b_)
343
+ ) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
344
+ # merge
345
+ bxs[i]["x1"] = b_["x1"]
346
+ bxs[i]["top"] = (b["top"] + b_["top"]) / 2
347
+ bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
348
+ bxs[i]["text"] += b_["text"]
349
+ bxs.pop(i + 1)
350
+ continue
351
+ i += 1
352
+ continue
353
+
354
+ dis_thr = 1
355
+ dis = b["x1"] - b_["x0"]
356
+ if b.get("layout_type", "") != "text" or b_.get(
357
+ "layout_type", "") != "text":
358
+ if end_with(b, ",") or start_with(b_, "(,"):
359
+ dis_thr = -8
360
+ else:
361
+ i += 1
362
+ continue
363
+
364
+ if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
365
+ and dis >= dis_thr and b["x1"] < b_["x1"]:
366
+ # merge
367
+ bxs[i]["x1"] = b_["x1"]
368
+ bxs[i]["top"] = (b["top"] + b_["top"]) / 2
369
+ bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
370
+ bxs[i]["text"] += b_["text"]
371
+ bxs.pop(i + 1)
372
+ continue
373
+ i += 1
374
+ self.boxes = bxs
375
+
376
def _naive_vertical_merge(self):
    """Merge vertically consecutive boxes using punctuation and geometry rules.

    Boxes are sorted with ``Recognizer.sort_Y_firstly`` and walked pairwise.
    A page-final box made only of digits/dashes/bullets and any blank box is
    dropped.  Otherwise the pair is merged unless a "do not concatenate"
    feature fires without a "concatenate" feature, or the boxes do not
    overlap horizontally.

    Changes from the earlier version: the debug ``print`` now goes through
    ``logging.debug``, and the third concatenation feature looks at the first
    character of the *lower* box (it used to index the upper box, so a lower
    line starting with closing punctuation was not recognised).
    """
    bxs = Recognizer.sort_Y_firstly(
        self.boxes, np.median(
            self.mean_height) / 3)
    i = 0
    while i + 1 < len(bxs):
        b = bxs[i]
        b_ = bxs[i + 1]
        # Drop what looks like a page number/decoration closing a page.
        if b["page_number"] < b_["page_number"] and re.match(
                r"[0-9 •一—-]+$", b["text"]):
            bxs.pop(i)
            continue
        if not b["text"].strip():
            bxs.pop(i)
            continue
        upper_text = b["text"].strip()
        lower_text = b_["text"].strip()
        # features for concatenating: the sentence visibly continues
        concatting_feats = [
            upper_text[-1] in ",;:'\",、‘“;:-",
            len(upper_text) > 1 and upper_text[-2] in ",;:'\",‘“、;:",
            bool(lower_text) and lower_text[0] in "。;?!?”)),,、:",
        ]
        # features for not concatenating
        feats = [
            b.get("layoutno", 0) != b_.get("layoutno", 0),
            upper_text[-1] in "。?!?",
            self.is_english and upper_text[-1] in ".!?",
            b["page_number"] == b_["page_number"] and b_["top"] -
            b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
            b["page_number"] < b_["page_number"] and abs(
                b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
        ]
        # split features: no horizontal overlap at all
        detach_feats = [b["x1"] < b_["x0"],
                        b["x0"] > b_["x1"]]
        if (any(feats) and not any(concatting_feats)) or any(detach_feats):
            logging.debug(
                "no vertical merge: %r | %r (feats=%s, concat=%s, detach=%s)",
                b["text"], b_["text"],
                any(feats), any(concatting_feats), any(detach_feats))
            i += 1
            continue
        # merge up and down
        b["bottom"] = b_["bottom"]
        b["text"] += b_["text"]
        b["x0"] = min(b["x0"], b_["x0"])
        b["x1"] = max(b["x1"], b_["x1"])
        bxs.pop(i + 1)
    self.boxes = bxs
426
+
427
def _concat_downward(self, concat_between_pages=True):
    """Chain boxes that continue each other downwards and merge each chain.

    1. Every box gets an ``in_row`` count: how many of its neighbours (up to
       12 either side) lie within one mean line height of it vertically.
    2. Working on a deep copy, boxes are chained greedily: starting from the
       first remaining box, the next up-to-12 boxes are examined and the
       first acceptable continuation is appended, recursively.  Candidates
       are accepted either because both are ``"text"`` boxes of the same
       layout (within the first five candidates) or because
       ``self.updown_cnt_mdl`` scores the pair above 0.5.
    3. Each chain is collapsed into its first box (text joined, bounds
       widened) and the result is re-sorted into ``self.boxes``.

    Args:
        concat_between_pages: when False, a chain never continues onto a
            later page.

    Changes from the earlier version: the trailing-comma test no longer
    raises IndexError on an empty ``up`` text, and a duplicated blank-text
    check was removed.
    """
    # count boxes in the same row as a feature
    for i in range(len(self.boxes)):
        mh = self.mean_height[self.boxes[i]["page_number"] - 1]
        self.boxes[i]["in_row"] = 0
        j = max(0, i - 12)
        while j < min(i + 12, len(self.boxes)):
            if j == i:
                j += 1
                continue
            ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
            if abs(ydis) < 1:
                self.boxes[i]["in_row"] += 1
            elif ydis > 0:
                break
            j += 1

    # concat between rows
    boxes = deepcopy(self.boxes)
    blocks = []
    while boxes:
        chunks = []

        def dfs(up, dp):
            # Append `up` to the chain, then look for its continuation among
            # boxes[dp : dp + 12].
            chunks.append(up)
            i = dp
            while i < min(dp + 12, len(boxes)):
                ydis = self._y_dis(up, boxes[i])
                smpg = up["page_number"] == boxes[i]["page_number"]
                mh = self.mean_height[up["page_number"] - 1]
                mw = self.mean_width[up["page_number"] - 1]
                if smpg and ydis > mh * 4:
                    break
                if not smpg and ydis > mh * 16:
                    break
                down = boxes[i]
                if not concat_between_pages and down["page_number"] > up["page_number"]:
                    break

                # Different table rows only join across a trailing comma.
                if up.get("R", "") != down.get(
                        "R", "") and not up["text"].endswith(","):
                    i += 1
                    continue

                # Skip strings shaped like "12/345" and blank candidates.
                if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                        or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                        or not down["text"].strip():
                    i += 1
                    continue

                # Too far apart horizontally.
                if up["x1"] < down["x0"] - 10 * \
                        mw or up["x0"] > down["x1"] + 10 * mw:
                    i += 1
                    continue

                if i - dp < 5 and up.get("layout_type") == "text":
                    # Different defaults keep boxes without layoutno apart.
                    if up.get("layoutno", "1") == down.get(
                            "layoutno", "2"):
                        dfs(down, i + 1)
                        boxes.pop(i)
                        return
                    i += 1
                    continue

                fea = self._updown_concat_features(up, down)
                if self.updown_cnt_mdl.predict(
                        xgb.DMatrix([fea]))[0] <= 0.5:
                    i += 1
                    continue
                dfs(down, i + 1)
                boxes.pop(i)
                return

        dfs(boxes[0], 1)
        boxes.pop(0)
        if chunks:
            blocks.append(chunks)

    # concat within each block
    boxes = []
    for b in blocks:
        if len(b) == 1:
            boxes.append(b[0])
            continue
        t = b[0]
        for c in b[1:]:
            t["text"] = t["text"].strip()
            c["text"] = c["text"].strip()
            if not c["text"]:
                continue
            # Put a space between two alphanumeric fragments.
            if t["text"] and re.match(
                    r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                t["text"] += " "
            t["text"] += c["text"]
            t["x0"] = min(t["x0"], c["x0"])
            t["x1"] = max(t["x1"], c["x1"])
            t["page_number"] = min(t["page_number"], c["page_number"])
            t["bottom"] = c["bottom"]
            if not t["layout_type"] \
                    and c["layout_type"]:
                t["layout_type"] = c["layout_type"]
        boxes.append(t)

    self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
535
+
536
+ def _filter_forpages(self):
537
+ if not self.boxes:
538
+ return
539
+ findit = False
540
+ i = 0
541
+ while i < len(self.boxes):
542
+ if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
543
+ re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
544
+ i += 1
545
+ continue
546
+ findit = True
547
+ eng = re.match(
548
+ r"[0-9a-zA-Z :'.-]{5,}",
549
+ self.boxes[i]["text"].strip())
550
+ self.boxes.pop(i)
551
+ if i >= len(self.boxes):
552
+ break
553
+ prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
554
+ self.boxes[i]["text"].strip().split(" ")[:2])
555
+ while not prefix:
556
+ self.boxes.pop(i)
557
+ if i >= len(self.boxes):
558
+ break
559
+ prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
560
+ self.boxes[i]["text"].strip().split(" ")[:2])
561
+ self.boxes.pop(i)
562
+ if i >= len(self.boxes) or not prefix:
563
+ break
564
+ for j in range(i, min(i + 128, len(self.boxes))):
565
+ if not re.match(prefix, self.boxes[j]["text"]):
566
+ continue
567
+ for k in range(i, j):
568
+ self.boxes.pop(i)
569
+ break
570
+ if findit:
571
+ return
572
+
573
+ page_dirty = [0] * len(self.page_images)
574
+ for b in self.boxes:
575
+ if re.search(r"(··|··|··)", b["text"]):
576
+ page_dirty[b["page_number"] - 1] += 1
577
+ page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
578
+ if not page_dirty:
579
+ return
580
+ i = 0
581
+ while i < len(self.boxes):
582
+ if self.boxes[i]["page_number"] in page_dirty:
583
+ self.boxes.pop(i)
584
+ continue
585
+ i += 1
586
+
587
+ def _merge_with_same_bullet(self):
588
+ i = 0
589
+ while i + 1 < len(self.boxes):
590
+ b = self.boxes[i]
591
+ b_ = self.boxes[i + 1]
592
+ if not b["text"].strip():
593
+ self.boxes.pop(i)
594
+ continue
595
+ if not b_["text"].strip():
596
+ self.boxes.pop(i + 1)
597
+ continue
598
+
599
+ if b["text"].strip()[0] != b_["text"].strip()[0] \
600
+ or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
601
+ or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
602
+ or b["top"] > b_["bottom"]:
603
+ i += 1
604
+ continue
605
+ b_["text"] = b["text"] + "\n" + b_["text"]
606
+ b_["x0"] = min(b["x0"], b_["x0"])
607
+ b_["x1"] = max(b["x1"], b_["x1"])
608
+ b_["top"] = b["top"]
609
+ self.boxes.pop(i)
610
+
611
+ def _extract_table_figure(self, need_image, ZM,
612
+ return_html, need_position):
613
+ tables = {}
614
+ figures = {}
615
+ # extract figure and table boxes
616
+ i = 0
617
+ lst_lout_no = ""
618
+ nomerge_lout_no = []
619
+ while i < len(self.boxes):
620
+ if "layoutno" not in self.boxes[i]:
621
+ i += 1
622
+ continue
623
+ lout_no = str(self.boxes[i]["page_number"]) + \
624
+ "-" + str(self.boxes[i]["layoutno"])
625
+ if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
626
+ "title",
627
+ "figure caption",
628
+ "reference"]:
629
+ nomerge_lout_no.append(lst_lout_no)
630
+ if self.boxes[i]["layout_type"] == "table":
631
+ if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
632
+ self.boxes.pop(i)
633
+ continue
634
+ if lout_no not in tables:
635
+ tables[lout_no] = []
636
+ tables[lout_no].append(self.boxes[i])
637
+ self.boxes.pop(i)
638
+ lst_lout_no = lout_no
639
+ continue
640
+ if need_image and self.boxes[i]["layout_type"] == "figure":
641
+ if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
642
+ self.boxes.pop(i)
643
+ continue
644
+ if lout_no not in figures:
645
+ figures[lout_no] = []
646
+ figures[lout_no].append(self.boxes[i])
647
+ self.boxes.pop(i)
648
+ lst_lout_no = lout_no
649
+ continue
650
+ i += 1
651
+
652
+ # merge table on different pages
653
+ nomerge_lout_no = set(nomerge_lout_no)
654
+ tbls = sorted([(k, bxs) for k, bxs in tables.items()],
655
+ key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
656
+
657
+ i = len(tbls) - 1
658
+ while i - 1 >= 0:
659
+ k0, bxs0 = tbls[i - 1]
660
+ k, bxs = tbls[i]
661
+ i -= 1
662
+ if k0 in nomerge_lout_no:
663
+ continue
664
+ if bxs[0]["page_number"] == bxs0[0]["page_number"]:
665
+ continue
666
+ if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
667
+ continue
668
+ mh = self.mean_height[bxs[0]["page_number"] - 1]
669
+ if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
670
+ continue
671
+ tables[k0].extend(tables[k])
672
+ del tables[k]
673
+
674
+ def x_overlapped(a, b):
675
+ return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])
676
+
677
+ # find captions and pop out
678
+ i = 0
679
+ while i < len(self.boxes):
680
+ c = self.boxes[i]
681
+ # mh = self.mean_height[c["page_number"]-1]
682
+ if not TableStructureRecognizer.is_caption(c):
683
+ i += 1
684
+ continue
685
+
686
+ # find the nearest layouts
687
+ def nearest(tbls):
688
+ nonlocal c
689
+ mink = ""
690
+ minv = 1000000000
691
+ for k, bxs in tbls.items():
692
+ for b in bxs:
693
+ if b.get("layout_type", "").find("caption") >= 0:
694
+ continue
695
+ y_dis = self._y_dis(c, b)
696
+ x_dis = self._x_dis(
697
+ c, b) if not x_overlapped(
698
+ c, b) else 0
699
+ dis = y_dis * y_dis + x_dis * x_dis
700
+ if dis < minv:
701
+ mink = k
702
+ minv = dis
703
+ return mink, minv
704
+
705
+ tk, tv = nearest(tables)
706
+ fk, fv = nearest(figures)
707
+ # if min(tv, fv) > 2000:
708
+ # i += 1
709
+ # continue
710
+ if tv < fv and tk:
711
+ tables[tk].insert(0, c)
712
+ logging.debug(
713
+ "TABLE:" +
714
+ self.boxes[i]["text"] +
715
+ "; Cap: " +
716
+ tk)
717
+ elif fk:
718
+ figures[fk].insert(0, c)
719
+ logging.debug(
720
+ "FIGURE:" +
721
+ self.boxes[i]["text"] +
722
+ "; Cap: " +
723
+ tk)
724
+ self.boxes.pop(i)
725
+
726
+ res = []
727
+ positions = []
728
+
729
+ def cropout(bxs, ltype, poss):
730
+ nonlocal ZM
731
+ pn = set([b["page_number"] - 1 for b in bxs])
732
+ if len(pn) < 2:
733
+ pn = list(pn)[0]
734
+ ht = self.page_cum_height[pn]
735
+ b = {
736
+ "x0": np.min([b["x0"] for b in bxs]),
737
+ "top": np.min([b["top"] for b in bxs]) - ht,
738
+ "x1": np.max([b["x1"] for b in bxs]),
739
+ "bottom": np.max([b["bottom"] for b in bxs]) - ht
740
+ }
741
+ louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
742
+ ii = Recognizer.find_overlapped(b, louts, naive=True)
743
+ if ii is not None:
744
+ b = louts[ii]
745
+ else:
746
+ logging.warn(
747
+ f"Missing layout match: {pn + 1},%s" %
748
+ (bxs[0].get(
749
+ "layoutno", "")))
750
+
751
+ left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
752
+ if right < left: right = left + 1
753
+ poss.append((pn + self.page_from, left, right, top, bott))
754
+ return self.page_images[pn] \
755
+ .crop((left * ZM, top * ZM,
756
+ right * ZM, bott * ZM))
757
+ pn = {}
758
+ for b in bxs:
759
+ p = b["page_number"] - 1
760
+ if p not in pn:
761
+ pn[p] = []
762
+ pn[p].append(b)
763
+ pn = sorted(pn.items(), key=lambda x: x[0])
764
+ imgs = [cropout(arr, ltype, poss) for p, arr in pn]
765
+ pic = Image.new("RGB",
766
+ (int(np.max([i.size[0] for i in imgs])),
767
+ int(np.sum([m.size[1] for m in imgs]))),
768
+ (245, 245, 245))
769
+ height = 0
770
+ for img in imgs:
771
+ pic.paste(img, (0, int(height)))
772
+ height += img.size[1]
773
+ return pic
774
+
775
+ # crop figure out and add caption
776
+ for k, bxs in figures.items():
777
+ txt = "\n".join([b["text"] for b in bxs])
778
+ if not txt:
779
+ continue
780
+
781
+ poss = []
782
+ res.append(
783
+ (cropout(
784
+ bxs,
785
+ "figure", poss),
786
+ [txt]))
787
+ positions.append(poss)
788
+
789
+ for k, bxs in tables.items():
790
+ if not bxs:
791
+ continue
792
+ bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
793
+ [(b["bottom"] - b["top"]) / 2 for b in bxs]))
794
+ poss = []
795
+ res.append((cropout(bxs, "table", poss),
796
+ self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
797
+ positions.append(poss)
798
+
799
+ assert len(positions) == len(res)
800
+
801
+ if need_position:
802
+ return list(zip(res, positions))
803
+ return res
804
+
805
def proj_match(self, line):
    """Classify ``line`` as a heading/bullet and return its pattern level.

    Returns:
        ``None`` when the line has two characters or fewer, or matches no
        pattern; ``False`` when it consists only of digits and numeric
        punctuation; otherwise the integer level (1-12) of the first
        pattern matching the start of the line.

    Changes from the earlier version: the pattern for ``"1)"``-style bullets
    was written as ``[0-9]+)``, an unbalanced regex that raised ``re.error``
    for every line reaching it.  It now matches an ASCII or full-width
    closing bracket.  Character classes that contained the same ASCII
    character twice are written with the ASCII and the full-width variant,
    which accepts everything the previous classes accepted.
    """
    if len(line) <= 2:
        return
    if re.match(r"[0-9 ().,%%+/-]+$", line):
        return False
    for p, j in [
        (r"第[零一二三四五六七八九十百]+章", 1),
        (r"第[零一二三四五六七八九十百]+[条节]", 2),
        (r"[零一二三四五六七八九十百]+[、 \u3000]", 3),
        (r"[((][零一二三四五六七八九十百]+[))]", 4),
        (r"[0-9]+(、|\.[ \u3000]|\.[^0-9])", 5),
        (r"[0-9]+\.[0-9]+(、|[. \u3000]|[^0-9])", 6),
        (r"[0-9]+\.[0-9]+\.[0-9]+(、|[ \u3000]|[^0-9])", 7),
        (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ \u3000]|[^0-9])", 8),
        (r".{,48}[::??]$", 9),
        (r"[0-9]+[))]", 10),
        (r"[((][0-9]+[))]", 11),
        (r"[零一二三四五六七八九十百]+是", 12),
        (r"[⚫•➢✓]", 12)
    ]:
        if re.match(p, line):
            return j
    return
828
+
829
+ def _line_tag(self, bx, ZM):
830
+ pn = [bx["page_number"]]
831
+ top = bx["top"] - self.page_cum_height[pn[0] - 1]
832
+ bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
833
+ page_images_cnt = len(self.page_images)
834
+ if pn[-1] - 1 >= page_images_cnt: return ""
835
+ while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
836
+ bott -= self.page_images[pn[-1] - 1].size[1] / ZM
837
+ pn.append(pn[-1] + 1)
838
+ if pn[-1] - 1 >= page_images_cnt:
839
+ return ""
840
+
841
+ return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
842
+ .format("-".join([str(p) for p in pn]),
843
+ bx["x0"], bx["x1"], top, bott)
844
+
845
+ def __filterout_scraps(self, boxes, ZM):
846
+
847
+ def width(b):
848
+ return b["x1"] - b["x0"]
849
+
850
+ def height(b):
851
+ return b["bottom"] - b["top"]
852
+
853
+ def usefull(b):
854
+ if b.get("layout_type"):
855
+ return True
856
+ if width(
857
+ b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
858
+ return True
859
+ if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
860
+ return True
861
+ return False
862
+
863
+ res = []
864
+ while boxes:
865
+ lines = []
866
+ widths = []
867
+ pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
868
+ mh = self.mean_height[boxes[0]["page_number"] - 1]
869
+ mj = self.proj_match(
870
+ boxes[0]["text"]) or boxes[0].get(
871
+ "layout_type",
872
+ "") == "title"
873
+
874
+ def dfs(line, st):
875
+ nonlocal mh, pw, lines, widths
876
+ lines.append(line)
877
+ widths.append(width(line))
878
+ width_mean = np.mean(widths)
879
+ mmj = self.proj_match(
880
+ line["text"]) or line.get(
881
+ "layout_type",
882
+ "") == "title"
883
+ for i in range(st + 1, min(st + 20, len(boxes))):
884
+ if (boxes[i]["page_number"] - line["page_number"]) > 0:
885
+ break
886
+ if not mmj and self._y_dis(
887
+ line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
888
+ break
889
+
890
+ if not usefull(boxes[i]):
891
+ continue
892
+ if mmj or \
893
+ (self._x_dis(boxes[i], line) < pw / 10): \
894
+ # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
895
+ # concat following
896
+ dfs(boxes[i], i)
897
+ boxes.pop(i)
898
+ break
899
+
900
+ try:
901
+ if usefull(boxes[0]):
902
+ dfs(boxes[0], 0)
903
+ else:
904
+ logging.debug("WASTE: " + boxes[0]["text"])
905
+ except Exception as e:
906
+ pass
907
+ boxes.pop(0)
908
+ mw = np.mean(widths)
909
+ if mj or mw / pw >= 0.35 or mw > 200:
910
+ res.append(
911
+ "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
912
+ else:
913
+ logging.debug("REMOVED: " +
914
+ "<<".join([c["text"] for c in lines]))
915
+
916
+ return "\n\n".join(res)
917
+
918
@staticmethod
def total_page_number(fnm, binary=None):
    """Return the number of pages in a PDF, or None when it cannot be read.

    Args:
        fnm: path of the PDF; used when ``binary`` is empty.
        binary: optional raw PDF bytes; takes precedence over ``fnm``.

    The document is opened in a ``with`` block so the pdfplumber handle is
    closed again; the earlier version left it open.  Any failure is logged
    and results in ``None``.
    """
    try:
        source = fnm if not binary else BytesIO(binary)
        with pdfplumber.open(source) as pdf:
            return len(pdf.pages)
    except Exception as e:
        logging.error(str(e))
926
+
927
def __images__(self, fnm, zoomin=3, page_from=0,
               page_to=299, callback=None):
    """Render pages, collect characters and outlines, and OCR every page.

    Resets the per-document state, renders pages ``page_from:page_to`` to
    images at ``72 * zoomin`` dpi, keeps the characters accepted by
    ``self._has_color``, reads the outline through ``pdf2_read``, and runs
    ``self.__ocr`` on each page.  ``self.page_cum_height`` ends up as the
    cumulative page heights with a leading 0.

    Args:
        fnm: path string, or the PDF content as bytes.
        zoomin: rendering zoom factor.
        page_from, page_to: slice of pages to process.
        callback: optional progress hook, called as
            ``callback(prog=..., msg="")`` after every sixth page.

    Changes from the earlier version: unused ``timer()`` locals were
    removed, a placeholder-less f-string became a plain string, and
    ``any([c for c in ...])`` became ``any(...)``.
    """
    self.lefted_chars = []
    self.mean_height = []
    self.mean_width = []
    self.boxes = []
    self.garbages = {}
    self.page_cum_height = [0]
    self.page_layout = []
    self.page_from = page_from
    try:
        self.pdf = pdfplumber.open(fnm) if isinstance(
            fnm, str) else pdfplumber.open(BytesIO(fnm))
        self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                            enumerate(self.pdf.pages[page_from:page_to])]
        self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in
                           self.pdf.pages[page_from:page_to]]
        self.total_page = len(self.pdf.pages)
    except Exception as e:
        logging.error(str(e))

    self.outlines = []
    try:
        # self.pdf is rebound to the outline reader from here on.
        self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
        outlines = self.pdf.outline

        def dfs(arr, depth):
            # Nested lists are sub-levels; dict entries are outline items.
            for a in arr:
                if isinstance(a, dict):
                    self.outlines.append((a["/Title"], depth))
                    continue
                dfs(a, depth + 1)

        dfs(outlines, 0)
    except Exception as e:
        logging.warning(f"Outlines exception: {e}")
    if not self.outlines:
        logging.warning("Miss outlines")

    logging.info("Images converted.")
    # Per page: does a random sample of characters contain a run of 30+
    # ASCII letters/digits/punctuation?
    self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
        random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
        range(len(self.page_chars))]
    if sum([1 if e else 0 for e in self.is_english]) > len(
            self.page_images) / 2:
        self.is_english = True
    else:
        self.is_english = False
    # NOTE(review): the vote above is unconditionally overridden here, so
    # embedded characters are always used below — confirm this is intended.
    self.is_english = False

    for i, img in enumerate(self.page_images):
        chars = self.page_chars[i] if not self.is_english else []
        self.mean_height.append(
            np.median(sorted([c["height"] for c in chars])) if chars else 0
        )
        self.mean_width.append(
            np.median(sorted([c["width"] for c in chars])) if chars else 8
        )
        self.page_cum_height.append(img.size[1] / zoomin)
        j = 0
        while j + 1 < len(chars):
            # Insert a space where two ASCII characters are at least half a
            # character width apart.
            if chars[j]["text"] and chars[j + 1]["text"] \
                    and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                    and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                   chars[j]["width"]) / 2:
                chars[j]["text"] += " "
            j += 1

        self.__ocr(i + 1, img, chars, zoomin)
        if callback and i % 6 == 5:
            callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")

    # No embedded characters at all: decide the language from OCR text.
    if not self.is_english and not any(self.page_chars) and self.boxes:
        bxes = [b for bxs in self.boxes for b in bxs]
        self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                    "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))

    logging.info(f"Is it English: {self.is_english}")

    self.page_cum_height = np.cumsum(self.page_cum_height)
    assert len(self.page_cum_height) == len(self.page_images) + 1
1012
+
1013
+ def __call__(self, fnm, need_image=False, zoomin=3, return_html=False):
1014
+ self.__images__(fnm, zoomin)
1015
+ self._layouts_rec(zoomin)
1016
+ self._table_transformer_job(zoomin)
1017
+ self._text_merge()
1018
+ self._concat_downward()
1019
+ self._filter_forpages()
1020
+ tbls = self._extract_table_figure(
1021
+ need_image, zoomin, return_html, False)
1022
+ return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
1023
+
1024
def remove_tag(self, txt):
    """Return ``txt`` with every ``@@...##`` position tag removed."""
    tag_pattern = re.compile(r"@@[\t0-9.-]+?##")
    return tag_pattern.sub("", txt)
1026
+
1027
def crop(self, text, ZM=3, need_position=False):
    """Render the page regions referenced by the position tags in ``text``.

    Every ``@@pages\\tx0\\tx1\\ttop\\tbottom##`` tag becomes one or more crops
    (one per page it spans).  A dimmed strip of up to 120 units above the
    first region and below the last one is added as context.  All crops are
    stacked vertically with a small gap on a light-grey canvas.

    Returns:
        The stacked image, or ``(image, positions)`` when ``need_position``
        is set; ``positions`` lists ``(page, left, right, top, bottom)`` for
        the tagged regions only.  ``None`` (or ``(None, None)``) when
        ``text`` carries no tag.
    """
    GAP = 6
    regions = []
    for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
        pages, x0, x1, y0, y1 = tag.strip("#").strip("@").split("\t")
        x0, x1, y0, y1 = float(x0), float(x1), float(y0), float(y1)
        regions.append(([int(p) - 1 for p in pages.split("-")], x0, x1, y0, y1))
    if not regions:
        if need_position:
            return None, None
        return

    max_width = max(
        np.max([x1 - x0 for (_, x0, x1, _, _) in regions]), 6)

    # Context strip above the first region ...
    head = regions[0]
    regions.insert(0, ([head[0][0]], head[1], head[2],
                       max(0, head[3] - 120), max(head[3] - GAP, 0)))
    # ... and below the last one, clipped to the page height.
    tail = regions[-1]
    tail_page_height = self.page_images[tail[0][-1]].size[1] / ZM
    regions.append(([tail[0][-1]], tail[1], tail[2],
                    min(tail_page_height, tail[4] + GAP),
                    min(tail_page_height, tail[4] + 120)))

    crops = []
    positions = []
    last_index = len(regions) - 1
    for idx, (pns, left, right, top, bottom) in enumerate(regions):
        tagged = 0 < idx < last_index
        right = left + max_width
        # `bottom` is relative to the last page; express it in pixels from
        # the top of the first page by adding the pages in between.
        bottom *= ZM
        for pn in pns[1:]:
            bottom += self.page_images[pn - 1].size[1]

        first_page = self.page_images[pns[0]]
        crops.append(first_page.crop(
            (left * ZM, top * ZM, right * ZM, min(bottom, first_page.size[1]))))
        if tagged:
            positions.append((pns[0] + self.page_from, left, right, top,
                              min(bottom, first_page.size[1]) / ZM))
        bottom -= first_page.size[1]

        for pn in pns[1:]:
            page = self.page_images[pn]
            crops.append(page.crop(
                (left * ZM, 0, right * ZM, min(bottom, page.size[1]))))
            if tagged:
                positions.append((pn + self.page_from, left, right, 0,
                                  min(bottom, page.size[1]) / ZM))
            bottom -= page.size[1]

    if not crops:
        if need_position:
            return None, None
        return

    canvas_height = int(sum(piece.size[1] + GAP for piece in crops))
    canvas_width = int(np.max([piece.size[0] for piece in crops]))
    pic = Image.new("RGB",
                    (canvas_width, canvas_height),
                    (245, 245, 245))
    y_offset = 0
    for idx, piece in enumerate(crops):
        if idx == 0 or idx + 1 == len(crops):
            # Dim the context strips so the tagged regions stand out.
            piece = piece.convert('RGBA')
            overlay = Image.new('RGBA', piece.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            piece = Image.alpha_composite(piece, overlay).convert("RGB")
        pic.paste(piece, (0, int(y_offset)))
        y_offset += piece.size[1] + GAP

    if need_position:
        return pic, positions
    return pic
1107
+
1108
def get_position(self, bx, ZM):
    """Split a box into per-page ``(page, x0, x1, top, bottom)`` tuples.

    ``top``/``bottom`` are made page-relative using ``self.page_cum_height``.
    While the bottom still lies below the current page image, the remainder
    is carried onto the next page, which starts at ``top == 0``.  Each
    tuple's bottom is clipped to that page's height.
    """
    page_no = bx["page_number"]
    offset = self.page_cum_height[page_no - 1]
    top = bx["top"] - offset
    bott = bx["bottom"] - offset

    def entry():
        page_height = self.page_images[page_no - 1].size[1] / ZM
        return (page_no, bx["x0"], bx["x1"], top, min(bott, page_height))

    spans = [entry()]
    while bott * ZM > self.page_images[page_no - 1].size[1]:
        bott -= self.page_images[page_no - 1].size[1] / ZM
        top = 0
        page_no += 1
        spans.append(entry())
    return spans
1122
+
1123
+
1124
class PlainParser(object):
    """Text-only PDF reader: extracts page text lines and the outline.

    No layout analysis, OCR or cropping is performed; ``crop`` and
    ``remove_tag`` exist only to mirror the full parser's interface and
    raise ``NotImplementedError``.
    """

    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        """Return ``([(line, ""), ...], [])`` for pages ``from_page:to_page``.

        Args:
            filename: path string, or the PDF content as bytes.
            from_page, to_page: slice of pages to read.
            **kwargs: accepted and ignored.

        Failures are logged rather than raised.  Text-extraction errors are
        now reported as such; previously they were logged as "Outlines
        exception" because both steps shared one ``try`` block.  The outline
        is stored in ``self.outlines`` as ``(title, depth)`` tuples.
        """
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend(page.extract_text().split("\n"))
        except Exception as e:
            logging.warning(f"Text extraction exception: {e}")
        else:
            # Only look for an outline once the document was read.
            try:
                outlines = self.pdf.outline

                def dfs(arr, depth):
                    # Nested lists are sub-levels; dicts are outline items.
                    for a in arr:
                        if isinstance(a, dict):
                            self.outlines.append((a["/Title"], depth))
                            continue
                        dfs(a, depth + 1)

                dfs(outlines, 0)
            except Exception as e:
                logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Miss outlines")

        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        """Not supported by the plain parser."""
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        """Not supported by the plain parser."""
        raise NotImplementedError
1158
+
1159
+
1160
# Script entry point: intentionally a no-op placeholder.
+ if __name__ == "__main__":
1161
+ pass
deepdoc/parser/ppt_parser.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ from io import BytesIO
14
+ from pptx import Presentation
15
+
16
+
17
class RAGFlowPptParser(object):
    """Extract the text of PowerPoint slides, one string per slide.

    Shapes are read top-to-bottom (in bands of 10 position units) and then
    left-to-right.  Tables are flattened to ``"header: value; ..."`` lines,
    and grouped shapes are walked recursively.
    """

    # shape_type values handled specially by __extract.
    # NOTE(review): presumably python-pptx MSO_SHAPE_TYPE.TABLE and .GROUP — confirm.
    _TABLE_SHAPE_TYPE = 19
    _GROUP_SHAPE_TYPE = 6

    def __init__(self):
        super().__init__()

    @staticmethod
    def _reading_order(shape):
        """Sort key: 10-unit vertical band, then horizontal position.

        A missing (``None``) ``top`` or ``left`` is treated as 0; the earlier
        key raised TypeError on such shapes.
        """
        top = shape.top if shape.top is not None else 0
        left = shape.left if shape.left is not None else 0
        return top // 10, left

    def __extract(self, shape):
        """Return the text of one shape, or None when it carries no text."""
        if shape.shape_type == self._TABLE_SHAPE_TYPE:
            tb = shape.table
            rows = []
            # Row 0 is used as the header for every following row.
            for i in range(1, len(tb.rows)):
                rows.append("; ".join([tb.cell(
                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
            return "\n".join(rows)

        if shape.has_text_frame:
            return shape.text_frame.text

        if shape.shape_type == self._GROUP_SHAPE_TYPE:
            texts = []
            for p in sorted(shape.shapes, key=self._reading_order):
                t = self.__extract(p)
                if t:
                    texts.append(t)
            return "\n".join(texts)

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Return the text of slides ``from_page`` (inclusive) to ``to_page``.

        Args:
            fnm: path string, or the presentation content as bytes.
            from_page, to_page: 0-based slide range; ``to_page`` is exclusive.
            callback: accepted for interface compatibility; not used.

        Also stores the total slide count in ``self.total_page``.
        """
        ppt = Presentation(fnm) if isinstance(
            fnm, str) else Presentation(
            BytesIO(fnm))
        txts = []
        self.total_page = len(ppt.slides)
        for i, slide in enumerate(ppt.slides):
            if i < from_page:
                continue
            if i >= to_page:
                break
            texts = []
            for shape in sorted(slide.shapes, key=self._reading_order):
                txt = self.__extract(shape)
                if txt:
                    texts.append(txt)
            txts.append("\n".join(texts))

        return txts
deepdoc/parser/readme_parse.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 使用 `RAGFlowPdfParser` 类可以从 PDF 文件中解析出文本和表格等内容。以下是如何使用这个类的步骤:
2
+
3
+ 1. **初始化解析器**:
4
+ 创建 `RAGFlowPdfParser` 的实例。
5
+ ```python
6
+ pdf_parser = RAGFlowPdfParser()
7
+ ```
8
+
9
+ 2. **调用解析器**:
10
+ 使用 PDF 文件的路径或文件对象调用解析器。你可以指定是否需要图像、放大比例、是否返回 HTML 格式的表格等参数。
11
+ ```python
12
+ pdf_file_path = 'path_to_your_pdf_file.pdf'
13
+ text_content, tables = pdf_parser(pdf_file_path, need_image=True, zoomin=3, return_html=False)
14
+ ```
15
+
16
+ 这里 `text_content` 将包含从 PDF 提取的文本,而 `tables` 将包含提取的表格。
17
+
18
+ 3. **处理结果**:
19
+ `text_content` 和 `tables` 包含解析结果,可以根据需要进一步处理。例如,可以打印文本内容或者处理表格数据。
20
+
21
+ 4. **高级功能**:
22
+ - 使用 `crop` 方法从 PDF 中裁剪特定文本或区域。
23
+ - 使用 `remove_tag` 方法移除提取文本中的特定标记。
24
+
25
+ 这个类高度依赖外部库和自定义的模型,包括 OCR、布局识别和表格结构识别等。确保这些依赖和相关资源都已正确安装和配置。
26
+
27
+ 此外,如果有特定的需求或需要处理的特殊 PDF 文件格式,可能还需要对解析器进行适当的调整或优化。
deepdoc/parser/readpdf.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from deepdoc.parser import RAGFlowPdfParser, PlainParser
2
+ import os
3
+ from PIL import Image
4
+ import json
5
+ class PDFprocess():
6
+ def __init__(self, repodir, workdir):
7
+ self.repodir = repodir
8
+ self.preprocessdir = os.path.join(workdir,'preprocess')
9
+ self.pdf_parser = RAGFlowPdfParser()
10
+ # self.plain_parser = PlainParser()
11
+
12
+ def _save_image(self,image, path, name):
13
+ """ 保存图片到指定路径 """
14
+ if not os.path.exists(path):
15
+ os.makedirs(path)
16
+ image_path = os.path.join(path, name)
17
+ image.save(image_path)
18
+ return image_path
19
+
20
+ def save_all_image(self,preprocessdir,tables):
21
+ image_folder = os.path.join(preprocessdir,'saved_images')
22
+ # 假设 res 中包含了图片对象和其他数据
23
+ for index, data in enumerate(tables):
24
+ image, text = data # 假设 data 结构是这样的
25
+ image_path = self._save_image(image, image_folder, f'image_{index}.png')
26
+ # relative_path = os.path.relpath(image_path, preprocessdir)
27
+ tables[index] = (image_path, text) # 更新 res 中的图片对象为图片路径
28
+ return tables
29
+
30
+ def create_html_file(self,tables, html_file_path):
31
+ html_content = '<html><body>\n'
32
+ for index, data in enumerate(tables):
33
+ image_path, text = data
34
+ # 创建图片链接和文本
35
+ html_content += f'<img src="{image_path}" alt="Image">\n{text}\n'
36
+ html_content += '</body></html>'
37
+
38
+ # 写入 HTML 文件
39
+ with open(html_file_path, 'w') as file:
40
+ file.write(html_content)
41
+ def process(self, pdffilename):
42
+ pdf_file_path = os.path.join(self.repodir,pdffilename)
43
+ text_content, tables = self.pdf_parser(pdf_file_path, need_image=False, zoomin=3, return_html=True)
44
+
45
+ text_file_path = os.path.join(self.preprocessdir,pdffilename.replace('.pdf','.txt'))
46
+ with open(text_file_path, 'w') as f:
47
+ f.write(text_content)
48
+
49
+ image_folder = os.path.join(self.preprocessdir,f'{pdffilename}_images')
50
+ tables = self.save_all_image(image_folder,tables)
51
+
52
+ html_file_path = os.path.join(self.preprocessdir,pdffilename.replace('.pdf','.html'))
53
+ self.create_html_file(tables, html_file_path)
54
+
55
+ json_file_path = os.path.join(self.preprocessdir,pdffilename.replace('.pdf','.json'))
56
+ with open(json_file_path, 'w') as f:
57
+ json.dump(tables, f, indent=4, ensure_ascii=False)
58
if __name__ == '__main__':
    # Manual smoke run against a developer-local directory.
    repodir = '/Users/chen/Downloads/ReviewAgent2'
    preprocessdir = '/Users/chen/Downloads/ReviewAgent2/preprocess'
    # NOTE(review): PDFprocess appends 'preprocess' to its second argument,
    # so output lands in .../preprocess/preprocess -- confirm this is wanted.
    pdfprocess = PDFprocess(repodir, preprocessdir)
    pdfprocess.process('test.pdf')
deepdoc/parser/resume/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+
3
+
4
def refactor(cv):
    """Normalize a parsed resume dict in place and return it.

    Steps, in order:

    1. Drop the bookkeeping keys (``raw_txt``, ``parser_name``, ...) whose
       value is not ``None`` and set ``is_deleted`` to 0.
    2. Make sure ``cv["basic"]`` is a dict and drop a truthy ``photo2``.
    3. Turn each list/dict section (``education``, ``work``, ...) into a
       dict keyed by ``"0"``, ``"1"``, ...; remove non-``None``
       ``external`` fields from the entries; delete sections that are
       neither a list nor a dict. Sections that are ``None`` are left as is.
    4. Rename the salary fields listed in ``basics``.
    5. Copy summary fields of the earliest/latest work entry and the latest
       education entry (ordered by ``start_time``) into ``cv["basic"]``.
    6. Stamp ``updated_at`` and make sure ``cv["contact"]["name"]`` is set.

    Args:
        cv: the resume dict; it is mutated.

    Returns:
        The same *cv* object.
    """

    def _start_time(entry):
        # A None start_time cannot be compared with strings while sorting.
        value = entry.get("start_time", "")
        return "" if value is None else value

    for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
        if n in cv and cv[n] is not None:
            del cv[n]
    cv["is_deleted"] = 0
    if cv.get("basic") is None:
        cv["basic"] = {}
    if cv["basic"].get("photo2"):
        del cv["basic"]["photo2"]

    for n in ["education", "work", "certificate", "project", "language", "skill", "training"]:
        if n not in cv or cv[n] is None:
            continue
        if isinstance(cv[n], dict):
            cv[n] = list(cv[n].values())
        if not isinstance(cv[n], list):
            del cv[n]
            continue
        vv = []
        for v in cv[n]:
            if "external" in v and v["external"] is not None:
                del v["external"]
            vv.append(v)
        cv[n] = {str(i): v for i, v in enumerate(vv)}

    basics = [
        ("basic_salary_month", "salary_month"),
        ("expect_annual_salary_from", "expect_annual_salary"),
    ]
    for n, t in basics:
        if cv["basic"].get(n):
            cv["basic"][t] = cv["basic"][n]
            del cv["basic"][n]

    # A section may still be None here (the loop above skips None values),
    # so fall back to an empty dict instead of calling .values() on None.
    work = sorted((cv.get("work") or {}).values(), key=_start_time)
    edu = sorted((cv.get("education") or {}).values(), key=_start_time)

    if work:
        cv["basic"]["work_start_time"] = work[0].get("start_time", "")
        cv["basic"]["management_experience"] = 'Y' if any(
            w.get("management_experience", '') == 'Y' for w in work) else 'N'
        cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")

        for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities",
                  "corporation_type", "scale", "corporation_name"]:
            cv["basic"][n] = work[-1].get(n, "")

    if edu:
        for n in ["school_name", "discipline_name"]:
            if n in edu[-1]:
                cv["basic"][n] = edu[-1][n]

    cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    if cv.get("contact") is None:
        cv["contact"] = {}
    if not cv["contact"].get("name"):
        cv["contact"]["name"] = cv["basic"].get("name", "")
    return cv
deepdoc/parser/resume/entities/__init__.py ADDED
File without changes
deepdoc/parser/resume/entities/corporations.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re,json,os
2
+ import pandas as pd
3
+ from rag.nlp import rag_tokenizer
4
+ from . import regions
5
# Directory of this module; the data files live in its "res" sub-directory.
current_file_path = os.path.dirname(os.path.abspath(__file__))


def _load_json_resource(name):
    """Load ``res/<name>`` as JSON.

    The file is read as UTF-8 and closed deterministically, instead of
    relying on the locale's default encoding and on garbage collection.
    """
    with open(os.path.join(current_file_path, "res", name), "r", encoding="utf-8") as f:
        return json.load(f)


# Tab-separated table indexed by the string form of its "cid" column;
# missing values are replaced by 0.
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])
CORP_TKS = _load_json_resource("corp.tks.freq.json")
GOOD_CORP = _load_json_resource("good_corp.json")
CORP_TAG = _load_json_resource("corp_tag.json")
12
+
13
def baike(cid, default_v=0):
    """Return the ``len`` value stored in ``GOODS`` for *cid*.

    *cid* is converted to ``str`` before the lookup. If the lookup fails
    for any reason, *default_v* is returned instead.
    """
    key = str(cid)
    try:
        value = GOODS.loc[key, "len"]
    except Exception:
        # Best-effort lookup: every failure falls back to the default.
        return default_v
    return value
20
+
21
+
22
def corpNorm(nm, add_region=True):
    """Normalize a company name to a compact, comparable form.

    The name is passed through ``rag_tokenizer.strQ2B`` and
    ``rag_tokenizer.tradi2simp``, lower-cased, stripped of bracket/quote
    punctuation, of trailing "co./corp./inc./ltd" style tails and of common
    Chinese company suffixes. It is then tokenized, and tokens that
    ``regions.isName`` accepts or that appear in ``CORP_TKS`` are dropped.

    Args:
        nm: the raw company name. Anything falsy or not a ``str`` yields "".
        add_region: when true and a region token was found, append the first
            one as ``"(region)"``.

    Returns:
        The normalized name.
    """
    if not nm or not isinstance(nm, str):
        return ""
    nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
    nm = re.sub(r"&amp;", "&", nm)
    nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
    # count/flags are passed by keyword: passing them positionally is
    # deprecated since Python 3.13.
    nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, count=10000, flags=re.IGNORECASE)
    nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, count=10000, flags=re.IGNORECASE)
    # Very short names are returned as they are unless their first two
    # characters are accepted by regions.isName.
    if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
        return nm

    tks = rag_tokenizer.tokenize(nm).split(" ")
    # Region tokens; a leading "中国" is not counted as a region.
    reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
    nm = ""
    for t in tks:
        if regions.isName(t) or t in CORP_TKS:
            continue
        # Keep a space between consecutive latin/digit tokens.
        if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
            nm += " "
        nm += t

    # Non-latin head followed by a latin tail: keep only the head.
    r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
    if r:
        nm = r.group(1)
    # Latin head followed by a non-latin tail: keep only the head.
    r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
    if r:
        nm = r.group(1)
    return nm.strip() + (("" if not reg else "(%s)" % reg[0]) if add_region else "")
45
+
46
+
47
def rmNoise(n):
    """Remove parenthesised fragments, then separator punctuation, from *n*.

    First every ``(...)`` group that contains no nested parenthesis is
    deleted; afterwards all commas, dots, spaces, ampersands and leftover
    parentheses are deleted.
    """
    without_groups = re.sub(r"[\((][^()()]+[))]", "", n)
    return re.sub(r"[,. &()()]+", "", without_groups)
51
+
52
# Replace the raw name lists by their normalized forms (region suffix off).
GOOD_CORP = {corpNorm(rmNoise(name), False) for name in GOOD_CORP}

# Report tag entries whose name normalizes to an empty string.
for c, v in CORP_TAG.items():
    cc = corpNorm(rmNoise(c), False)
    if cc:
        continue
    print(c)

CORP_TAG = {corpNorm(rmNoise(name), False): tags for name, tags in CORP_TAG.items()}
57
+
58
def is_good(nm):
    """Return whether *nm* matches an entry of ``GOOD_CORP``.

    Names containing "外派" are rejected outright. Otherwise the name is
    normalized with ``rmNoise`` and ``corpNorm`` and compared with every
    entry: purely alphanumeric entries must be equal to the name, all other
    entries only need to occur inside it.
    """
    if "外派" in nm:
        return False
    nm = corpNorm(rmNoise(nm), False)
    for n in GOOD_CORP:
        # An entry that normalized to "" is a substring of every name and
        # would make every company "good"; ignore it.
        if not n:
            continue
        if re.match(r"[0-9a-zA-Z]+$", n):
            if n == nm:
                return True
        elif n in nm:
            return True
    return False
68
+
69
def corp_tag(nm):
    """Return the ``CORP_TAG`` value of the first entry matching *nm*.

    The name is normalized with ``rmNoise`` and ``corpNorm``. Keys made only
    of latin letters, digits, dots, commas and spaces must be equal to the
    name; other keys only need to occur inside it, except that a key shorter
    than 3 characters is skipped when the name is at least twice as long.

    Returns:
        The matching value, or ``[]`` when nothing matches.
    """
    nm = corpNorm(rmNoise(nm), False)
    for n, tags in CORP_TAG.items():
        # A key that normalized to "" would match every name and then
        # divide by zero in the length-ratio check below; ignore it.
        if not n:
            continue
        if re.match(r"[0-9a-zA-Z., ]+$", n):
            if n == nm:
                return tags
        elif n in nm:
            if len(n) < 3 and len(nm) / len(n) >= 2:
                continue
            return tags
    return []
80
+
deepdoc/parser/resume/entities/degrees.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Degree code -> degree name.
TBL = {
    "94": "EMBA",
    "6": "MBA",
    "95": "MPA",
    "92": "专升本",
    "4": "专科",
    "90": "中专",
    "91": "中技",
    "86": "初中",
    "3": "博士",
    "10": "博士后",
    "1": "本科",
    "2": "硕士",
    "87": "职高",
    "89": "高中",
}

# Degree name -> degree code (inverse of TBL).
TBL_ = {name: code for code, name in TBL.items()}


def get_name(id):
    """Return the degree name for code *id*, or "" when the code is unknown."""
    code = str(id)
    return TBL.get(code, "")


def get_id(nm):
    """Return the degree code for name *nm*, or "" when it is empty/unknown.

    The name is upper-cased and stripped of surrounding whitespace first.
    """
    if nm:
        return TBL_.get(nm.upper().strip(), "")
    return ""
deepdoc/parser/resume/entities/industries.py ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ TBL = {"1":{"name":"IT/通信/电子","parent":"0"},
3
+ "2":{"name":"互联网","parent":"0"},
4
+ "3":{"name":"电子商务","parent":"2"},
5
+ "4":{"name":"互联网金融","parent":"2"},
6
+ "5":{"name":"网络游戏","parent":"2"},
7
+ "6":{"name":"社交网络平台","parent":"2"},
8
+ "7":{"name":"视频音乐","parent":"2"},
9
+ "9":{"name":"安全","parent":"2"},
10
+ "10":{"name":"云计算","parent":"2"},
11
+ "12":{"name":"工具类客户端应用","parent":"2"},
12
+ "13":{"name":"互联网广告","parent":"2"},
13
+ "14":{"name":"企业互联网服务","parent":"2"},
14
+ "16":{"name":"在线教育","parent":"2"},
15
+ "17":{"name":"在线医疗","parent":"2"},
16
+ "19":{"name":"B2B","parent":"3"},
17
+ "20":{"name":"B2C","parent":"3"},
18
+ "21":{"name":"C2C","parent":"3"},
19
+ "22":{"name":"生活信息本地化","parent":"3"},
20
+ "23":{"name":"在线旅游","parent":"2"},
21
+ "24":{"name":"第三方支付","parent":"4"},
22
+ "26":{"name":"客户端游戏","parent":"5"},
23
+ "27":{"name":"网页游戏","parent":"5"},
24
+ "28":{"name":"手机游戏","parent":"5"},
25
+ "29":{"name":"微博","parent":"6"},
26
+ "30":{"name":"社交网站","parent":"6"},
27
+ "31":{"name":"在线视频","parent":"7"},
28
+ "32":{"name":"在线音乐","parent":"7"},
29
+ "35":{"name":"企业安全","parent":"9"},
30
+ "36":{"name":"个人安全","parent":"9"},
31
+ "37":{"name":"企业级云服务","parent":"10"},
32
+ "38":{"name":"个人级云服务","parent":"10"},
33
+ "43":{"name":"输入法","parent":"12"},
34
+ "44":{"name":"浏览器","parent":"12"},
35
+ "45":{"name":"词典","parent":"12"},
36
+ "46":{"name":"播放器","parent":"12"},
37
+ "47":{"name":"下载器","parent":"12"},
38
+ "48":{"name":"IM","parent":"12"},
39
+ "49":{"name":"广告服务","parent":"13"},
40
+ "50":{"name":"第三方广告网络平台","parent":"13"},
41
+ "51":{"name":"媒体代理","parent":"13"},
42
+ "52":{"name":"创意代理","parent":"13"},
43
+ "53":{"name":"IT-综合","parent":"1"},
44
+ "71":{"name":"团购","parent":"3"},
45
+ "72":{"name":"地图","parent":"2"},
46
+ "73":{"name":"数据存储","parent":"2"},
47
+ "414":{"name":"计算机软件","parent":"1"},
48
+ "415":{"name":"计算机硬件","parent":"1"},
49
+ "416":{"name":"计算机服务(系统、数据服务、维修)","parent":"1"},
50
+ "417":{"name":"通信/电信/网络设备","parent":"1"},
51
+ "418":{"name":"通信/电信运营、增值服务","parent":"1"},
52
+ "419":{"name":"电子技术/半导体/集成电路","parent":"1"},
53
+ "472":{"name":"P2P网贷","parent":"4"},
54
+ "473":{"name":"互联网理财","parent":"4"},
55
+ "474":{"name":"婚恋","parent":"6"},
56
+ "476":{"name":"虚拟化","parent":"10"},
57
+ "477":{"name":"邮箱","parent":"12"},
58
+ "478":{"name":"商业智能","parent":"14"},
59
+ "479":{"name":"企业建站","parent":"14"},
60
+ "480":{"name":"安防","parent":"14"},
61
+ "481":{"name":"网络营销","parent":"2"},
62
+ "487":{"name":"智能终端","parent":"2"},
63
+ "488":{"name":"移动互联网","parent":"2"},
64
+ "489":{"name":"数字城市","parent":"2"},
65
+ "490":{"name":"大数据","parent":"2"},
66
+ "491":{"name":"互联网人力资源","parent":"2"},
67
+ "492":{"name":"舆情监控","parent":"2"},
68
+ "493":{"name":"移动营销","parent":"481"},
69
+ "494":{"name":"微博营销","parent":"481"},
70
+ "495":{"name":"精准营销","parent":"481"},
71
+ "496":{"name":"海外营销","parent":"481"},
72
+ "497":{"name":"微信营销","parent":"481"},
73
+ "498":{"name":"智能手机","parent":"487"},
74
+ "499":{"name":"可穿戴设备","parent":"487"},
75
+ "500":{"name":"智能电视","parent":"487"},
76
+ "501":{"name":"WAP","parent":"488"},
77
+ "502":{"name":"物联网","parent":"489"},
78
+ "503":{"name":"O2O","parent":"489"},
79
+ "504":{"name":"数字出版","parent":"489"},
80
+ "505":{"name":"搜索","parent":"2"},
81
+ "506":{"name":"垂直搜索","parent":"505"},
82
+ "507":{"name":"无线搜索","parent":"505"},
83
+ "508":{"name":"网页搜索","parent":"505"},
84
+ "509":{"name":"网址导航","parent":"2"},
85
+ "510":{"name":"门户","parent":"2"},
86
+ "511":{"name":"网络文学","parent":"2"},
87
+ "512":{"name":"自媒体","parent":"2"},
88
+ "513":{"name":"金融","parent":"0"},
89
+ "514":{"name":"建筑与房地产","parent":"0"},
90
+ "515":{"name":"专业服务","parent":"0"},
91
+ "516":{"name":"教育培训","parent":"0"},
92
+ "517":{"name":"文化传媒","parent":"0"},
93
+ "518":{"name":"消费品","parent":"0"},
94
+ "519":{"name":"工业","parent":"0"},
95
+ "520":{"name":"交通物流","parent":"0"},
96
+ "521":{"name":"贸易","parent":"0"},
97
+ "522":{"name":"医药","parent":"0"},
98
+ "523":{"name":"医疗器械","parent":"522"},
99
+ "524":{"name":"保健品","parent":"518"},
100
+ "525":{"name":"服务业","parent":"0"},
101
+ "526":{"name":"能源/矿产/环保","parent":"0"},
102
+ "527":{"name":"化工","parent":"0"},
103
+ "528":{"name":"政府","parent":"0"},
104
+ "529":{"name":"公共事业","parent":"0"},
105
+ "530":{"name":"非盈利机构","parent":"0"},
106
+ "531":{"name":"农业","parent":"1131"},
107
+ "532":{"name":"林业","parent":"1131"},
108
+ "533":{"name":"畜牧业","parent":"1131"},
109
+ "534":{"name":"渔业","parent":"1131"},
110
+ "535":{"name":"学术科研","parent":"0"},
111
+ "536":{"name":"零售","parent":"0"},
112
+ "537":{"name":"银行","parent":"513"},
113
+ "538":{"name":"保险","parent":"513"},
114
+ "539":{"name":"证券","parent":"513"},
115
+ "540":{"name":"基金","parent":"513"},
116
+ "541":{"name":"信托","parent":"513"},
117
+ "542":{"name":"担保","parent":"513"},
118
+ "543":{"name":"典当","parent":"513"},
119
+ "544":{"name":"拍卖","parent":"513"},
120
+ "545":{"name":"投资/融资","parent":"513"},
121
+ "546":{"name":"期货","parent":"513"},
122
+ "547":{"name":"房地产开发","parent":"514"},
123
+ "548":{"name":"工程施工","parent":"514"},
124
+ "549":{"name":"建筑设计","parent":"514"},
125
+ "550":{"name":"房地产代理","parent":"514"},
126
+ "551":{"name":"物业管理","parent":"514"},
127
+ "552":{"name":"室内设计","parent":"514"},
128
+ "553":{"name":"装修装潢","parent":"514"},
129
+ "554":{"name":"市政工程","parent":"514"},
130
+ "555":{"name":"工程造价","parent":"514"},
131
+ "556":{"name":"工程监理","parent":"514"},
132
+ "557":{"name":"环境工程","parent":"514"},
133
+ "558":{"name":"园林景观","parent":"514"},
134
+ "559":{"name":"法律","parent":"515"},
135
+ "560":{"name":"人力资源","parent":"515"},
136
+ "561":{"name":"会计","parent":"1125"},
137
+ "562":{"name":"审计","parent":"515"},
138
+ "563":{"name":"检测认证","parent":"515"},
139
+ "565":{"name":"翻译","parent":"515"},
140
+ "566":{"name":"中介","parent":"515"},
141
+ "567":{"name":"咨询","parent":"515"},
142
+ "568":{"name":"外包服务","parent":"515"},
143
+ "569":{"name":"家教","parent":"516"},
144
+ "570":{"name":"早教","parent":"516"},
145
+ "571":{"name":"职业技能培训","parent":"516"},
146
+ "572":{"name":"外语培训","parent":"516"},
147
+ "573":{"name":"设计培训","parent":"516"},
148
+ "574":{"name":"IT培训","parent":"516"},
149
+ "575":{"name":"文艺体育培训","parent":"516"},
150
+ "576":{"name":"学历教育","parent":"516"},
151
+ "577":{"name":"管理培训","parent":"516"},
152
+ "578":{"name":"民办基础教育","parent":"516"},
153
+ "579":{"name":"广告","parent":"517"},
154
+ "580":{"name":"媒体","parent":"517"},
155
+ "581":{"name":"会展","parent":"517"},
156
+ "582":{"name":"公关","parent":"517"},
157
+ "583":{"name":"影视","parent":"517"},
158
+ "584":{"name":"艺术","parent":"517"},
159
+ "585":{"name":"文化传播","parent":"517"},
160
+ "586":{"name":"娱乐","parent":"517"},
161
+ "587":{"name":"体育","parent":"517"},
162
+ "588":{"name":"出版","parent":"517"},
163
+ "589":{"name":"休闲","parent":"517"},
164
+ "590":{"name":"动漫","parent":"517"},
165
+ "591":{"name":"市场推广","parent":"517"},
166
+ "592":{"name":"市场研究","parent":"517"},
167
+ "593":{"name":"食品","parent":"1129"},
168
+ "594":{"name":"饮料","parent":"1129"},
169
+ "595":{"name":"烟草","parent":"1129"},
170
+ "596":{"name":"酒品","parent":"518"},
171
+ "597":{"name":"服饰","parent":"518"},
172
+ "598":{"name":"纺织","parent":"518"},
173
+ "599":{"name":"化妆品","parent":"1129"},
174
+ "600":{"name":"日用品","parent":"1129"},
175
+ "601":{"name":"家电","parent":"518"},
176
+ "602":{"name":"家具","parent":"518"},
177
+ "603":{"name":"办公用品","parent":"518"},
178
+ "604":{"name":"奢侈品","parent":"518"},
179
+ "605":{"name":"珠宝","parent":"518"},
180
+ "606":{"name":"数码产品","parent":"518"},
181
+ "607":{"name":"玩具","parent":"518"},
182
+ "608":{"name":"图书","parent":"518"},
183
+ "609":{"name":"音像","parent":"518"},
184
+ "610":{"name":"钟表","parent":"518"},
185
+ "611":{"name":"箱包","parent":"518"},
186
+ "612":{"name":"母婴","parent":"518"},
187
+ "613":{"name":"营养保健","parent":"518"},
188
+ "614":{"name":"户外用品","parent":"518"},
189
+ "615":{"name":"健身器材","parent":"518"},
190
+ "616":{"name":"乐器","parent":"518"},
191
+ "617":{"name":"汽车用品","parent":"518"},
192
+ "619":{"name":"厨具","parent":"518"},
193
+ "620":{"name":"机械制造","parent":"519"},
194
+ "621":{"name":"流体控制","parent":"519"},
195
+ "622":{"name":"自动化控制","parent":"519"},
196
+ "623":{"name":"仪器仪表","parent":"519"},
197
+ "624":{"name":"航空/航天","parent":"519"},
198
+ "625":{"name":"交通设施","parent":"519"},
199
+ "626":{"name":"工业电子","parent":"519"},
200
+ "627":{"name":"建材","parent":"519"},
201
+ "628":{"name":"五金材料","parent":"519"},
202
+ "629":{"name":"汽车","parent":"519"},
203
+ "630":{"name":"印刷","parent":"519"},
204
+ "631":{"name":"造纸","parent":"519"},
205
+ "632":{"name":"包装","parent":"519"},
206
+ "633":{"name":"原材料及加工","parent":"519"},
207
+ "634":{"name":"物流","parent":"520"},
208
+ "635":{"name":"仓储","parent":"520"},
209
+ "636":{"name":"客运","parent":"520"},
210
+ "637":{"name":"快递","parent":"520"},
211
+ "638":{"name":"化学药","parent":"522"},
212
+ "639":{"name":"中药","parent":"522"},
213
+ "640":{"name":"生物制药","parent":"522"},
214
+ "641":{"name":"兽药","parent":"522"},
215
+ "642":{"name":"农药","parent":"522"},
216
+ "643":{"name":"CRO","parent":"522"},
217
+ "644":{"name":"消毒","parent":"522"},
218
+ "645":{"name":"医药商业","parent":"522"},
219
+ "646":{"name":"医疗服务","parent":"522"},
220
+ "647":{"name":"医疗器械","parent":"523"},
221
+ "648":{"name":"制药设备","parent":"523"},
222
+ "649":{"name":"医用耗材","parent":"523"},
223
+ "650":{"name":"手术器械","parent":"523"},
224
+ "651":{"name":"保健器材","parent":"524"},
225
+ "652":{"name":"性保健品","parent":"524"},
226
+ "653":{"name":"医药保养","parent":"524"},
227
+ "654":{"name":"医用保健","parent":"524"},
228
+ "655":{"name":"酒店","parent":"525"},
229
+ "656":{"name":"餐饮","parent":"525"},
230
+ "657":{"name":"旅游","parent":"525"},
231
+ "658":{"name":"生活服务","parent":"525"},
232
+ "659":{"name":"保健服务","parent":"525"},
233
+ "660":{"name":"运动健身","parent":"525"},
234
+ "661":{"name":"家政服务","parent":"525"},
235
+ "662":{"name":"婚庆服务","parent":"525"},
236
+ "663":{"name":"租赁服务","parent":"525"},
237
+ "664":{"name":"维修服务","parent":"525"},
238
+ "665":{"name":"石油天然气","parent":"526"},
239
+ "666":{"name":"电力","parent":"526"},
240
+ "667":{"name":"新能源","parent":"526"},
241
+ "668":{"name":"水利","parent":"526"},
242
+ "669":{"name":"矿产","parent":"526"},
243
+ "670":{"name":"采掘业","parent":"526"},
244
+ "671":{"name":"冶炼","parent":"526"},
245
+ "672":{"name":"环保","parent":"526"},
246
+ "673":{"name":"无机化工原料","parent":"527"},
247
+ "674":{"name":"有机化工原料","parent":"527"},
248
+ "675":{"name":"精细化学品","parent":"527"},
249
+ "676":{"name":"化工设备","parent":"527"},
250
+ "677":{"name":"化工工程","parent":"527"},
251
+ "678":{"name":"资产管理","parent":"513"},
252
+ "679":{"name":"金融租赁","parent":"513"},
253
+ "680":{"name":"征信及信评机构","parent":"513"},
254
+ "681":{"name":"资产评估机构","parent":"513"},
255
+ "683":{"name":"金融监管机构","parent":"513"},
256
+ "684":{"name":"国际贸易","parent":"521"},
257
+ "685":{"name":"海关","parent":"521"},
258
+ "686":{"name":"购物中心","parent":"536"},
259
+ "687":{"name":"超市","parent":"536"},
260
+ "688":{"name":"便利店","parent":"536"},
261
+ "689":{"name":"专卖店","parent":"536"},
262
+ "690":{"name":"专业店","parent":"536"},
263
+ "691":{"name":"百货店","parent":"536"},
264
+ "692":{"name":"杂货店","parent":"536"},
265
+ "693":{"name":"个人银行","parent":"537"},
266
+ "695":{"name":"私人银行","parent":"537"},
267
+ "696":{"name":"公司银行","parent":"537"},
268
+ "697":{"name":"投资银行","parent":"537"},
269
+ "698":{"name":"政策性银行","parent":"537"},
270
+ "699":{"name":"中央银行","parent":"537"},
271
+ "700":{"name":"人寿险","parent":"538"},
272
+ "701":{"name":"财产险","parent":"538"},
273
+ "702":{"name":"再保险","parent":"538"},
274
+ "703":{"name":"养老险","parent":"538"},
275
+ "704":{"name":"保险代理公司","parent":"538"},
276
+ "705":{"name":"公募基金","parent":"540"},
277
+ "707":{"name":"私募基金","parent":"540"},
278
+ "708":{"name":"第三方理财","parent":"679"},
279
+ "709":{"name":"资产管理公司","parent":"679"},
280
+ "711":{"name":"房产中介","parent":"566"},
281
+ "712":{"name":"职业中介","parent":"566"},
282
+ "713":{"name":"婚姻中介","parent":"566"},
283
+ "714":{"name":"战略咨询","parent":"567"},
284
+ "715":{"name":"投资咨询","parent":"567"},
285
+ "716":{"name":"心理咨询","parent":"567"},
286
+ "717":{"name":"留学移民咨询","parent":"567"},
287
+ "718":{"name":"工商注册代理","parent":"568"},
288
+ "719":{"name":"商标专利代理","parent":"568"},
289
+ "720":{"name":"财务代理","parent":"568"},
290
+ "721":{"name":"工程机械","parent":"620"},
291
+ "722":{"name":"农业机械","parent":"620"},
292
+ "723":{"name":"海工设备","parent":"620"},
293
+ "724":{"name":"包装机械","parent":"620"},
294
+ "725":{"name":"印刷机械","parent":"620"},
295
+ "726":{"name":"数控机床","parent":"620"},
296
+ "727":{"name":"矿山机械","parent":"620"},
297
+ "728":{"name":"水泵","parent":"621"},
298
+ "729":{"name":"管道","parent":"621"},
299
+ "730":{"name":"阀门","parent":"621"},
300
+ "732":{"name":"压缩机","parent":"621"},
301
+ "733":{"name":"集散控制系统","parent":"622"},
302
+ "734":{"name":"远程控制","parent":"622"},
303
+ "735":{"name":"液压系统","parent":"622"},
304
+ "736":{"name":"楼宇智能化","parent":"622"},
305
+ "737":{"name":"飞机制造","parent":"624"},
306
+ "738":{"name":"航空公司","parent":"624"},
307
+ "739":{"name":"发动机","parent":"624"},
308
+ "740":{"name":"复合材料","parent":"624"},
309
+ "741":{"name":"高铁","parent":"625"},
310
+ "742":{"name":"地铁","parent":"625"},
311
+ "743":{"name":"信号传输","parent":"625"},
312
+ "745":{"name":"结构材料","parent":"627"},
313
+ "746":{"name":"装饰材料","parent":"627"},
314
+ "747":{"name":"专用材料","parent":"627"},
315
+ "749":{"name":"经销商集团","parent":"629"},
316
+ "750":{"name":"整车制造","parent":"629"},
317
+ "751":{"name":"汽车零配件","parent":"629"},
318
+ "752":{"name":"外型设计","parent":"629"},
319
+ "753":{"name":"平版印刷","parent":"630"},
320
+ "754":{"name":"凸版印刷","parent":"630"},
321
+ "755":{"name":"凹版印刷","parent":"630"},
322
+ "756":{"name":"孔版印刷","parent":"630"},
323
+ "757":{"name":"印刷用纸","parent":"631"},
324
+ "758":{"name":"书写、制图及复制用纸","parent":"631"},
325
+ "759":{"name":"包装用纸","parent":"631"},
326
+ "760":{"name":"生活、卫生及装饰用纸","parent":"631"},
327
+ "761":{"name":"技术用纸","parent":"631"},
328
+ "762":{"name":"加工纸原纸","parent":"631"},
329
+ "763":{"name":"食品包装","parent":"632"},
330
+ "764":{"name":"医药包装","parent":"632"},
331
+ "765":{"name":"日化包装","parent":"632"},
332
+ "766":{"name":"物流包装","parent":"632"},
333
+ "767":{"name":"礼品包装","parent":"632"},
334
+ "768":{"name":"电子五金包装","parent":"632"},
335
+ "769":{"name":"汽车服务","parent":"525"},
336
+ "770":{"name":"汽车保养","parent":"769"},
337
+ "771":{"name":"租车","parent":"769"},
338
+ "773":{"name":"出租车","parent":"769"},
339
+ "774":{"name":"代驾","parent":"769"},
340
+ "775":{"name":"发电","parent":"666"},
341
+ "777":{"name":"输配电","parent":"666"},
342
+ "779":{"name":"风电","parent":"667"},
343
+ "780":{"name":"光伏/太阳能","parent":"667"},
344
+ "781":{"name":"生物质发电","parent":"667"},
345
+ "782":{"name":"煤化工","parent":"667"},
346
+ "783":{"name":"垃圾发电","parent":"667"},
347
+ "784":{"name":"核电","parent":"667"},
348
+ "785":{"name":"能源矿产","parent":"669"},
349
+ "786":{"name":"金属矿产","parent":"669"},
350
+ "787":{"name":"非金属矿产","parent":"669"},
351
+ "788":{"name":"水气矿产","parent":"669"},
352
+ "789":{"name":"锅炉","parent":"775"},
353
+ "790":{"name":"发电机","parent":"775"},
354
+ "791":{"name":"汽轮机","parent":"775"},
355
+ "792":{"name":"燃机","parent":"775"},
356
+ "793":{"name":"冷却","parent":"775"},
357
+ "794":{"name":"电力设计院","parent":"775"},
358
+ "795":{"name":"高压输配电","parent":"777"},
359
+ "796":{"name":"中压输配电","parent":"777"},
360
+ "797":{"name":"低压输配电","parent":"777"},
361
+ "798":{"name":"继电保护","parent":"777"},
362
+ "799":{"name":"智能电网","parent":"777"},
363
+ "800":{"name":"小学","parent":"516"},
364
+ "801":{"name":"电动车","parent":"519"},
365
+ "802":{"name":"皮具箱包","parent":"518"},
366
+ "803":{"name":"医药制造","parent":"522"},
367
+ "804":{"name":"电器销售","parent":"536"},
368
+ "805":{"name":"塑料制品","parent":"527"},
369
+ "806":{"name":"公益基金会","parent":"530"},
370
+ "807":{"name":"美发服务","parent":"525"},
371
+ "808":{"name":"农业养殖","parent":"531"},
372
+ "809":{"name":"金融服务","parent":"513"},
373
+ "810":{"name":"商业地产综合体","parent":"514"},
374
+ "811":{"name":"美容服务","parent":"525"},
375
+ "812":{"name":"灯饰","parent":"518"},
376
+ "813":{"name":"油墨颜料产品","parent":"527"},
377
+ "814":{"name":"眼镜制造","parent":"518"},
378
+ "815":{"name":"农业生物技术","parent":"531"},
379
+ "816":{"name":"体育用品","parent":"518"},
380
+ "817":{"name":"保健用品","parent":"524"},
381
+ "818":{"name":"化学化工产品","parent":"527"},
382
+ "819":{"name":"饲料","parent":"531"},
383
+ "821":{"name":"保安服务","parent":"525"},
384
+ "822":{"name":"干细胞技术","parent":"522"},
385
+ "824":{"name":"农药化肥","parent":"527"},
386
+ "825":{"name":"卫生洁具","parent":"518"},
387
+ "826":{"name":"体育器材、场馆","parent":"518"},
388
+ "827":{"name":"饲料加工","parent":"531"},
389
+ "828":{"name":"测绘服务","parent":"529"},
390
+ "830":{"name":"金属船舶制造","parent":"519"},
391
+ "831":{"name":"基因工程","parent":"522"},
392
+ "832":{"name":"花卉服务","parent":"536"},
393
+ "833":{"name":"农业种植","parent":"531"},
394
+ "834":{"name":"皮革制品","parent":"518"},
395
+ "835":{"name":"地理信息加工服务","parent":"529"},
396
+ "836":{"name":"机器人","parent":"519"},
397
+ "837":{"name":"礼品","parent":"518"},
398
+ "838":{"name":"理发及美容服务","parent":"525"},
399
+ "839":{"name":"其他清洁服务","parent":"525"},
400
+ "840":{"name":"硅胶材料","parent":"527"},
401
+ "841":{"name":"茶叶销售","parent":"518"},
402
+ "842":{"name":"彩票活动","parent":"529"},
403
+ "843":{"name":"化妆培训","parent":"516"},
404
+ "844":{"name":"鞋业","parent":"518"},
405
+ "845":{"name":"酒店用品","parent":"518"},
406
+ "846":{"name":"复合材料","parent":"527"},
407
+ "847":{"name":"房地产工程建设","parent":"548"},
408
+ "848":{"name":"知识产权服务","parent":"559"},
409
+ "849":{"name":"新型建材","parent":"627"},
410
+ "850":{"name":"企业投资咨询","parent":"567"},
411
+ "851":{"name":"含乳饮料和植物蛋白饮料制造","parent":"594"},
412
+ "852":{"name":"汽车检测设备","parent":"629"},
413
+ "853":{"name":"手机通讯器材","parent":"417"},
414
+ "854":{"name":"环保材料","parent":"672"},
415
+ "855":{"name":"交通设施","parent":"554"},
416
+ "856":{"name":"电子器件","parent":"419"},
417
+ "857":{"name":"啤酒","parent":"594"},
418
+ "858":{"name":"生态旅游","parent":"657"},
419
+ "859":{"name":"自动化设备","parent":"626"},
420
+ "860":{"name":"软件开发","parent":"414"},
421
+ "861":{"name":"葡萄酒销售","parent":"594"},
422
+ "862":{"name":"钢材","parent":"633"},
423
+ "863":{"name":"餐饮培训","parent":"656"},
424
+ "864":{"name":"速冻食品","parent":"593"},
425
+ "865":{"name":"空气环保","parent":"672"},
426
+ "866":{"name":"互联网房地产经纪服务","parent":"550"},
427
+ "867":{"name":"食品添加剂","parent":"593"},
428
+ "868":{"name":"演艺传播","parent":"585"},
429
+ "869":{"name":"信用卡","parent":"537"},
430
+ "870":{"name":"报纸期刊广告","parent":"579"},
431
+ "871":{"name":"摄影","parent":"525"},
432
+ "872":{"name":"手机软件","parent":"414"},
433
+ "873":{"name":"地坪建材","parent":"627"},
434
+ "874":{"name":"企业管理咨询","parent":"567"},
435
+ "875":{"name":"幼儿教育","parent":"570"},
436
+ "876":{"name":"系统集成","parent":"416"},
437
+ "877":{"name":"皮革服饰","parent":"597"},
438
+ "878":{"name":"保健食品","parent":"593"},
439
+ "879":{"name":"叉车","parent":"620"},
440
+ "880":{"name":"厨卫电器","parent":"601"},
441
+ "882":{"name":"地暖设备","parent":"627"},
442
+ "883":{"name":"钢结构制造","parent":"548"},
443
+ "884":{"name":"投影机","parent":"606"},
444
+ "885":{"name":"啤酒销售","parent":"594"},
445
+ "886":{"name":"度假村旅游","parent":"657"},
446
+ "887":{"name":"电力元件设备","parent":"626"},
447
+ "888":{"name":"管理软件","parent":"414"},
448
+ "889":{"name":"轴承","parent":"628"},
449
+ "890":{"name":"餐饮设备","parent":"656"},
450
+ "891":{"name":"肉制品及副产品加工","parent":"593"},
451
+ "892":{"name":"艺术收藏品投资交易","parent":"584"},
452
+ "893":{"name":"净水器","parent":"601"},
453
+ "894":{"name":"进口食品","parent":"593"},
454
+ "895":{"name":"娱乐文化传播","parent":"585"},
455
+ "896":{"name":"文化传播","parent":"585"},
456
+ "897":{"name":"商旅传媒","parent":"580"},
457
+ "898":{"name":"广告设计制作","parent":"579"},
458
+ "899":{"name":"金属丝绳及其制品制造","parent":"627"},
459
+ "900":{"name":"建筑涂料","parent":"627"},
460
+ "901":{"name":"抵押贷款","parent":"543"},
461
+ "902":{"name":"早教","parent":"570"},
462
+ "903":{"name":"电影放映","parent":"583"},
463
+ "904":{"name":"内衣服饰","parent":"597"},
464
+ "905":{"name":"无线网络通信","parent":"418"},
465
+ "906":{"name":"记忆卡","parent":"415"},
466
+ "907":{"name":"女装服饰","parent":"597"},
467
+ "908":{"name":"建筑机械","parent":"620"},
468
+ "909":{"name":"制冷电器","parent":"601"},
469
+ "910":{"name":"通信设备","parent":"417"},
470
+ "911":{"name":"空调设备","parent":"601"},
471
+ "912":{"name":"建筑装饰","parent":"553"},
472
+ "913":{"name":"办公设备","parent":"603"},
473
+ "916":{"name":"数据处理软件","parent":"414"},
474
+ "917":{"name":"葡萄酒贸易","parent":"594"},
475
+ "918":{"name":"通讯器材","parent":"417"},
476
+ "919":{"name":"铜业","parent":"633"},
477
+ "920":{"name":"食堂","parent":"656"},
478
+ "921":{"name":"糖果零食","parent":"593"},
479
+ "922":{"name":"文化艺术传播","parent":"584"},
480
+ "923":{"name":"太阳能电器","parent":"601"},
481
+ "924":{"name":"药品零售","parent":"645"},
482
+ "925":{"name":"果蔬食品","parent":"593"},
483
+ "926":{"name":"文化活动策划","parent":"585"},
484
+ "928":{"name":"汽车广告","parent":"657"},
485
+ "929":{"name":"条码设备","parent":"630"},
486
+ "930":{"name":"建筑石材","parent":"627"},
487
+ "931":{"name":"贵金属","parent":"545"},
488
+ "932":{"name":"体育","parent":"660"},
489
+ "933":{"name":"金融信息服务","parent":"414"},
490
+ "934":{"name":"玻璃建材","parent":"627"},
491
+ "935":{"name":"家教","parent":"569"},
492
+ "936":{"name":"歌舞厅娱乐活动","parent":"586"},
493
+ "937":{"name":"计算机服务器","parent":"415"},
494
+ "938":{"name":"管道","parent":"627"},
495
+ "939":{"name":"婴幼儿服饰","parent":"597"},
496
+ "940":{"name":"热水器","parent":"601"},
497
+ "941":{"name":"计算机及零部件制造","parent":"415"},
498
+ "942":{"name":"钢铁贸易","parent":"633"},
499
+ "944":{"name":"包装材料","parent":"632"},
500
+ "945":{"name":"计算机办公设备","parent":"603"},
501
+ "946":{"name":"白酒","parent":"594"},
502
+ "948":{"name":"发动机","parent":"620"},
503
+ "949":{"name":"快餐服务","parent":"656"},
504
+ "950":{"name":"酒类销售","parent":"594"},
505
+ "951":{"name":"电子产品、机电设备","parent":"626"},
506
+ "952":{"name":"激光设备","parent":"626"},
507
+ "953":{"name":"餐饮策划","parent":"656"},
508
+ "954":{"name":"饮料、食品","parent":"594"},
509
+ "955":{"name":"文化娱乐经纪","parent":"585"},
510
+ "956":{"name":"天然气","parent":"665"},
511
+ "957":{"name":"农副食品","parent":"593"},
512
+ "958":{"name":"艺术表演","parent":"585"},
513
+ "959":{"name":"石膏、水泥制品及类似制品制造","parent":"627"},
514
+ "960":{"name":"橱柜","parent":"602"},
515
+ "961":{"name":"管理培训","parent":"577"},
516
+ "962":{"name":"男装服饰","parent":"597"},
517
+ "963":{"name":"化肥制造","parent":"675"},
518
+ "964":{"name":"童装服饰","parent":"597"},
519
+ "965":{"name":"电源电池","parent":"626"},
520
+ "966":{"name":"家电维修","parent":"664"},
521
+ "967":{"name":"光电子器件","parent":"419"},
522
+ "968":{"name":"旅行社服务","parent":"657"},
523
+ "969":{"name":"电线、电缆制造","parent":"626"},
524
+ "970":{"name":"软件开发、信息系统集成","parent":"419"},
525
+ "971":{"name":"白酒制造","parent":"594"},
526
+ "973":{"name":"甜品服务","parent":"656"},
527
+ "974":{"name":"糕点、面包制造","parent":"593"},
528
+ "975":{"name":"木工机械","parent":"620"},
529
+ "976":{"name":"酒吧服务","parent":"656"},
530
+ "977":{"name":"火腿肠","parent":"593"},
531
+ "978":{"name":"广告策划推广","parent":"579"},
532
+ "979":{"name":"新能源产品和生产装备制造","parent":"667"},
533
+ "980":{"name":"调味品","parent":"593"},
534
+ "981":{"name":"礼仪表演","parent":"585"},
535
+ "982":{"name":"劳务派遣","parent":"560"},
536
+ "983":{"name":"建材零售","parent":"627"},
537
+ "984":{"name":"商品交易中心","parent":"545"},
538
+ "985":{"name":"体育推广","parent":"585"},
539
+ "986":{"name":"茶饮料及其他饮料制造","parent":"594"},
540
+ "987":{"name":"金属建材","parent":"627"},
541
+ "988":{"name":"职业技能培训","parent":"571"},
542
+ "989":{"name":"网吧活动","parent":"586"},
543
+ "990":{"name":"洗衣服务","parent":"658"},
544
+ "991":{"name":"管道工程","parent":"554"},
545
+ "992":{"name":"通信工程","parent":"417"},
546
+ "993":{"name":"电子元器件","parent":"626"},
547
+ "994":{"name":"电子设备","parent":"419"},
548
+ "995":{"name":"茶馆服务","parent":"656"},
549
+ "996":{"name":"旅游开发","parent":"657"},
550
+ "997":{"name":"视频通讯","parent":"417"},
551
+ "998":{"name":"白酒销售","parent":"594"},
552
+ "1000":{"name":"咖啡馆服务","parent":"656"},
553
+ "1001":{"name":"食品零售","parent":"593"},
554
+ "1002":{"name":"健康疗养旅游","parent":"655"},
555
+ "1003":{"name":"粮油食品","parent":"593"},
556
+ "1004":{"name":"儿童教育影视","parent":"583"},
557
+ "1005":{"name":"新能源发电","parent":"667"},
558
+ "1006":{"name":"旅游策划","parent":"657"},
559
+ "1007":{"name":"绘画","parent":"575"},
560
+ "1008":{"name":"方便面及其他方便食品","parent":"593"},
561
+ "1009":{"name":"房地产经纪","parent":"550"},
562
+ "1010":{"name":"母婴家政","parent":"661"},
563
+ "1011":{"name":"居家养老健康服务","parent":"661"},
564
+ "1012":{"name":"文化艺术投资","parent":"545"},
565
+ "1013":{"name":"运动健身","parent":"660"},
566
+ "1014":{"name":"瓶(罐)装饮用水制造","parent":"594"},
567
+ "1015":{"name":"金属门窗","parent":"627"},
568
+ "1016":{"name":"机动车检测","parent":"563"},
569
+ "1017":{"name":"货物运输","parent":"634"},
570
+ "1018":{"name":"服饰专卖","parent":"690"},
571
+ "1019":{"name":"酒店服装","parent":"597"},
572
+ "1020":{"name":"通讯软件","parent":"417"},
573
+ "1021":{"name":"消防工程","parent":"554"},
574
+ "1022":{"name":"嵌入式电子系统","parent":"419"},
575
+ "1023":{"name":"航空票务","parent":"636"},
576
+ "1024":{"name":"电气设备","parent":"626"},
577
+ "1025":{"name":"酒业贸易","parent":"594"},
578
+ "1027":{"name":"其他饮料及冷饮服务","parent":"656"},
579
+ "1028":{"name":"乳制品","parent":"593"},
580
+ "1029":{"name":"新闻期刊出版","parent":"588"},
581
+ "1030":{"name":"水污染治理","parent":"672"},
582
+ "1031":{"name":"谷物食品","parent":"593"},
583
+ "1032":{"name":"数字动漫设计制造服务","parent":"590"},
584
+ "1033":{"name":"医院","parent":"646"},
585
+ "1034":{"name":"旅游广告","parent":"657"},
586
+ "1035":{"name":"办公家具","parent":"602"},
587
+ "1036":{"name":"房地产营销策划","parent":"550"},
588
+ "1037":{"name":"保洁家政","parent":"661"},
589
+ "1038":{"name":"水泥制造","parent":"627"},
590
+ "1039":{"name":"市场研究咨询","parent":"567"},
591
+ "1040":{"name":"驾校","parent":"571"},
592
+ "1041":{"name":"正餐服务","parent":"656"},
593
+ "1043":{"name":"机动车燃油","parent":"665"},
594
+ "1044":{"name":"食品","parent":"593"},
595
+ "1045":{"name":"新能源汽车","parent":"629"},
596
+ "1046":{"name":"手机无线网络推广","parent":"417"},
597
+ "1047":{"name":"环保设备","parent":"672"},
598
+ "1048":{"name":"通讯工程","parent":"418"},
599
+ "1049":{"name":"半导体集成电路","parent":"419"},
600
+ "1050":{"name":"航空服务","parent":"636"},
601
+ "1051":{"name":"电机设备","parent":"626"},
602
+ "1052":{"name":"档案软件","parent":"414"},
603
+ "1053":{"name":"冷链物流服务","parent":"634"},
604
+ "1054":{"name":"小吃服务","parent":"656"},
605
+ "1055":{"name":"水产品加工","parent":"593"},
606
+ "1056":{"name":"图书出版","parent":"588"},
607
+ "1057":{"name":"固体废物治理","parent":"672"},
608
+ "1059":{"name":"坚果食品","parent":"593"},
609
+ "1060":{"name":"广告传媒","parent":"579"},
610
+ "1061":{"name":"电梯","parent":"622"},
611
+ "1062":{"name":"社区医疗与卫生院","parent":"646"},
612
+ "1063":{"name":"广告、印刷包装","parent":"630"},
613
+ "1064":{"name":"婚纱礼服","parent":"662"},
614
+ "1065":{"name":"地毯","parent":"602"},
615
+ "1066":{"name":"互联网物业","parent":"551"},
616
+ "1067":{"name":"跨境电商","parent":"3"},
617
+ "1068":{"name":"信息安全、系统集成","parent":"9"},
618
+ "1069":{"name":"专用汽车制造","parent":"750"},
619
+ "1070":{"name":"商品贸易","parent":"3"},
620
+ "1071":{"name":"墙壁装饰材料","parent":"746"},
621
+ "1072":{"name":"窗帘装饰材料","parent":"746"},
622
+ "1073":{"name":"电子商务、本地生活服务","parent":"3"},
623
+ "1075":{"name":"白酒电子商务","parent":"3"},
624
+ "1076":{"name":"商品贸易、电子商务","parent":"3"},
625
+ "1077":{"name":"木质装饰材料","parent":"746"},
626
+ "1078":{"name":"电子商务、汽车电商交易平台","parent":"3"},
627
+ "1079":{"name":"汽车轮胎","parent":"751"},
628
+ "1080":{"name":"气体压缩机械制造","parent":"732"},
629
+ "1081":{"name":"家装家具电子商务","parent":"3"},
630
+ "1082":{"name":"化妆品电子商务","parent":"3"},
631
+ "1083":{"name":"汽车销售","parent":"749"},
632
+ "1084":{"name":"新闻资讯网站","parent":"510"},
633
+ "1085":{"name":"母婴电商","parent":"3"},
634
+ "1086":{"name":"电商商务、收藏品交易","parent":"3"},
635
+ "1088":{"name":"电子商务、数码产品","parent":"3"},
636
+ "1089":{"name":"二手车交易","parent":"749"},
637
+ "1090":{"name":"游戏制作服务","parent":"5"},
638
+ "1091":{"name":"母婴服务","parent":"510"},
639
+ "1092":{"name":"家具电子商务","parent":"3"},
640
+ "1093":{"name":"汽车配件电子商务","parent":"3"},
641
+ "1094":{"name":"输配电设备","parent":"777"},
642
+ "1095":{"name":"矿山设备","parent":"727"},
643
+ "1096":{"name":"机床机械","parent":"726"},
644
+ "1097":{"name":"农产品电商","parent":"3"},
645
+ "1098":{"name":"陶瓷装饰材料","parent":"746"},
646
+ "1099":{"name":"车载联网设备","parent":"487"},
647
+ "1100":{"name":"汽车销售电子商务","parent":"3"},
648
+ "1101":{"name":"石油设备","parent":"730"},
649
+ "1102":{"name":"智能家居","parent":"487"},
650
+ "1103":{"name":"散热器","parent":"751"},
651
+ "1104":{"name":"电力工程","parent":"775"},
652
+ "1105":{"name":"生鲜电商","parent":"3"},
653
+ "1106":{"name":"互联网数据服务","parent":"490"},
654
+ "1107":{"name":"房车、商务车销售","parent":"749"},
655
+ "1108":{"name":"茶叶电子商务","parent":"3"},
656
+ "1109":{"name":"酒类电子商务","parent":"3"},
657
+ "1110":{"name":"阀门","parent":"730"},
658
+ "1111":{"name":"食品电商","parent":"3"},
659
+ "1112":{"name":"儿童摄影","parent":"871"},
660
+ "1113":{"name":"广告摄影","parent":"871"},
661
+ "1114":{"name":"婚纱摄影","parent":"871"},
662
+ "1115":{"name":"模具制造","parent":"620"},
663
+ "1116":{"name":"汽车模具","parent":"629"},
664
+ "1117":{"name":"认证咨询","parent":"567"},
665
+ "1118":{"name":"数字视觉制作服务","parent":"590"},
666
+ "1119":{"name":"牙科及医疗器械","parent":"646"},
667
+ "1120":{"name":"猎头招聘","parent":"560"},
668
+ "1121":{"name":"家居","parent":"518"},
669
+ "1122":{"name":"收藏品","parent":"518"},
670
+ "1123":{"name":"首饰","parent":"518"},
671
+ "1124":{"name":"工艺品","parent":"518"},
672
+ "1125":{"name":"财务","parent":"515"},
673
+ "1126":{"name":"税务","parent":"515"},
674
+ "1127":{"name":"分类信息","parent":"2"},
675
+ "1128":{"name":"宠物","parent":"0"},
676
+ "1129":{"name":"快消品","parent":"518"},
677
+ "1130":{"name":"人工智能","parent":"2"},
678
+ "1131":{"name":"农/林/牧/渔","parent":"0"}
679
+ }
680
+
681
def get_names(id):
    """Return the name of entry ``id`` followed by the names of its ancestors.

    The lookup starts at ``TBL[str(id)]`` and repeatedly follows each entry's
    ``"parent"`` key, collecting the ``"name"`` values nearest-first.

    Args:
        id: Key of the starting entry. It is converted with ``str`` before
            the lookup, so both ints and strings are accepted.

    Returns:
        A list of names ordered from the entry itself up to the last ancestor
        that exists in ``TBL``; ``[]`` when ``id`` is not in ``TBL``.
    """
    names = []
    visited = set()
    key = str(id)
    # Walk iteratively and remember visited keys so that a self-referencing or
    # cyclic parent chain terminates instead of raising RecursionError.
    while key not in visited:
        visited.add(key)
        entry = TBL.get(key)
        if not entry:
            break
        names.append(entry["name"])
        key = str(entry["parent"])
    return names
690
+
691
if __name__ == "__main__":
    # Smoke check when run as a script: print the name chain for one sample id.
    sample_chain = get_names("1119")
    print(sample_chain)
deepdoc/parser/resume/entities/regions.py ADDED
@@ -0,0 +1,762 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TBL = {
2
+ "2":{"name":"北京","parent":"1"},
3
+ "3":{"name":"天津","parent":"1"},
4
+ "4":{"name":"河北","parent":"1"},
5
+ "5":{"name":"山西","parent":"1"},
6
+ "6":{"name":"内蒙古","parent":"1"},
7
+ "7":{"name":"辽宁","parent":"1"},
8
+ "8":{"name":"吉林","parent":"1"},
9
+ "9":{"name":"黑龙江","parent":"1"},
10
+ "10":{"name":"上海","parent":"1"},
11
+ "11":{"name":"江苏","parent":"1"},
12
+ "12":{"name":"浙江","parent":"1"},
13
+ "13":{"name":"安徽","parent":"1"},
14
+ "14":{"name":"福建","parent":"1"},
15
+ "15":{"name":"江西","parent":"1"},
16
+ "16":{"name":"山东","parent":"1"},
17
+ "17":{"name":"河南","parent":"1"},
18
+ "18":{"name":"湖北","parent":"1"},
19
+ "19":{"name":"湖南","parent":"1"},
20
+ "20":{"name":"广东","parent":"1"},
21
+ "21":{"name":"广西","parent":"1"},
22
+ "22":{"name":"海南","parent":"1"},
23
+ "23":{"name":"重庆","parent":"1"},
24
+ "24":{"name":"四川","parent":"1"},
25
+ "25":{"name":"贵州","parent":"1"},
26
+ "26":{"name":"云南","parent":"1"},
27
+ "27":{"name":"西藏","parent":"1"},
28
+ "28":{"name":"陕西","parent":"1"},
29
+ "29":{"name":"甘肃","parent":"1"},
30
+ "30":{"name":"青海","parent":"1"},
31
+ "31":{"name":"宁夏","parent":"1"},
32
+ "32":{"name":"新疆","parent":"1"},
33
+ "33":{"name":"北京市","parent":"2"},
34
+ "34":{"name":"天津市","parent":"3"},
35
+ "35":{"name":"石家庄市","parent":"4"},
36
+ "36":{"name":"唐山市","parent":"4"},
37
+ "37":{"name":"秦皇岛市","parent":"4"},
38
+ "38":{"name":"邯郸市","parent":"4"},
39
+ "39":{"name":"邢台市","parent":"4"},
40
+ "40":{"name":"保定市","parent":"4"},
41
+ "41":{"name":"张家口市","parent":"4"},
42
+ "42":{"name":"承德市","parent":"4"},
43
+ "43":{"name":"沧州市","parent":"4"},
44
+ "44":{"name":"廊坊市","parent":"4"},
45
+ "45":{"name":"衡水市","parent":"4"},
46
+ "46":{"name":"太原市","parent":"5"},
47
+ "47":{"name":"大同市","parent":"5"},
48
+ "48":{"name":"阳泉市","parent":"5"},
49
+ "49":{"name":"长治市","parent":"5"},
50
+ "50":{"name":"晋城市","parent":"5"},
51
+ "51":{"name":"朔州市","parent":"5"},
52
+ "52":{"name":"晋中市","parent":"5"},
53
+ "53":{"name":"运城市","parent":"5"},
54
+ "54":{"name":"忻州市","parent":"5"},
55
+ "55":{"name":"临汾市","parent":"5"},
56
+ "56":{"name":"吕梁市","parent":"5"},
57
+ "57":{"name":"呼和浩特市","parent":"6"},
58
+ "58":{"name":"包头市","parent":"6"},
59
+ "59":{"name":"乌海市","parent":"6"},
60
+ "60":{"name":"赤峰市","parent":"6"},
61
+ "61":{"name":"通辽市","parent":"6"},
62
+ "62":{"name":"鄂尔多斯市","parent":"6"},
63
+ "63":{"name":"呼伦贝尔市","parent":"6"},
64
+ "64":{"name":"巴彦淖尔市","parent":"6"},
65
+ "65":{"name":"乌兰察布市","parent":"6"},
66
+ "66":{"name":"兴安盟","parent":"6"},
67
+ "67":{"name":"锡林郭勒盟","parent":"6"},
68
+ "68":{"name":"阿拉善盟","parent":"6"},
69
+ "69":{"name":"沈阳市","parent":"7"},
70
+ "70":{"name":"大连市","parent":"7"},
71
+ "71":{"name":"鞍山市","parent":"7"},
72
+ "72":{"name":"抚顺市","parent":"7"},
73
+ "73":{"name":"本溪市","parent":"7"},
74
+ "74":{"name":"丹东市","parent":"7"},
75
+ "75":{"name":"锦州市","parent":"7"},
76
+ "76":{"name":"营口市","parent":"7"},
77
+ "77":{"name":"阜新市","parent":"7"},
78
+ "78":{"name":"辽阳市","parent":"7"},
79
+ "79":{"name":"盘锦市","parent":"7"},
80
+ "80":{"name":"铁岭市","parent":"7"},
81
+ "81":{"name":"朝阳市","parent":"7"},
82
+ "82":{"name":"葫芦岛市","parent":"7"},
83
+ "83":{"name":"长春市","parent":"8"},
84
+ "84":{"name":"吉林市","parent":"8"},
85
+ "85":{"name":"四平市","parent":"8"},
86
+ "86":{"name":"辽源市","parent":"8"},
87
+ "87":{"name":"通化市","parent":"8"},
88
+ "88":{"name":"白山市","parent":"8"},
89
+ "89":{"name":"松原市","parent":"8"},
90
+ "90":{"name":"白城市","parent":"8"},
91
+ "91":{"name":"延边朝鲜族自治州","parent":"8"},
92
+ "92":{"name":"哈尔滨市","parent":"9"},
93
+ "93":{"name":"齐齐哈尔市","parent":"9"},
94
+ "94":{"name":"鸡西市","parent":"9"},
95
+ "95":{"name":"鹤岗市","parent":"9"},
96
+ "96":{"name":"双鸭山市","parent":"9"},
97
+ "97":{"name":"大庆市","parent":"9"},
98
+ "98":{"name":"伊春市","parent":"9"},
99
+ "99":{"name":"佳木斯市","parent":"9"},
100
+ "100":{"name":"七台河市","parent":"9"},
101
+ "101":{"name":"牡丹江市","parent":"9"},
102
+ "102":{"name":"黑河市","parent":"9"},
103
+ "103":{"name":"绥化市","parent":"9"},
104
+ "104":{"name":"大兴安岭地区","parent":"9"},
105
+ "105":{"name":"上海市","parent":"10"},
106
+ "106":{"name":"南京市","parent":"11"},
107
+ "107":{"name":"无锡市","parent":"11"},
108
+ "108":{"name":"徐州市","parent":"11"},
109
+ "109":{"name":"常州市","parent":"11"},
110
+ "110":{"name":"苏州市","parent":"11"},
111
+ "111":{"name":"南通市","parent":"11"},
112
+ "112":{"name":"连云港市","parent":"11"},
113
+ "113":{"name":"淮安市","parent":"11"},
114
+ "114":{"name":"盐城市","parent":"11"},
115
+ "115":{"name":"扬州市","parent":"11"},
116
+ "116":{"name":"镇江市","parent":"11"},
117
+ "117":{"name":"泰州市","parent":"11"},
118
+ "118":{"name":"宿迁市","parent":"11"},
119
+ "119":{"name":"杭州市","parent":"12"},
120
+ "120":{"name":"宁波市","parent":"12"},
121
+ "121":{"name":"温州市","parent":"12"},
122
+ "122":{"name":"嘉兴市","parent":"12"},
123
+ "123":{"name":"湖州市","parent":"12"},
124
+ "124":{"name":"绍兴市","parent":"12"},
125
+ "125":{"name":"金华市","parent":"12"},
126
+ "126":{"name":"衢州市","parent":"12"},
127
+ "127":{"name":"舟山市","parent":"12"},
128
+ "128":{"name":"台州市","parent":"12"},
129
+ "129":{"name":"丽水市","parent":"12"},
130
+ "130":{"name":"合肥市","parent":"13"},
131
+ "131":{"name":"芜湖市","parent":"13"},
132
+ "132":{"name":"蚌埠市","parent":"13"},
133
+ "133":{"name":"淮南市","parent":"13"},
134
+ "134":{"name":"马鞍山市","parent":"13"},
135
+ "135":{"name":"淮北市","parent":"13"},
136
+ "136":{"name":"铜陵市","parent":"13"},
137
+ "137":{"name":"安庆市","parent":"13"},
138
+ "138":{"name":"黄山市","parent":"13"},
139
+ "139":{"name":"滁州市","parent":"13"},
140
+ "140":{"name":"阜阳市","parent":"13"},
141
+ "141":{"name":"宿州市","parent":"13"},
142
+ "143":{"name":"六安市","parent":"13"},
143
+ "144":{"name":"亳州市","parent":"13"},
144
+ "145":{"name":"池州市","parent":"13"},
145
+ "146":{"name":"宣城市","parent":"13"},
146
+ "147":{"name":"福州市","parent":"14"},
147
+ "148":{"name":"厦门市","parent":"14"},
148
+ "149":{"name":"莆田市","parent":"14"},
149
+ "150":{"name":"三明市","parent":"14"},
150
+ "151":{"name":"泉州市","parent":"14"},
151
+ "152":{"name":"漳州市","parent":"14"},
152
+ "153":{"name":"南平市","parent":"14"},
153
+ "154":{"name":"龙岩市","parent":"14"},
154
+ "155":{"name":"宁德市","parent":"14"},
155
+ "156":{"name":"南昌市","parent":"15"},
156
+ "157":{"name":"景德镇市","parent":"15"},
157
+ "158":{"name":"萍乡市","parent":"15"},
158
+ "159":{"name":"九江市","parent":"15"},
159
+ "160":{"name":"新余市","parent":"15"},
160
+ "161":{"name":"鹰潭市","parent":"15"},
161
+ "162":{"name":"赣州市","parent":"15"},
162
+ "163":{"name":"吉安市","parent":"15"},
163
+ "164":{"name":"宜春市","parent":"15"},
164
+ "165":{"name":"抚州市","parent":"15"},
165
+ "166":{"name":"上饶市","parent":"15"},
166
+ "167":{"name":"济南市","parent":"16"},
167
+ "168":{"name":"青岛市","parent":"16"},
168
+ "169":{"name":"淄博市","parent":"16"},
169
+ "170":{"name":"枣庄市","parent":"16"},
170
+ "171":{"name":"东营市","parent":"16"},
171
+ "172":{"name":"烟台市","parent":"16"},
172
+ "173":{"name":"潍坊市","parent":"16"},
173
+ "174":{"name":"济宁市","parent":"16"},
174
+ "175":{"name":"泰安市","parent":"16"},
175
+ "176":{"name":"威海市","parent":"16"},
176
+ "177":{"name":"日照市","parent":"16"},
177
+ "179":{"name":"临沂市","parent":"16"},
178
+ "180":{"name":"德州市","parent":"16"},
179
+ "181":{"name":"聊城市","parent":"16"},
180
+ "182":{"name":"滨州市","parent":"16"},
181
+ "183":{"name":"菏泽市","parent":"16"},
182
+ "184":{"name":"郑州市","parent":"17"},
183
+ "185":{"name":"开封市","parent":"17"},
184
+ "186":{"name":"洛阳市","parent":"17"},
185
+ "187":{"name":"平顶山市","parent":"17"},
186
+ "188":{"name":"安阳市","parent":"17"},
187
+ "189":{"name":"鹤壁市","parent":"17"},
188
+ "190":{"name":"新乡市","parent":"17"},
189
+ "191":{"name":"焦作市","parent":"17"},
190
+ "192":{"name":"濮阳市","parent":"17"},
191
+ "193":{"name":"许昌市","parent":"17"},
192
+ "194":{"name":"漯河市","parent":"17"},
193
+ "195":{"name":"三门峡市","parent":"17"},
194
+ "196":{"name":"南阳市","parent":"17"},
195
+ "197":{"name":"商丘市","parent":"17"},
196
+ "198":{"name":"信阳市","parent":"17"},
197
+ "199":{"name":"周口市","parent":"17"},
198
+ "200":{"name":"驻马店市","parent":"17"},
199
+ "201":{"name":"武汉市","parent":"18"},
200
+ "202":{"name":"黄石市","parent":"18"},
201
+ "203":{"name":"十堰市","parent":"18"},
202
+ "204":{"name":"宜昌市","parent":"18"},
203
+ "205":{"name":"襄阳市","parent":"18"},
204
+ "206":{"name":"鄂州市","parent":"18"},
205
+ "207":{"name":"荆门市","parent":"18"},
206
+ "208":{"name":"孝感市","parent":"18"},
207
+ "209":{"name":"荆州市","parent":"18"},
208
+ "210":{"name":"黄冈市","parent":"18"},
209
+ "211":{"name":"咸宁市","parent":"18"},
210
+ "212":{"name":"随州市","parent":"18"},
211
+ "213":{"name":"恩施土家族苗族自治州","parent":"18"},
212
+ "215":{"name":"长沙市","parent":"19"},
213
+ "216":{"name":"株洲市","parent":"19"},
214
+ "217":{"name":"湘潭市","parent":"19"},
215
+ "218":{"name":"衡阳市","parent":"19"},
216
+ "219":{"name":"邵阳市","parent":"19"},
217
+ "220":{"name":"岳阳市","parent":"19"},
218
+ "221":{"name":"常德市","parent":"19"},
219
+ "222":{"name":"张家界市","parent":"19"},
220
+ "223":{"name":"益阳市","parent":"19"},
221
+ "224":{"name":"郴州市","parent":"19"},
222
+ "225":{"name":"永州市","parent":"19"},
223
+ "226":{"name":"怀化市","parent":"19"},
224
+ "227":{"name":"娄底市","parent":"19"},
225
+ "228":{"name":"湘西土家族苗族自治州","parent":"19"},
226
+ "229":{"name":"广州市","parent":"20"},
227
+ "230":{"name":"韶关市","parent":"20"},
228
+ "231":{"name":"深圳市","parent":"20"},
229
+ "232":{"name":"珠海市","parent":"20"},
230
+ "233":{"name":"汕头市","parent":"20"},
231
+ "234":{"name":"佛山市","parent":"20"},
232
+ "235":{"name":"江门市","parent":"20"},
233
+ "236":{"name":"湛江市","parent":"20"},
234
+ "237":{"name":"茂名市","parent":"20"},
235
+ "238":{"name":"肇庆市","parent":"20"},
236
+ "239":{"name":"惠州市","parent":"20"},
237
+ "240":{"name":"梅州市","parent":"20"},
238
+ "241":{"name":"汕尾市","parent":"20"},
239
+ "242":{"name":"河源市","parent":"20"},
240
+ "243":{"name":"阳江市","parent":"20"},
241
+ "244":{"name":"清远市","parent":"20"},
242
+ "245":{"name":"东莞市","parent":"20"},
243
+ "246":{"name":"中山市","parent":"20"},
244
+ "247":{"name":"潮州市","parent":"20"},
245
+ "248":{"name":"揭阳市","parent":"20"},
246
+ "249":{"name":"云浮市","parent":"20"},
247
+ "250":{"name":"南宁市","parent":"21"},
248
+ "251":{"name":"柳州市","parent":"21"},
249
+ "252":{"name":"桂林市","parent":"21"},
250
+ "253":{"name":"梧州市","parent":"21"},
251
+ "254":{"name":"北海市","parent":"21"},
252
+ "255":{"name":"防城港市","parent":"21"},
253
+ "256":{"name":"钦州市","parent":"21"},
254
+ "257":{"name":"贵港市","parent":"21"},
255
+ "258":{"name":"玉林市","parent":"21"},
256
+ "259":{"name":"百色市","parent":"21"},
257
+ "260":{"name":"贺州市","parent":"21"},
258
+ "261":{"name":"河池市","parent":"21"},
259
+ "262":{"name":"来宾市","parent":"21"},
260
+ "263":{"name":"崇左市","parent":"21"},
261
+ "264":{"name":"海口市","parent":"22"},
262
+ "265":{"name":"三亚市","parent":"22"},
263
+ "267":{"name":"重庆市","parent":"23"},
264
+ "268":{"name":"成都市","parent":"24"},
265
+ "269":{"name":"自贡市","parent":"24"},
266
+ "270":{"name":"攀枝花市","parent":"24"},
267
+ "271":{"name":"泸州市","parent":"24"},
268
+ "272":{"name":"德阳市","parent":"24"},
269
+ "273":{"name":"绵阳市","parent":"24"},
270
+ "274":{"name":"广元市","parent":"24"},
271
+ "275":{"name":"遂宁市","parent":"24"},
272
+ "276":{"name":"内江市","parent":"24"},
273
+ "277":{"name":"乐山市","parent":"24"},
274
+ "278":{"name":"南充市","parent":"24"},
275
+ "279":{"name":"眉山市","parent":"24"},
276
+ "280":{"name":"宜宾市","parent":"24"},
277
+ "281":{"name":"广安市","parent":"24"},
278
+ "282":{"name":"达州市","parent":"24"},
279
+ "283":{"name":"雅安市","parent":"24"},
280
+ "284":{"name":"巴中市","parent":"24"},
281
+ "285":{"name":"资阳市","parent":"24"},
282
+ "286":{"name":"阿坝藏族羌族自治州","parent":"24"},
283
+ "287":{"name":"甘孜藏族自治州","parent":"24"},
284
+ "288":{"name":"凉山彝族自治州","parent":"24"},
285
+ "289":{"name":"贵阳市","parent":"25"},
286
+ "290":{"name":"六盘水市","parent":"25"},
287
+ "291":{"name":"遵义市","parent":"25"},
288
+ "292":{"name":"安顺市","parent":"25"},
289
+ "293":{"name":"铜仁市","parent":"25"},
290
+ "294":{"name":"黔西南布依族苗族自治州","parent":"25"},
291
+ "295":{"name":"毕节市","parent":"25"},
292
+ "296":{"name":"黔东南苗族侗族自治州","parent":"25"},
293
+ "297":{"name":"黔南布依族苗族自治州","parent":"25"},
294
+ "298":{"name":"昆明市","parent":"26"},
295
+ "299":{"name":"曲靖市","parent":"26"},
296
+ "300":{"name":"玉溪市","parent":"26"},
297
+ "301":{"name":"保山市","parent":"26"},
298
+ "302":{"name":"昭通市","parent":"26"},
299
+ "303":{"name":"丽江市","parent":"26"},
300
+ "304":{"name":"普洱市","parent":"26"},
301
+ "305":{"name":"临沧市","parent":"26"},
302
+ "306":{"name":"楚雄彝族自治州","parent":"26"},
303
+ "307":{"name":"红河哈尼族彝族自治州","parent":"26"},
304
+ "308":{"name":"文山壮族苗族自治州","parent":"26"},
305
+ "309":{"name":"西双版纳傣族自治州","parent":"26"},
306
+ "310":{"name":"大理白族自治州","parent":"26"},
307
+ "311":{"name":"德宏傣族景颇族自治州","parent":"26"},
308
+ "312":{"name":"怒江傈僳族自治州","parent":"26"},
309
+ "313":{"name":"迪庆藏族自治州","parent":"26"},
310
+ "314":{"name":"拉萨市","parent":"27"},
311
+ "315":{"name":"昌都市","parent":"27"},
312
+ "316":{"name":"山南市","parent":"27"},
313
+ "317":{"name":"日喀则市","parent":"27"},
314
+ "318":{"name":"那曲市","parent":"27"},
315
+ "319":{"name":"阿里地区","parent":"27"},
316
+ "320":{"name":"林芝市","parent":"27"},
317
+ "321":{"name":"西安市","parent":"28"},
318
+ "322":{"name":"铜川市","parent":"28"},
319
+ "323":{"name":"宝鸡市","parent":"28"},
320
+ "324":{"name":"咸阳市","parent":"28"},
321
+ "325":{"name":"渭南市","parent":"28"},
322
+ "326":{"name":"延安市","parent":"28"},
323
+ "327":{"name":"汉中市","parent":"28"},
324
+ "328":{"name":"榆林市","parent":"28"},
325
+ "329":{"name":"安康市","parent":"28"},
326
+ "330":{"name":"商洛市","parent":"28"},
327
+ "331":{"name":"兰州市","parent":"29"},
328
+ "332":{"name":"嘉峪关市","parent":"29"},
329
+ "333":{"name":"金昌市","parent":"29"},
330
+ "334":{"name":"白银市","parent":"29"},
331
+ "335":{"name":"天水市","parent":"29"},
332
+ "336":{"name":"武威市","parent":"29"},
333
+ "337":{"name":"张掖市","parent":"29"},
334
+ "338":{"name":"平凉市","parent":"29"},
335
+ "339":{"name":"酒泉市","parent":"29"},
336
+ "340":{"name":"庆阳市","parent":"29"},
337
+ "341":{"name":"定西市","parent":"29"},
338
+ "342":{"name":"陇南市","parent":"29"},
339
+ "343":{"name":"临夏回族自治州","parent":"29"},
340
+ "344":{"name":"甘南藏族自治州","parent":"29"},
341
+ "345":{"name":"西宁市","parent":"30"},
342
+ "346":{"name":"海东市","parent":"30"},
343
+ "347":{"name":"海北藏族自治州","parent":"30"},
344
+ "348":{"name":"黄南藏族自治州","parent":"30"},
345
+ "349":{"name":"海南藏族自治州","parent":"30"},
346
+ "350":{"name":"果洛藏族自治州","parent":"30"},
347
+ "351":{"name":"玉树藏族自治州","parent":"30"},
348
+ "352":{"name":"海西蒙古族藏族自治州","parent":"30"},
349
+ "353":{"name":"银川市","parent":"31"},
350
+ "354":{"name":"石嘴山市","parent":"31"},
351
+ "355":{"name":"吴忠市","parent":"31"},
352
+ "356":{"name":"固原市","parent":"31"},
353
+ "357":{"name":"中卫市","parent":"31"},
354
+ "358":{"name":"乌鲁木齐市","parent":"32"},
355
+ "359":{"name":"克拉玛依市","parent":"32"},
356
+ "360":{"name":"吐鲁番市","parent":"32"},
357
+ "361":{"name":"哈密市","parent":"32"},
358
+ "362":{"name":"昌吉回族自治州","parent":"32"},
359
+ "363":{"name":"博尔塔拉蒙古自治州","parent":"32"},
360
+ "364":{"name":"巴音郭楞蒙古自治州","parent":"32"},
361
+ "365":{"name":"阿克苏地区","parent":"32"},
362
+ "366":{"name":"克孜勒苏柯尔克孜自治州","parent":"32"},
363
+ "367":{"name":"喀什地区","parent":"32"},
364
+ "368":{"name":"和田地区","parent":"32"},
365
+ "369":{"name":"伊犁哈萨克自治州","parent":"32"},
366
+ "370":{"name":"塔城地区","parent":"32"},
367
+ "371":{"name":"阿勒泰地区","parent":"32"},
368
+ "372":{"name":"新疆省直辖行政单位","parent":"32"},
369
+ "373":{"name":"可克达拉市","parent":"32"},
370
+ "374":{"name":"昆玉市","parent":"32"},
371
+ "375":{"name":"胡杨河市","parent":"32"},
372
+ "376":{"name":"双河市","parent":"32"},
373
+ "3560":{"name":"北票市","parent":"7"},
374
+ "3615":{"name":"高州市","parent":"20"},
375
+ "3651":{"name":"济源市","parent":"17"},
376
+ "3662":{"name":"胶南市","parent":"16"},
377
+ "3683":{"name":"老河口市","parent":"18"},
378
+ "3758":{"name":"沙河市","parent":"4"},
379
+ "3822":{"name":"宜城市","parent":"18"},
380
+ "3842":{"name":"枣阳市","parent":"18"},
381
+ "3850":{"name":"肇东市","parent":"9"},
382
+ "3905":{"name":"澳门","parent":"1"},
383
+ "3906":{"name":"澳门","parent":"3905"},
384
+ "3907":{"name":"香港","parent":"1"},
385
+ "3908":{"name":"香港","parent":"3907"},
386
+ "3947":{"name":"仙桃市","parent":"18"},
387
+ "3954":{"name":"台湾","parent":"1"},
388
+ "3955":{"name":"台湾","parent":"3954"},
389
+ "3956":{"name":"海外","parent":"1"},
390
+ "3957":{"name":"海外","parent":"3956"},
391
+ "3958":{"name":"美国","parent":"3956"},
392
+ "3959":{"name":"加拿大","parent":"3956"},
393
+ "3961":{"name":"日本","parent":"3956"},
394
+ "3962":{"name":"韩国","parent":"3956"},
395
+ "3963":{"name":"德国","parent":"3956"},
396
+ "3964":{"name":"英国","parent":"3956"},
397
+ "3965":{"name":"意大利","parent":"3956"},
398
+ "3966":{"name":"西班牙","parent":"3956"},
399
+ "3967":{"name":"法国","parent":"3956"},
400
+ "3968":{"name":"澳大利亚","parent":"3956"},
401
+ "3969":{"name":"东城区","parent":"2"},
402
+ "3970":{"name":"西城区","parent":"2"},
403
+ "3971":{"name":"崇文区","parent":"2"},
404
+ "3972":{"name":"宣武区","parent":"2"},
405
+ "3973":{"name":"朝阳区","parent":"2"},
406
+ "3974":{"name":"海淀区","parent":"2"},
407
+ "3975":{"name":"丰台区","parent":"2"},
408
+ "3976":{"name":"石景山区","parent":"2"},
409
+ "3977":{"name":"门头沟区","parent":"2"},
410
+ "3978":{"name":"房山区","parent":"2"},
411
+ "3979":{"name":"通州区","parent":"2"},
412
+ "3980":{"name":"顺义区","parent":"2"},
413
+ "3981":{"name":"昌平区","parent":"2"},
414
+ "3982":{"name":"大兴区","parent":"2"},
415
+ "3983":{"name":"平谷区","parent":"2"},
416
+ "3984":{"name":"怀柔区","parent":"2"},
417
+ "3985":{"name":"密云区","parent":"2"},
418
+ "3986":{"name":"延庆区","parent":"2"},
419
+ "3987":{"name":"黄浦区","parent":"10"},
420
+ "3988":{"name":"徐汇区","parent":"10"},
421
+ "3989":{"name":"长宁区","parent":"10"},
422
+ "3990":{"name":"静安区","parent":"10"},
423
+ "3991":{"name":"普陀区","parent":"10"},
424
+ "3992":{"name":"闸北区","parent":"10"},
425
+ "3993":{"name":"虹口区","parent":"10"},
426
+ "3994":{"name":"杨浦区","parent":"10"},
427
+ "3995":{"name":"宝山区","parent":"10"},
428
+ "3996":{"name":"闵行区","parent":"10"},
429
+ "3997":{"name":"嘉定区","parent":"10"},
430
+ "3998":{"name":"浦东新区","parent":"10"},
431
+ "3999":{"name":"松江区","parent":"10"},
432
+ "4000":{"name":"金山区","parent":"10"},
433
+ "4001":{"name":"青浦区","parent":"10"},
434
+ "4002":{"name":"奉贤区","parent":"10"},
435
+ "4003":{"name":"崇明区","parent":"10"},
436
+ "4004":{"name":"和平区","parent":"3"},
437
+ "4005":{"name":"河东区","parent":"3"},
438
+ "4006":{"name":"河西区","parent":"3"},
439
+ "4007":{"name":"南开区","parent":"3"},
440
+ "4008":{"name":"红桥区","parent":"3"},
441
+ "4009":{"name":"河北区","parent":"3"},
442
+ "4010":{"name":"滨海新区","parent":"3"},
443
+ "4011":{"name":"东丽区","parent":"3"},
444
+ "4012":{"name":"西青区","parent":"3"},
445
+ "4013":{"name":"北辰区","parent":"3"},
446
+ "4014":{"name":"津南区","parent":"3"},
447
+ "4015":{"name":"武清区","parent":"3"},
448
+ "4016":{"name":"宝坻区","parent":"3"},
449
+ "4017":{"name":"静海区","parent":"3"},
450
+ "4018":{"name":"宁河区","parent":"3"},
451
+ "4019":{"name":"蓟州区","parent":"3"},
452
+ "4020":{"name":"渝中区","parent":"23"},
453
+ "4021":{"name":"江北区","parent":"23"},
454
+ "4022":{"name":"南岸区","parent":"23"},
455
+ "4023":{"name":"沙坪坝区","parent":"23"},
456
+ "4024":{"name":"九龙坡区","parent":"23"},
457
+ "4025":{"name":"大渡口区","parent":"23"},
458
+ "4026":{"name":"渝北区","parent":"23"},
459
+ "4027":{"name":"巴南区","parent":"23"},
460
+ "4028":{"name":"北碚区","parent":"23"},
461
+ "4029":{"name":"万州区","parent":"23"},
462
+ "4030":{"name":"黔江区","parent":"23"},
463
+ "4031":{"name":"永川区","parent":"23"},
464
+ "4032":{"name":"涪陵区","parent":"23"},
465
+ "4033":{"name":"江津区","parent":"23"},
466
+ "4034":{"name":"合川区","parent":"23"},
467
+ "4035":{"name":"双桥区","parent":"23"},
468
+ "4036":{"name":"万盛区","parent":"23"},
469
+ "4037":{"name":"荣昌区","parent":"23"},
470
+ "4038":{"name":"大足区","parent":"23"},
471
+ "4039":{"name":"璧山区","parent":"23"},
472
+ "4040":{"name":"铜梁区","parent":"23"},
473
+ "4041":{"name":"潼南区","parent":"23"},
474
+ "4042":{"name":"綦江区","parent":"23"},
475
+ "4043":{"name":"忠县","parent":"23"},
476
+ "4044":{"name":"开州区","parent":"23"},
477
+ "4045":{"name":"云阳县","parent":"23"},
478
+ "4046":{"name":"梁平区","parent":"23"},
479
+ "4047":{"name":"垫江县","parent":"23"},
480
+ "4048":{"name":"丰都县","parent":"23"},
481
+ "4049":{"name":"奉节县","parent":"23"},
482
+ "4050":{"name":"巫山县","parent":"23"},
483
+ "4051":{"name":"巫溪县","parent":"23"},
484
+ "4052":{"name":"城口县","parent":"23"},
485
+ "4053":{"name":"武隆区","parent":"23"},
486
+ "4054":{"name":"石柱土家族自治县","parent":"23"},
487
+ "4055":{"name":"秀山土家族苗族自治县","parent":"23"},
488
+ "4056":{"name":"酉阳土家族苗族自治县","parent":"23"},
489
+ "4057":{"name":"彭水苗族土家族自治县","parent":"23"},
490
+ "4058":{"name":"潜江市","parent":"18"},
491
+ "4059":{"name":"三沙市","parent":"22"},
492
+ "4060":{"name":"石河子市","parent":"32"},
493
+ "4061":{"name":"阿拉尔市","parent":"32"},
494
+ "4062":{"name":"图木舒克市","parent":"32"},
495
+ "4063":{"name":"五家渠市","parent":"32"},
496
+ "4064":{"name":"北屯市","parent":"32"},
497
+ "4065":{"name":"铁门关市","parent":"32"},
498
+ "4066":{"name":"儋州市","parent":"22"},
499
+ "4067":{"name":"五指山市","parent":"22"},
500
+ "4068":{"name":"文昌市","parent":"22"},
501
+ "4069":{"name":"琼海市","parent":"22"},
502
+ "4070":{"name":"万宁市","parent":"22"},
503
+ "4072":{"name":"定安县","parent":"22"},
504
+ "4073":{"name":"屯昌县","parent":"22"},
505
+ "4074":{"name":"澄迈县","parent":"22"},
506
+ "4075":{"name":"临高县","parent":"22"},
507
+ "4076":{"name":"琼中黎族苗族自治县","parent":"22"},
508
+ "4077":{"name":"保亭黎族苗族自治县","parent":"22"},
509
+ "4078":{"name":"白沙黎族自治县","parent":"22"},
510
+ "4079":{"name":"昌江黎族自治县","parent":"22"},
511
+ "4080":{"name":"乐东黎族自治县","parent":"22"},
512
+ "4081":{"name":"陵水黎族自治县","parent":"22"},
513
+ "4082":{"name":"马来西亚","parent":"3956"},
514
+ "6047":{"name":"长寿区","parent":"23"},
515
+ "6857":{"name":"阿富汗","parent":"3956"},
516
+ "6858":{"name":"阿尔巴尼亚","parent":"3956"},
517
+ "6859":{"name":"阿尔及利亚","parent":"3956"},
518
+ "6860":{"name":"美属萨摩亚","parent":"3956"},
519
+ "6861":{"name":"安道尔","parent":"3956"},
520
+ "6862":{"name":"安哥拉","parent":"3956"},
521
+ "6863":{"name":"安圭拉","parent":"3956"},
522
+ "6864":{"name":"南极洲","parent":"3956"},
523
+ "6865":{"name":"安提瓜和巴布达","parent":"3956"},
524
+ "6866":{"name":"阿根廷","parent":"3956"},
525
+ "6867":{"name":"亚美尼亚","parent":"3956"},
526
+ "6869":{"name":"奥地利","parent":"3956"},
527
+ "6870":{"name":"阿塞拜疆","parent":"3956"},
528
+ "6871":{"name":"巴哈马","parent":"3956"},
529
+ "6872":{"name":"巴林","parent":"3956"},
530
+ "6873":{"name":"孟加拉国","parent":"3956"},
531
+ "6874":{"name":"巴巴多斯","parent":"3956"},
532
+ "6875":{"name":"白俄罗斯","parent":"3956"},
533
+ "6876":{"name":"比利时","parent":"3956"},
534
+ "6877":{"name":"伯利兹","parent":"3956"},
535
+ "6878":{"name":"贝宁","parent":"3956"},
536
+ "6879":{"name":"百慕大","parent":"3956"},
537
+ "6880":{"name":"不丹","parent":"3956"},
538
+ "6881":{"name":"玻利维亚","parent":"3956"},
539
+ "6882":{"name":"波黑","parent":"3956"},
540
+ "6883":{"name":"博茨瓦纳","parent":"3956"},
541
+ "6884":{"name":"布维岛","parent":"3956"},
542
+ "6885":{"name":"巴西","parent":"3956"},
543
+ "6886":{"name":"英属印度洋领土","parent":"3956"},
544
+ "6887":{"name":"文莱","parent":"3956"},
545
+ "6888":{"name":"保加利亚","parent":"3956"},
546
+ "6889":{"name":"布基纳法索","parent":"3956"},
547
+ "6890":{"name":"布隆迪","parent":"3956"},
548
+ "6891":{"name":"柬埔寨","parent":"3956"},
549
+ "6892":{"name":"喀麦隆","parent":"3956"},
550
+ "6893":{"name":"佛得角","parent":"3956"},
551
+ "6894":{"name":"开曼群岛","parent":"3956"},
552
+ "6895":{"name":"中非","parent":"3956"},
553
+ "6896":{"name":"乍得","parent":"3956"},
554
+ "6897":{"name":"智利","parent":"3956"},
555
+ "6898":{"name":"圣诞岛","parent":"3956"},
556
+ "6899":{"name":"科科斯(基林)群岛","parent":"3956"},
557
+ "6900":{"name":"哥伦比亚","parent":"3956"},
558
+ "6901":{"name":"科摩罗","parent":"3956"},
559
+ "6902":{"name":"刚果(布)","parent":"3956"},
560
+ "6903":{"name":"刚果(金)","parent":"3956"},
561
+ "6904":{"name":"库克群岛","parent":"3956"},
562
+ "6905":{"name":"哥斯达黎加","parent":"3956"},
563
+ "6906":{"name":"科特迪瓦","parent":"3956"},
564
+ "6907":{"name":"克罗地亚","parent":"3956"},
565
+ "6908":{"name":"古巴","parent":"3956"},
566
+ "6909":{"name":"塞浦路斯","parent":"3956"},
567
+ "6910":{"name":"捷克","parent":"3956"},
568
+ "6911":{"name":"丹麦","parent":"3956"},
569
+ "6912":{"name":"吉布提","parent":"3956"},
570
+ "6913":{"name":"多米尼克","parent":"3956"},
571
+ "6914":{"name":"多米尼加共和国","parent":"3956"},
572
+ "6915":{"name":"东帝汶","parent":"3956"},
573
+ "6916":{"name":"厄瓜多尔","parent":"3956"},
574
+ "6917":{"name":"埃及","parent":"3956"},
575
+ "6918":{"name":"萨尔瓦多","parent":"3956"},
576
+ "6919":{"name":"赤道几内亚","parent":"3956"},
577
+ "6920":{"name":"厄立特里亚","parent":"3956"},
578
+ "6921":{"name":"爱沙尼亚","parent":"3956"},
579
+ "6922":{"name":"埃塞俄比亚","parent":"3956"},
580
+ "6923":{"name":"福克兰群岛(马尔维纳斯)","parent":"3956"},
581
+ "6924":{"name":"法罗群岛","parent":"3956"},
582
+ "6925":{"name":"斐济","parent":"3956"},
583
+ "6926":{"name":"芬兰","parent":"3956"},
584
+ "6927":{"name":"法属圭亚那","parent":"3956"},
585
+ "6928":{"name":"法属波利尼西亚","parent":"3956"},
586
+ "6929":{"name":"法属南部领土","parent":"3956"},
587
+ "6930":{"name":"加蓬","parent":"3956"},
588
+ "6931":{"name":"冈比亚","parent":"3956"},
589
+ "6932":{"name":"格鲁吉亚","parent":"3956"},
590
+ "6933":{"name":"加纳","parent":"3956"},
591
+ "6934":{"name":"直布罗陀","parent":"3956"},
592
+ "6935":{"name":"希腊","parent":"3956"},
593
+ "6936":{"name":"格陵兰","parent":"3956"},
594
+ "6937":{"name":"格林纳达","parent":"3956"},
595
+ "6938":{"name":"瓜德罗普","parent":"3956"},
596
+ "6939":{"name":"关岛","parent":"3956"},
597
+ "6940":{"name":"危地马拉","parent":"3956"},
598
+ "6941":{"name":"几内亚","parent":"3956"},
599
+ "6942":{"name":"几内亚比绍","parent":"3956"},
600
+ "6943":{"name":"圭亚那","parent":"3956"},
601
+ "6944":{"name":"海地","parent":"3956"},
602
+ "6945":{"name":"赫德岛和麦克唐纳岛","parent":"3956"},
603
+ "6946":{"name":"洪都拉斯","parent":"3956"},
604
+ "6947":{"name":"匈牙利","parent":"3956"},
605
+ "6948":{"name":"冰岛","parent":"3956"},
606
+ "6949":{"name":"印度","parent":"3956"},
607
+ "6950":{"name":"印度尼西亚","parent":"3956"},
608
+ "6951":{"name":"伊朗","parent":"3956"},
609
+ "6952":{"name":"伊拉克","parent":"3956"},
610
+ "6953":{"name":"爱尔兰","parent":"3956"},
611
+ "6954":{"name":"以色列","parent":"3956"},
612
+ "6955":{"name":"牙买加","parent":"3956"},
613
+ "6956":{"name":"约旦","parent":"3956"},
614
+ "6957":{"name":"哈萨克斯坦","parent":"3956"},
615
+ "6958":{"name":"肯尼亚","parent":"3956"},
616
+ "6959":{"name":"基里巴斯","parent":"3956"},
617
+ "6960":{"name":"朝鲜","parent":"3956"},
618
+ "6961":{"name":"科威特","parent":"3956"},
619
+ "6962":{"name":"吉尔吉斯斯坦","parent":"3956"},
620
+ "6963":{"name":"老挝","parent":"3956"},
621
+ "6964":{"name":"拉脱维亚","parent":"3956"},
622
+ "6965":{"name":"黎巴嫩","parent":"3956"},
623
+ "6966":{"name":"莱索托","parent":"3956"},
624
+ "6967":{"name":"利比里亚","parent":"3956"},
625
+ "6968":{"name":"利比亚","parent":"3956"},
626
+ "6969":{"name":"列支敦士登","parent":"3956"},
627
+ "6970":{"name":"立陶宛","parent":"3956"},
628
+ "6971":{"name":"卢森堡","parent":"3956"},
629
+ "6972":{"name":"前南马其顿","parent":"3956"},
630
+ "6973":{"name":"马达加斯加","parent":"3956"},
631
+ "6974":{"name":"马拉维","parent":"3956"},
632
+ "6975":{"name":"马尔代夫","parent":"3956"},
633
+ "6976":{"name":"马里","parent":"3956"},
634
+ "6977":{"name":"马耳他","parent":"3956"},
635
+ "6978":{"name":"马绍尔群岛","parent":"3956"},
636
+ "6979":{"name":"马提尼克","parent":"3956"},
637
+ "6980":{"name":"毛里塔尼亚","parent":"3956"},
638
+ "6981":{"name":"毛里求斯","parent":"3956"},
639
+ "6982":{"name":"马约特","parent":"3956"},
640
+ "6983":{"name":"墨西哥","parent":"3956"},
641
+ "6984":{"name":"密克罗尼西亚联邦","parent":"3956"},
642
+ "6985":{"name":"摩尔多瓦","parent":"3956"},
643
+ "6986":{"name":"摩纳哥","parent":"3956"},
644
+ "6987":{"name":"蒙古","parent":"3956"},
645
+ "6988":{"name":"蒙特塞拉特","parent":"3956"},
646
+ "6989":{"name":"摩洛哥","parent":"3956"},
647
+ "6990":{"name":"莫桑比克","parent":"3956"},
648
+ "6991":{"name":"缅甸","parent":"3956"},
649
+ "6992":{"name":"纳米比亚","parent":"3956"},
650
+ "6993":{"name":"瑙鲁","parent":"3956"},
651
+ "6994":{"name":"尼泊尔","parent":"3956"},
652
+ "6995":{"name":"荷兰","parent":"3956"},
653
+ "6996":{"name":"荷属安的列斯","parent":"3956"},
654
+ "6997":{"name":"新喀里多尼亚","parent":"3956"},
655
+ "6998":{"name":"新西兰","parent":"3956"},
656
+ "6999":{"name":"尼加拉瓜","parent":"3956"},
657
+ "7000":{"name":"尼日尔","parent":"3956"},
658
+ "7001":{"name":"尼日利亚","parent":"3956"},
659
+ "7002":{"name":"纽埃","parent":"3956"},
660
+ "7003":{"name":"诺福克岛","parent":"3956"},
661
+ "7004":{"name":"北马里亚纳","parent":"3956"},
662
+ "7005":{"name":"挪威","parent":"3956"},
663
+ "7006":{"name":"阿曼","parent":"3956"},
664
+ "7007":{"name":"巴基斯坦","parent":"3956"},
665
+ "7008":{"name":"帕劳","parent":"3956"},
666
+ "7009":{"name":"巴勒斯坦","parent":"3956"},
667
+ "7010":{"name":"巴拿马","parent":"3956"},
668
+ "7011":{"name":"巴布亚新几内亚","parent":"3956"},
669
+ "7012":{"name":"巴拉圭","parent":"3956"},
670
+ "7013":{"name":"秘鲁","parent":"3956"},
671
+ "7014":{"name":"菲律宾","parent":"3956"},
672
+ "7015":{"name":"皮特凯恩群岛","parent":"3956"},
673
+ "7016":{"name":"波兰","parent":"3956"},
674
+ "7017":{"name":"葡萄牙","parent":"3956"},
675
+ "7018":{"name":"波多黎各","parent":"3956"},
676
+ "7019":{"name":"卡塔尔","parent":"3956"},
677
+ "7020":{"name":"留尼汪","parent":"3956"},
678
+ "7021":{"name":"罗马尼亚","parent":"3956"},
679
+ "7022":{"name":"俄罗斯联邦","parent":"3956"},
680
+ "7023":{"name":"卢旺达","parent":"3956"},
681
+ "7024":{"name":"圣赫勒拿","parent":"3956"},
682
+ "7025":{"name":"圣基茨和尼维斯","parent":"3956"},
683
+ "7026":{"name":"圣卢西亚","parent":"3956"},
684
+ "7027":{"name":"圣皮埃尔和密克隆","parent":"3956"},
685
+ "7028":{"name":"圣文森特和格林纳丁斯","parent":"3956"},
686
+ "7029":{"name":"萨摩亚","parent":"3956"},
687
+ "7030":{"name":"圣马力诺","parent":"3956"},
688
+ "7031":{"name":"圣多美和普林西比","parent":"3956"},
689
+ "7032":{"name":"沙特阿拉伯","parent":"3956"},
690
+ "7033":{"name":"塞内加尔","parent":"3956"},
691
+ "7034":{"name":"塞舌尔","parent":"3956"},
692
+ "7035":{"name":"塞拉利昂","parent":"3956"},
693
+ "7036":{"name":"新加坡","parent":"3956"},
694
+ "7037":{"name":"斯洛伐克","parent":"3956"},
695
+ "7038":{"name":"斯洛文尼亚","parent":"3956"},
696
+ "7039":{"name":"所罗门群岛","parent":"3956"},
697
+ "7040":{"name":"索马里","parent":"3956"},
698
+ "7041":{"name":"南非","parent":"3956"},
699
+ "7042":{"name":"南乔治亚岛和南桑德韦奇岛","parent":"3956"},
700
+ "7043":{"name":"斯里兰卡","parent":"3956"},
701
+ "7044":{"name":"苏丹","parent":"3956"},
702
+ "7045":{"name":"苏里南","parent":"3956"},
703
+ "7046":{"name":"斯瓦尔巴群岛","parent":"3956"},
704
+ "7047":{"name":"斯威士兰","parent":"3956"},
705
+ "7048":{"name":"瑞典","parent":"3956"},
706
+ "7049":{"name":"瑞士","parent":"3956"},
707
+ "7050":{"name":"叙利亚","parent":"3956"},
708
+ "7051":{"name":"塔吉克斯坦","parent":"3956"},
709
+ "7052":{"name":"坦桑尼亚","parent":"3956"},
710
+ "7053":{"name":"泰国","parent":"3956"},
711
+ "7054":{"name":"多哥","parent":"3956"},
712
+ "7055":{"name":"托克劳","parent":"3956"},
713
+ "7056":{"name":"汤加","parent":"3956"},
714
+ "7057":{"name":"特立尼达和多巴哥","parent":"3956"},
715
+ "7058":{"name":"突尼斯","parent":"3956"},
716
+ "7059":{"name":"土耳其","parent":"3956"},
717
+ "7060":{"name":"土库曼斯坦","parent":"3956"},
718
+ "7061":{"name":"特克斯科斯群岛","parent":"3956"},
719
+ "7062":{"name":"图瓦卢","parent":"3956"},
720
+ "7063":{"name":"乌干达","parent":"3956"},
721
+ "7064":{"name":"乌克兰","parent":"3956"},
722
+ "7065":{"name":"阿联酋","parent":"3956"},
723
+ "7066":{"name":"美国本土外小岛屿","parent":"3956"},
724
+ "7067":{"name":"乌拉圭","parent":"3956"},
725
+ "7068":{"name":"乌兹别克斯坦","parent":"3956"},
726
+ "7069":{"name":"瓦努阿图","parent":"3956"},
727
+ "7070":{"name":"梵蒂冈","parent":"3956"},
728
+ "7071":{"name":"委内瑞拉","parent":"3956"},
729
+ "7072":{"name":"越南","parent":"3956"},
730
+ "7073":{"name":"英属维尔京群岛","parent":"3956"},
731
+ "7074":{"name":"美属维尔京群岛","parent":"3956"},
732
+ "7075":{"name":"瓦利斯和富图纳","parent":"3956"},
733
+ "7076":{"name":"西撒哈拉","parent":"3956"},
734
+ "7077":{"name":"也门","parent":"3956"},
735
+ "7078":{"name":"南斯拉夫","parent":"3956"},
736
+ "7079":{"name":"赞比亚","parent":"3956"},
737
+ "7080":{"name":"津巴布韦","parent":"3956"},
738
+ "7081":{"name":"塞尔维亚","parent":"3956"},
739
+ "7082":{"name":"雄安新区","parent":"4"},
740
+ "7084":{"name":"天门市","parent":"18"}
741
+ }
742
+
743
# Every region name present in TBL, kept as a set so isName() can do
# constant-time membership checks.
NM_SET = {entry["name"] for entry in TBL.values()}
744
+
745
def get_names(id):
    """Return the name of region ``id`` followed by the names of its ancestors.

    ``id`` may be an int or a string (the parameter name shadows the builtin
    but is kept because it is part of the public signature).

    * Falsy values and the literal string "none" (any case) yield ``[]``.
    * A value that is not purely numeric is treated as an already-resolved
      name and returned unchanged as ``[str(id)]``.
    * A numeric id is looked up in ``TBL``; its ``name`` is collected and the
      ``parent`` link is followed, repeating until an id is missing from the
      table or the chain ends.

    Returns:
        list[str]: names ordered from the requested region up to its root.
    """
    names = []
    visited = set()  # guards against a cyclic parent chain in the table
    current = id
    while current and str(current).lower() != "none":
        raw = str(current)
        key = raw.strip()
        if not re.fullmatch(r"[0-9]+", key):
            # Not an id: pass the text through untouched.
            names.append(raw)
            break
        if key in visited:
            break
        visited.add(key)
        # Look up the *stripped* key; validating the stripped form but
        # querying with the padded one made ids such as " 23 " always miss.
        entry = TBL.get(key)
        if not entry:
            break
        names.append(entry["name"])
        current = entry.get("parent")
    return names
756
+
757
+ import re
758
def isName(nm):
    """Return True if ``nm`` is a known region name, allowing short forms.

    A string matches when any of the following is in ``NM_SET``:

    * ``nm`` itself;
    * ``nm`` with "市" appended (the city suffix was omitted);
    * ``nm`` with a trailing "省", or a trailing "自治区" optionally preceded
      by 回族/壮族/维吾尔, removed.

    Non-string input (e.g. ``None``) is never a name and returns False
    instead of raising ``TypeError`` on the string concatenation below.
    """
    if not isinstance(nm, str):
        return False
    if nm in NM_SET or nm + "市" in NM_SET:
        return True
    # Drop the province / autonomous-region suffix and retry the lookup.
    base = re.sub(r"(省|(回族|壮族|维吾尔)*自治区)$", "", nm)
    return base in NM_SET
deepdoc/parser/resume/entities/res/corp.tks.freq.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "科技",
3
+ "集团",
4
+ "网络科技",
5
+ "技术",
6
+ "信息",
7
+ "分公司",
8
+ "信息技术",
9
+ "发展",
10
+ "科技股份",
11
+ "网络",
12
+ "贸易",
13
+ "商贸",
14
+ "工程",
15
+ "企业",
16
+ "集团股份",
17
+ "商务",
18
+ "工业",
19
+ "控股集团",
20
+ "国际贸易",
21
+ "软件技术",
22
+ "数码科技",
23
+ "软件开发",
24
+ "有限",
25
+ "经营",
26
+ "科技开发",
27
+ "股份公司",
28
+ "电子技术",
29
+ "实业集团",
30
+ "责任",
31
+ "无限",
32
+ "工程技术",
33
+ "上市公司",
34
+ "技术开发",
35
+ "软件系统",
36
+ "总公司",
37
+ "网络服务",
38
+ "ltd.",
39
+ "technology",
40
+ "company",
41
+ "服务公司",
42
+ "计算机技术",
43
+ "计算机软件",
44
+ "电子信息",
45
+ "corporation",
46
+ "计算机服务",
47
+ "计算机系统",
48
+ "有限公司",
49
+ "事业部",
50
+ "公司",
51
+ "股份",
52
+ "有限责任",
53
+ "软件",
54
+ "控股",
55
+ "高科技",
56
+ "房地产",
57
+ "事业群",
58
+ "部门",
59
+ "电子商务",
60
+ "人力资源顾问",
61
+ "人力资源",
62
+ "株式会社",
63
+ "网络营销"
64
+ ]
65
+
deepdoc/parser/resume/entities/res/corp_baike_len.csv ADDED
The diff for this file is too large to render. See raw diff
 
deepdoc/parser/resume/entities/res/corp_tag.json ADDED
The diff for this file is too large to render. See raw diff
 
deepdoc/parser/resume/entities/res/good_corp.json ADDED
@@ -0,0 +1,911 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "google assistant investments",
3
+ "amazon",
4
+ "dingtalk china information",
5
+ "zhejiang alibaba communication",
6
+ "yunos",
7
+ "腾讯云",
8
+ "新浪新闻",
9
+ "网邻通",
10
+ "蚂蚁集团",
11
+ "大疆",
12
+ "恒生股份",
13
+ "sf express",
14
+ "智者天下",
15
+ "shanghai hema network",
16
+ "papayamobile",
17
+ "lexinfintech",
18
+ "industrial consumer finance",
19
+ "360搜索",
20
+ "世纪光速",
21
+ "迅雷区块链",
22
+ "赛盒科技",
23
+ "齐力电子商务",
24
+ "平安养老险",
25
+ "平安证券",
26
+ "平安好贷",
27
+ "五八新服",
28
+ "呯嘭智能",
29
+ "阿里妈妈",
30
+ "mdt",
31
+ "tencent",
32
+ "weibo",
33
+ "浪潮软件",
34
+ "阿里巴巴广告",
35
+ "mashang consumer finance",
36
+ "维沃",
37
+ "hqg , limited",
38
+ "moodys",
39
+ "搜狐支付",
40
+ "百度秀",
41
+ "新浪服务",
42
+ "零售通",
43
+ "同城艺龙",
44
+ "虾米音乐",
45
+ "贝壳集团",
46
+ "小米有品",
47
+ "滴滴自动驾驶",
48
+ "图记",
49
+ "阿里影业",
50
+ "卓联软件",
51
+ "zhejiang tmall",
52
+ "谷歌中国",
53
+ "hithink flush",
54
+ "时装科技",
55
+ "程会玩国际旅行社",
56
+ "amazon china holding limited",
57
+ "中信消金",
58
+ "当当比特物流",
59
+ "新浪新媒体咨询",
60
+ "tongcheng network",
61
+ "金山在线",
62
+ "shopping cart",
63
+ "犀互动",
64
+ "五八",
65
+ "bilibili",
66
+ "阿里星球",
67
+ "滴滴金科服务",
68
+ "美团",
69
+ "哈啰出行",
70
+ "face",
71
+ "平安健康",
72
+ "招商银行",
73
+ "连亚",
74
+ "盒马网络",
75
+ "b站",
76
+ "华为机器",
77
+ "shanghai mdt infotech",
78
+ "ping an healthkonnect",
79
+ "beijing home link real estate broker",
80
+ "花海仓",
81
+ "beijing jingdong shangke information",
82
+ "微影智能",
83
+ "酷狗游戏",
84
+ "health.pingan.com",
85
+ "众安",
86
+ "陌陌",
87
+ "海康威视数字",
88
+ "同程网",
89
+ "艾丁金融",
90
+ "知乎",
91
+ " lu",
92
+ "国际商业机器公司",
93
+ "捷信消费金融",
94
+ "恒生利融",
95
+ "china merchants bank",
96
+ "企鹅电竞",
97
+ "捷信信驰",
98
+ "360智能家居",
99
+ "小桔车服",
100
+ "homecredit",
101
+ "皮皮虾",
102
+ "畅游",
103
+ "聚爱聊",
104
+ "suning.com",
105
+ "途牛旅游网",
106
+ "花呗",
107
+ "盈店通",
108
+ "sina",
109
+ "阿里巴巴音乐",
110
+ "华为技术有限公司",
111
+ "国付宝",
112
+ "shanghai lianshang network",
113
+ "oppo",
114
+ "华为投资控股",
115
+ "beijing sohu new media information",
116
+ "times square",
117
+ "菜鸟物流",
118
+ "lingxing",
119
+ "jd digits",
120
+ "同程旅游",
121
+ "分期乐",
122
+ "火锅视频",
123
+ "天天快报",
124
+ "猎豹移动",
125
+ "五八人力资源",
126
+ "宝宝树",
127
+ "顺丰科技",
128
+ "上海西翠",
129
+ "诗程文化传播",
130
+ "dewu",
131
+ "领星网络",
132
+ "aliexpress",
133
+ "贝塔通科技",
134
+ "链家",
135
+ "花小猪",
136
+ "趣输入",
137
+ "搜狐新媒体",
138
+ "一淘",
139
+ "56",
140
+ "qq阅读",
141
+ "青桔单车",
142
+ "iflytek",
143
+ "每日优鲜电子商务",
144
+ "腾讯觅影",
145
+ "微医",
146
+ "松果网",
147
+ "paypal",
148
+ "递瑞供应链管理",
149
+ "领星",
150
+ "qunar",
151
+ "三快",
152
+ "lu.com",
153
+ "携程旅行网",
154
+ "新潮传媒",
155
+ "链家经纪",
156
+ "景域文化",
157
+ "阿里健康",
158
+ "pingpeng",
159
+ "聚划算",
160
+ "零机科技",
161
+ "街兔电单车",
162
+ "快乐购",
163
+ "华为数字能源",
164
+ "搜狐",
165
+ "陆家嘴国际金融资产交易市场",
166
+ "nanjing tuniu",
167
+ "亚马逊",
168
+ "苏宁易购",
169
+ "携程旅游",
170
+ "苏宁金服",
171
+ "babytree",
172
+ "悟空问答",
173
+ "同花顺",
174
+ "eastmoney",
175
+ "浪潮信息",
176
+ "滴滴智慧交通",
177
+ "beijing ruixun lingtong",
178
+ "平安综合金融服务",
179
+ "爱奇艺",
180
+ "小米集团",
181
+ "华为云",
182
+ "微店",
183
+ "恒生集团",
184
+ "网易有道",
185
+ "boccfc",
186
+ "世纪思速科技",
187
+ "海康消防",
188
+ "beijing xiaomi",
189
+ "众安科技",
190
+ "五八同城",
191
+ "霆程汽车租赁",
192
+ "云卖分销",
193
+ "乐信集团",
194
+ "蚂蚁",
195
+ "舶乐蜜电子商务",
196
+ "支付宝中国",
197
+ "砖块消消消",
198
+ "vivo",
199
+ "阿里互娱",
200
+ "中国平安",
201
+ "lingxihudong",
202
+ "百度网盘",
203
+ "1号店",
204
+ "字节跳动",
205
+ "京东科技",
206
+ "驴妈妈兴旅国际旅行社",
207
+ "hangzhou alibaba music",
208
+ "xunlei",
209
+ "灵犀互动娱乐",
210
+ "快手",
211
+ "youtube",
212
+ "连尚慧眼",
213
+ "腾讯体育",
214
+ "爱商在线",
215
+ "酷我音乐",
216
+ "金融壹账通",
217
+ "搜狗服务",
218
+ "banma information",
219
+ "a站",
220
+ "罗汉堂",
221
+ "薇仕网络",
222
+ "搜狐新闻",
223
+ "贝宝",
224
+ "薇仕",
225
+ "口袋时尚科技",
226
+ "穆迪咨询",
227
+ "新狐投资管理",
228
+ "hikvision",
229
+ "alimama china holding limited",
230
+ "超聚变数字",
231
+ "腾讯视频",
232
+ "恒生电子",
233
+ "百度游戏",
234
+ "绿洲",
235
+ "木瓜移动",
236
+ "红袖添香",
237
+ "店匠科技",
238
+ "易贝",
239
+ "一淘网",
240
+ "博览群书",
241
+ "唯品会",
242
+ "lazglobal",
243
+ "amap",
244
+ "芒果网",
245
+ "口碑",
246
+ "海康慧影",
247
+ "腾讯音乐娱乐",
248
+ "网易严选",
249
+ "微信",
250
+ "shenzhen lexin holding",
251
+ "hangzhou pingpeng intelligent",
252
+ "连尚网络",
253
+ "海思",
254
+ "isunor",
255
+ "蝉翼",
256
+ "阿里游戏",
257
+ "广州优视",
258
+ "优视",
259
+ "腾讯征信",
260
+ "识装",
261
+ "finserve.pingan.com",
262
+ "papaya",
263
+ "阅文",
264
+ "平安健康保险",
265
+ "考拉海购",
266
+ "网易印象",
267
+ "wifi万能钥匙",
268
+ "新浪互联服务",
269
+ "亚马逊云科技",
270
+ "迅雷看看",
271
+ "华为朗新科技",
272
+ "adyen hong kong limited",
273
+ "谷歌",
274
+ "得物",
275
+ "网心",
276
+ "cainiao network",
277
+ "沐瞳",
278
+ "linkedin",
279
+ "hundsun",
280
+ "阿里旅行",
281
+ "珍爱网",
282
+ "阿里巴巴通信",
283
+ "金山奇剑",
284
+ "tongtool",
285
+ "华为安捷信电气",
286
+ "快乐时代",
287
+ "平安寿险",
288
+ "微博",
289
+ "微跳蚤",
290
+ "oppo移动通信",
291
+ "毒",
292
+ "alimama",
293
+ "shoplazza",
294
+ "shenzhen dianjiang science and",
295
+ "众鸣世科",
296
+ "平安金融",
297
+ "狐友",
298
+ "维沃移动通信",
299
+ "tobosoft",
300
+ "齐力电商",
301
+ "ali",
302
+ "诚信通",
303
+ "行吟",
304
+ "跳舞的线",
305
+ "橙心优选",
306
+ "众安健康",
307
+ "亚马逊中国投资",
308
+ "德絮投资管理中心合伙",
309
+ "招联消费金融",
310
+ "百度文学",
311
+ "芝麻信用",
312
+ "阿里零售通",
313
+ "时装",
314
+ "花样直播",
315
+ "sogou",
316
+ "uc",
317
+ "海思半导体",
318
+ "zhongan online p&c insurance",
319
+ "新浪数字",
320
+ "驴妈妈旅游网",
321
+ "华为数字能源技术",
322
+ "京东数科",
323
+ "oracle",
324
+ "xiaomi",
325
+ "nyse",
326
+ "阳光消费金融",
327
+ "天天动听",
328
+ "大众点评",
329
+ "上海瑞家",
330
+ "trustpass",
331
+ "hundsun technologies",
332
+ "美团小贷",
333
+ "ebay",
334
+ "通途",
335
+ "tcl",
336
+ "鸿蒙",
337
+ "酷狗计算机",
338
+ "品诺保险",
339
+ "capitalg",
340
+ "康盛创想",
341
+ "58同城",
342
+ "闲鱼",
343
+ "微软",
344
+ "吉易付科技",
345
+ "理财通",
346
+ "ctrip",
347
+ "yy",
348
+ "华为数字",
349
+ "kingsoft",
350
+ "孙宁金融",
351
+ "房江湖经纪",
352
+ "youku",
353
+ "ant financial services group",
354
+ "盒马",
355
+ "sensetime",
356
+ "伊千网络",
357
+ "小豹ai翻译棒",
358
+ "shopify",
359
+ "前海微众银行",
360
+ "qd",
361
+ "gmail",
362
+ "pingpong",
363
+ "alibaba group holding limited",
364
+ "捷信时空电子商务",
365
+ "orientsec",
366
+ "乔戈里管理咨询",
367
+ "ant",
368
+ "锐讯灵通",
369
+ "兴业消费金融",
370
+ "京东叁佰陆拾度电子商务",
371
+ "新浪",
372
+ "优酷土豆",
373
+ "海康机器人",
374
+ "美团单车",
375
+ "海康存储",
376
+ "领英",
377
+ "阿里全球速卖通",
378
+ "美菜网",
379
+ "京邦达",
380
+ "安居客",
381
+ "阿里体育",
382
+ "相互宝",
383
+ "cloudwalk",
384
+ "百度智能云",
385
+ "贝壳",
386
+ "酷狗",
387
+ "sunshine consumer finance",
388
+ "掌宜",
389
+ "奇酷网",
390
+ "核新同花顺",
391
+ "阿里巴巴影业",
392
+ "节创",
393
+ "学而思网校",
394
+ "速途",
395
+ "途牛",
396
+ "阿里云计算",
397
+ "beijing sensetime",
398
+ "alibaba cloud",
399
+ "西瓜视频",
400
+ "美团优选",
401
+ "orient securities limited",
402
+ "华为朗新",
403
+ "店匠",
404
+ "shanghai weishi network",
405
+ "友盟",
406
+ "飞猪旅行",
407
+ "滴滴出行",
408
+ "alipay",
409
+ "mogu",
410
+ "dangdang",
411
+ "大麦网",
412
+ "汉军智能系统",
413
+ "百度地图",
414
+ "货车帮",
415
+ "狐狸金服",
416
+ "众安在线保险经纪",
417
+ "华为通信",
418
+ "新浪支付",
419
+ "zhihu",
420
+ "alibaba cloud computing",
421
+ "沙发视频",
422
+ "金山软件",
423
+ "ping an good doctor",
424
+ "携程",
425
+ "脉脉",
426
+ "youku information beijing",
427
+ "zhongan",
428
+ "艾丁软件",
429
+ "乒乓智能",
430
+ "蘑菇街",
431
+ "taobao",
432
+ "华为技术服务",
433
+ "仕承文化传播",
434
+ "安捷信",
435
+ "狐狸互联网小额贷款",
436
+ "节点迅捷",
437
+ "中国银行",
438
+ "搜镇",
439
+ "众安在线",
440
+ "dingtalk",
441
+ "云从科技",
442
+ "beijing jingbangda trade",
443
+ "moody s",
444
+ "滚动的天空",
445
+ "yl.pingan.com",
446
+ "奇虎",
447
+ "alihealth",
448
+ "芒果tv",
449
+ "lufax",
450
+ "美团打车",
451
+ "小桔",
452
+ "贝壳找房网",
453
+ "小米科技",
454
+ "vips",
455
+ "kindle",
456
+ "亚马逊服务",
457
+ "citic consumer finance",
458
+ "微众",
459
+ "搜狗智慧互联网医院",
460
+ "盒马鲜生",
461
+ "life.pinan.com",
462
+ "ph.com.cn",
463
+ "银联",
464
+ "cmbchina",
465
+ "平安金融科技咨询",
466
+ "微保",
467
+ "甲骨文中国",
468
+ "飞书",
469
+ "koubei shanghai information",
470
+ "企鹅辅导",
471
+ "斑马",
472
+ "平安租赁",
473
+ "云从",
474
+ "马上消费",
475
+ "hangzhou ali baba advertising",
476
+ "金山",
477
+ "赛盒",
478
+ "科大讯飞",
479
+ "金星创业投资",
480
+ "平安国际融资租赁",
481
+ "360你财富",
482
+ "西山居",
483
+ "shenzhen qianhai fourth paradigm data",
484
+ "海思光电子",
485
+ "猎户星空",
486
+ "网易公司",
487
+ "浪潮",
488
+ "粒粒橙传媒",
489
+ "招联金融",
490
+ "100. me",
491
+ "捷信信驰咨询",
492
+ "唯品仓",
493
+ "orient",
494
+ "趣拿",
495
+ "摩拜单车",
496
+ "天猫精灵",
497
+ "菜鸟",
498
+ "豹小贩",
499
+ "去哪儿",
500
+ "米家",
501
+ "哈啰单车",
502
+ "搜狐体育",
503
+ "shopify payments usa",
504
+ "高德软件",
505
+ "讯联智付",
506
+ "乐信",
507
+ "唯你搭",
508
+ "第四范式",
509
+ "菜鸟网络",
510
+ "同程",
511
+ "yy语音",
512
+ "浪潮云",
513
+ "东财",
514
+ "淘宝",
515
+ "寻梦",
516
+ "citic securities limited",
517
+ "青橙之旅",
518
+ "阿里巴巴",
519
+ "番茄小说",
520
+ "上海亿贝",
521
+ "inspur",
522
+ "babytree inc",
523
+ "海康智慧产业股权投资基金合伙合伙",
524
+ "adyen",
525
+ "艺龙",
526
+ "蚂蚁金服",
527
+ "平安金服",
528
+ "百度百科",
529
+ "unionpay",
530
+ "当当",
531
+ "阅文集团",
532
+ "东方财富",
533
+ "东方证券",
534
+ "哈罗单车",
535
+ "优酷",
536
+ "海康",
537
+ "alipay china network",
538
+ "网商银行",
539
+ "钧正",
540
+ "property.pingan.com",
541
+ "豹咖啡",
542
+ "网易",
543
+ "我爱cba",
544
+ "theduapp",
545
+ "360",
546
+ "金山数字娱乐",
547
+ "新浪阅读",
548
+ "alibabagames",
549
+ "顺丰",
550
+ "支点商贸",
551
+ "同程旅行",
552
+ "citic securities",
553
+ "ele.com",
554
+ "tal",
555
+ "fresh hema",
556
+ "运满满",
557
+ "贝壳网",
558
+ "酷狗音乐",
559
+ "鲜城",
560
+ "360健康",
561
+ "浪潮世科",
562
+ "迅雷网络",
563
+ "哔哩哔哩",
564
+ "华为电动",
565
+ "淘友天下",
566
+ "华多网络",
567
+ "xunlei networking technologies",
568
+ "云杉",
569
+ "当当网电子商务",
570
+ "津虹网络",
571
+ "wedoc cloud hangzhou holdings",
572
+ "alisports shanghai",
573
+ "旷视金智",
574
+ "钉钉中国",
575
+ "微影",
576
+ "金山快快",
577
+ "亿贝",
578
+ "wedoc",
579
+ "autonavi",
580
+ "哈啰助力车",
581
+ "google cloud",
582
+ "新浪乐居",
583
+ "京东股票",
584
+ "搜狗智慧远程医疗中心",
585
+ "中银消金",
586
+ "merchants union consumer finance",
587
+ "王者荣耀",
588
+ "百度手机",
589
+ "美团民宿",
590
+ "kaola",
591
+ "小屋",
592
+ "金山网络",
593
+ "来往",
594
+ "顺丰速运",
595
+ "腾讯课堂",
596
+ "百度在线网络",
597
+ "美团买菜",
598
+ "威视汽车",
599
+ "uc mobile",
600
+ "来赞达",
601
+ "平安健康医疗",
602
+ "豹小秘",
603
+ "尚网",
604
+ "哈勃投资",
605
+ " ping an insurance group of china ,",
606
+ "小米",
607
+ "360好药",
608
+ "qq音乐",
609
+ "lingxigames",
610
+ "faceu激萌",
611
+ "搜狗",
612
+ "sohu",
613
+ "满帮",
614
+ "vipshop",
615
+ "wishpost",
616
+ "金山世游",
617
+ "shanghai yibaimi network",
618
+ "1688",
619
+ "海康汽车",
620
+ "顺丰控股",
621
+ "华为",
622
+ "妙镜vr",
623
+ "paybkj.com",
624
+ "hellobike",
625
+ "豹来电",
626
+ "京东",
627
+ "驴妈妈",
628
+ "momo",
629
+ "平安健康险",
630
+ "哈勃科技",
631
+ "美菜",
632
+ "众安在线财产保险",
633
+ "海康威视",
634
+ "east money information",
635
+ "阿里云",
636
+ "蝉游记",
637
+ "余额宝",
638
+ "屋客",
639
+ "滴滴",
640
+ "shopify international limited",
641
+ "百度",
642
+ "阿里健康中国",
643
+ "阿里通信",
644
+ "微梦创科",
645
+ "微医云",
646
+ "轻颜相机",
647
+ "搜易居",
648
+ "趣店集团",
649
+ "美团云",
650
+ "ant group",
651
+ "金山云",
652
+ "beijing express hand",
653
+ "觅觅",
654
+ "支付宝",
655
+ "滴滴承信科技咨询服务",
656
+ "拼多多",
657
+ "众安运动",
658
+ "乞力电商",
659
+ "youcash",
660
+ "唯品金融",
661
+ "陆金所",
662
+ "本地生活",
663
+ "sz dji",
664
+ "海康智能",
665
+ "魔方网聘",
666
+ "青藤大学",
667
+ "international business machines",
668
+ "学而思",
669
+ "beijing zhongming century science and",
670
+ "猎豹清理大师",
671
+ "asinking",
672
+ "高德",
673
+ "苏宁",
674
+ "优酷网",
675
+ "艾丁",
676
+ "中银消费金融",
677
+ "京东健康",
678
+ "五八教育",
679
+ "pingpongx",
680
+ "搜狐时尚",
681
+ "阿里广告",
682
+ "平安财险",
683
+ "中邮消金",
684
+ "etao",
685
+ "怕怕",
686
+ "nyse:cmcm",
687
+ "华为培训中心",
688
+ "高德地图",
689
+ "云狐天下征信",
690
+ "大疆创新",
691
+ "连尚",
692
+ "壹佰米",
693
+ "康健公司",
694
+ "iqiyi.com",
695
+ "360安全云盘",
696
+ "馒头直播",
697
+ "淘友网",
698
+ "东方赢家",
699
+ "bank of china",
700
+ "微众银行",
701
+ "阿里巴巴国际站",
702
+ "虾米",
703
+ "去哪儿网",
704
+ "ctrip travel network shanghai",
705
+ "潇湘书院",
706
+ "腾讯",
707
+ "快乐阳光互动娱乐传媒",
708
+ "迅雷",
709
+ "weidian",
710
+ "滴滴货运",
711
+ "ping an puhui enterprise management",
712
+ "新浪仓石基金销售",
713
+ "搜狐焦点",
714
+ "alibaba pictures",
715
+ "wps",
716
+ "平安",
717
+ "lazmall",
718
+ "百度开放平台",
719
+ "兴业消金",
720
+ " 珍爱网",
721
+ "京东云",
722
+ "小红书",
723
+ "1688. com",
724
+ "如视智数",
725
+ "missfresh",
726
+ "pazl.pingan.cn",
727
+ "平安集团",
728
+ "kugou",
729
+ "懂车帝",
730
+ "斑马智行",
731
+ "浪潮集团",
732
+ "netease hangzhou network",
733
+ "pagd.net",
734
+ "探探",
735
+ "chinaliterature",
736
+ "amazon亚马逊",
737
+ "alphabet",
738
+ "当当文创手工艺品电子商务",
739
+ "五八邦",
740
+ "shenzhen zhenai network information",
741
+ "lingshoutong",
742
+ "字节",
743
+ "lvmama",
744
+ "金山办公",
745
+ "众安保险",
746
+ "时装信息",
747
+ "优视科技",
748
+ "guangzhou kugou",
749
+ "ibm",
750
+ "滴滴打车",
751
+ "beijing sogou information service",
752
+ "megvii",
753
+ "健谈哥",
754
+ "cloudwalk group",
755
+ "蜂联科技",
756
+ "冬云",
757
+ "京东尚科",
758
+ "钢琴块2",
759
+ "京东世纪",
760
+ "商汤",
761
+ "众鸣世纪",
762
+ "腾讯音乐",
763
+ "迅雷网文化",
764
+ "华为云计算技术",
765
+ "live.me",
766
+ "全球速卖通",
767
+ "快的打车",
768
+ "hello group inc",
769
+ "美丽说",
770
+ "suning",
771
+ "opengauss",
772
+ "lazada",
773
+ "tmall",
774
+ "acfun",
775
+ "当当网",
776
+ "中银",
777
+ "旷视科技",
778
+ "百度钱包",
779
+ "淘宝网",
780
+ "新浪微博",
781
+ "迅雷集团",
782
+ "中信消费金融",
783
+ "学而思教育",
784
+ "平安普惠",
785
+ "悟空跨境",
786
+ "irobotbox",
787
+ "平安产险",
788
+ "inspur group",
789
+ "世纪卓越快递服务",
790
+ "奇虎360",
791
+ "webank",
792
+ "偶藻",
793
+ "唯品支付",
794
+ "腾讯云计算",
795
+ "众安服务",
796
+ "亿之唐",
797
+ "beijing 58 information technology",
798
+ "平安好医生",
799
+ "迅雷之锤",
800
+ "旅行小账本",
801
+ "芒果游戏",
802
+ "新浪传媒",
803
+ "旷镜博煊",
804
+ "全民k歌",
805
+ "滴滴支付",
806
+ "北京网心科技",
807
+ "挂号网",
808
+ "萤石",
809
+ "chinavision media group limited",
810
+ "猎豹安全大师",
811
+ "cmcm",
812
+ "趣店",
813
+ "蚂蚁财富",
814
+ "商汤科技",
815
+ "甲骨文",
816
+ "百度云",
817
+ "百度apollo",
818
+ "19 pay",
819
+ "stock.pingan.com",
820
+ "tiktok",
821
+ "alibaba pictures group limited",
822
+ "ele",
823
+ "考拉",
824
+ "天猫",
825
+ "腾讯优图",
826
+ "起点中文网",
827
+ "百度视频",
828
+ "shanghai bili bili",
829
+ "京东物流",
830
+ "ebay marketplaces gmbh",
831
+ "alibaba sport",
832
+ "wish",
833
+ "阿里巴巴中国",
834
+ "中国银联",
835
+ "alibaba china network",
836
+ "china ping an property insurance",
837
+ "百度糯米网",
838
+ "微软中国",
839
+ "一九付",
840
+ "4 paradigm",
841
+ "叮咚买菜",
842
+ "umeng",
843
+ "众鸣科技",
844
+ "平安财富通",
845
+ "google",
846
+ "巨量引擎",
847
+ "百度贴吧",
848
+ "beijing jingdong century information",
849
+ "讯飞",
850
+ "beijing yunshan information",
851
+ "满运软件",
852
+ "中邮消费金融",
853
+ "饿了么",
854
+ "alios",
855
+ "腾讯ai实验室",
856
+ "第四范式智能",
857
+ "瀚星创业投资",
858
+ "gradient ventures",
859
+ "microsoft",
860
+ "哈啰共享汽车",
861
+ "乞力电子商务",
862
+ "mscf",
863
+ "网易影业文化",
864
+ "铁友旅游咨询",
865
+ "kilimall",
866
+ "云企互联投资",
867
+ "ping an financial consulting",
868
+ "beijing jingdong century commerce",
869
+ "高德威智能交通系统",
870
+ "中友信息",
871
+ "平安医疗健康管理",
872
+ "eciticcfc",
873
+ "中信证券",
874
+ "fliggy",
875
+ "电子湾",
876
+ "旷云金智",
877
+ "微粒贷",
878
+ "rsi",
879
+ "滴滴云计算",
880
+ "google ventures",
881
+ "箐程",
882
+ "每日优鲜",
883
+ "音兔",
884
+ "拉扎斯",
885
+ "今日头条",
886
+ "乐信控股",
887
+ "猎豹浏览器",
888
+ "细微咨询",
889
+ "好未来",
890
+ "我乐",
891
+ "绘声绘色",
892
+ "抖音",
893
+ "搜狐新时代",
894
+ "飞猪",
895
+ "鹅厂",
896
+ "贝壳找房",
897
+ "tuniu",
898
+ "红马传媒文化",
899
+ "钉钉",
900
+ "马上消费金融",
901
+ "360手机",
902
+ "平安医保",
903
+ "快途",
904
+ "alibaba",
905
+ "小哈换电",
906
+ "大麦",
907
+ "恒睿人工智能研究院",
908
+ "谷歌资本",
909
+ "猎豹",
910
+ "穆迪信息"
911
+ ]
deepdoc/parser/resume/entities/res/good_sch.json ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "中国科技大学",
3
+ "国防科学技术大学",
4
+ "清华大学",
5
+ "清华",
6
+ "tsinghua university",
7
+ "thu",
8
+ "北京大学",
9
+ "北大",
10
+ "beijing university",
11
+ "pku",
12
+ "中国科学技术大学",
13
+ "中国科大",
14
+ "中科大",
15
+ "china science & technology university",
16
+ "ustc",
17
+ "复旦大学",
18
+ "复旦",
19
+ "fudan university",
20
+ "fdu",
21
+ "中国人民大学",
22
+ "人大",
23
+ "人民大学",
24
+ "renmin university of china",
25
+ "ruc",
26
+ "上海交通大学",
27
+ "上海交大",
28
+ "shanghai jiao tong university",
29
+ "sjtu",
30
+ "南京大学",
31
+ "南大",
32
+ "nanjing university",
33
+ "nju",
34
+ "同济大学",
35
+ "同济",
36
+ "tongji university",
37
+ "tongji",
38
+ "浙江大学",
39
+ "浙大",
40
+ "zhejiang university",
41
+ "zju",
42
+ "南开大学",
43
+ "南开",
44
+ "nankai university",
45
+ "nku",
46
+ "北京航空航天大学",
47
+ "北航",
48
+ "beihang university",
49
+ "buaa",
50
+ "北京师范大学",
51
+ "北师",
52
+ "北师大",
53
+ "beijing normal university",
54
+ "bnu",
55
+ "武汉大学",
56
+ "武大",
57
+ "wuhan university",
58
+ "whu",
59
+ "西安交通大学",
60
+ "西安交大",
61
+ "xi’an jiaotong university",
62
+ "xjtu",
63
+ "天津大学",
64
+ "天大",
65
+ "university of tianjin",
66
+ "tju",
67
+ "华中科技大学",
68
+ "华中大",
69
+ "central china university science and technology",
70
+ "hust",
71
+ "北京理工大学",
72
+ "北理",
73
+ "beijing institute of technology",
74
+ "bit",
75
+ "东南大学",
76
+ "东大",
77
+ "southeast china university",
78
+ "seu",
79
+ "中山大学",
80
+ "中大",
81
+ "zhongshan university",
82
+ "sysu",
83
+ "华东师范大学",
84
+ "华师大",
85
+ "east china normal university",
86
+ "ecnu",
87
+ "哈尔滨工业大学",
88
+ "哈工大",
89
+ "harbin institute of technology",
90
+ "hit",
91
+ "厦门大学",
92
+ "厦大",
93
+ "xiamen university",
94
+ "xmu",
95
+ "西北工业大学",
96
+ "西工大",
97
+ "西北工大",
98
+ "northwestern polytechnical university",
99
+ "npu",
100
+ "中南大学",
101
+ "中南",
102
+ "middle and southern university",
103
+ "csu",
104
+ "大连理工大学",
105
+ "大工",
106
+ "institute of technology of dalian",
107
+ "dut",
108
+ "四川大学",
109
+ "川大",
110
+ "sichuan university",
111
+ "scu",
112
+ "电子科技大学",
113
+ "电子科大",
114
+ "university of electronic science and technology of china",
115
+ "uestc",
116
+ "华南理工大学",
117
+ "华南理工",
118
+ "institutes of technology of south china",
119
+ "scut",
120
+ "吉林大学",
121
+ "吉大",
122
+ "jilin university",
123
+ "jlu",
124
+ "湖南大学",
125
+ "湖大",
126
+ "hunan university",
127
+ "hnu",
128
+ "重庆大学",
129
+ "重大",
130
+ "university of chongqing",
131
+ "cqu",
132
+ "山东大学",
133
+ "山大",
134
+ "shandong university",
135
+ "sdu",
136
+ "中国农业大学",
137
+ "中国农大",
138
+ "china agricultural university",
139
+ "cau",
140
+ "中国海洋大学",
141
+ "中国海大",
142
+ "chinese marine university",
143
+ "ouc",
144
+ "中央民族大学",
145
+ "中央民大",
146
+ "central university for nationalities",
147
+ "muc",
148
+ "东北大学",
149
+ "东北工学院",
150
+ "northeastern university",
151
+ "neu","nu",
152
+ "兰州大学",
153
+ "兰大",
154
+ "lanzhou university",
155
+ "lzu",
156
+ "西北农林科技大学",
157
+ "西农","西北农大",
158
+ "northwest a&f university",
159
+ "nwafu",
160
+ "中国人民解放军国防科技大学",
161
+ "国防科技大学","国防科大",
162
+ "national university of defense technology",
163
+ "nudt",
164
+ "郑州大学",
165
+ "郑大",
166
+ "zhengzhou university",
167
+ "zzu",
168
+ "云南大学",
169
+ "云大",
170
+ "yunnan university",
171
+ "ynu",
172
+ "新疆大学",
173
+ "新大",
174
+ "xinjiang university",
175
+ "xju",
176
+ "北京交通大学",
177
+ "北京交大",
178
+ "beijing jiaotong university",
179
+ "bjtu",
180
+ "北京工业大学",
181
+ "北工大",
182
+ "beijing university of technology",
183
+ "bjut",
184
+ "北京科技大学",
185
+ "北科大","北京科大",
186
+ "university of science and technology beijing",
187
+ "ustb",
188
+ "北京化工大学",
189
+ "北化",
190
+ "beijing university of chemical technology",
191
+ "buct",
192
+ "北京邮电大学",
193
+ "北邮",
194
+ "beijing university of posts and telecommunications",
195
+ "beijing university of post and telecommunications",
196
+ "beijing university of post and telecommunication",
197
+ "beijing university of posts and telecommunication",
198
+ "bupt",
199
+ "北京林业大学",
200
+ "北林",
201
+ "beijing forestry university",
202
+ "bfu",
203
+ "北京协和医学院",
204
+ "协和医学院",
205
+ "peking union medical college",
206
+ "pumc",
207
+ "北京中医药大学",
208
+ "北中医",
209
+ "beijing university of chinese medicine",
210
+ "bucm",
211
+ "首都师范大学",
212
+ "首师大",
213
+ "capital normal university",
214
+ "cnu",
215
+ "北京外国语大学",
216
+ "北外",
217
+ "beijing foreign studies university",
218
+ "bfsu",
219
+ "中国传媒大学",
220
+ "中媒",
221
+ "中传",
222
+ "北京广播学院",
223
+ "communication university of china",
224
+ "cuc",
225
+ "中央财经大学",
226
+ "中央财大",
227
+ "中财大",
228
+ "the central university of finance and economics",
229
+ "cufe",
230
+ "对外经济贸易大学",
231
+ "对外经贸大学",
232
+ "贸大",
233
+ "university of international business and economics",
234
+ "uibe",
235
+ "外交学院",
236
+ "外院",
237
+ "china foreign affairs university",
238
+ "cfau",
239
+ "中国人民公安大学",
240
+ "公安大学",
241
+ "people's public security university of china",
242
+ "ppsuc",
243
+ "北京体育大学",
244
+ "北体大",
245
+ "beijing sport university",
246
+ "bsu",
247
+ "中央音乐学院",
248
+ "央音",
249
+ "中央院",
250
+ "central conservatory of music",
251
+ "ccom",
252
+ "中国音乐学院",
253
+ "国音",
254
+ "中国院",
255
+ "china conservatory of music",
256
+ "ccmusic",
257
+ "中央美术学院",
258
+ "央美",
259
+ "central academy of fine art",
260
+ "cafa",
261
+ "中央戏剧学院",
262
+ "中戏",
263
+ "the central academy of drama",
264
+ "tcad",
265
+ "中国政法大学",
266
+ "法大",
267
+ "china university of political science and law",
268
+ "zuc",
269
+ "cupl",
270
+ "中国科学院大学",
271
+ "国科大",
272
+ "科院大",
273
+ "university of chinese academy of sciences",
274
+ "ucas",
275
+ "福州大学",
276
+ "福大",
277
+ "university of fuzhou",
278
+ "fzu",
279
+ "暨南大学",
280
+ "暨大",
281
+ "ji'nan university",
282
+ "jnu",
283
+ "广州中医药大学",
284
+ "广中医",
285
+ "traditional chinese medicine university of guangzhou",
286
+ "gucm",
287
+ "华南师范大学",
288
+ "华南师大",
289
+ "south china normal university",
290
+ "scnu",
291
+ "广西大学",
292
+ "西大",
293
+ "guangxi university",
294
+ "gxu",
295
+ "贵州大学",
296
+ "贵大",
297
+ "guizhou university",
298
+ "gzu",
299
+ "海南大学",
300
+ "海大",
301
+ "university of hainan",
302
+ "hainu",
303
+ "河南大学",
304
+ "河大",
305
+ "he'nan university",
306
+ "henu",
307
+ "哈尔滨工程大学",
308
+ "哈工程",
309
+ "harbin engineering university",
310
+ "heu",
311
+ "东北农业大学",
312
+ "东北农大",
313
+ "northeast agricultural university",
314
+ "neau",
315
+ "东北林业大学",
316
+ "东北林大",
317
+ "northeast forestry university",
318
+ "nefu",
319
+ "中国地质大学",
320
+ "地大",
321
+ "china university of geosciences",
322
+ "cug",
323
+ "武汉理工大学",
324
+ "武汉理工",
325
+ "wuhan university of technology",
326
+ "wut",
327
+ "华中农业大学",
328
+ "华中农大",
329
+ "华农",
330
+ "central china agricultural university",
331
+ "hzau",
332
+ "华中师范大学",
333
+ "华中师大",
334
+ "华大",
335
+ "central china normal university",
336
+ "ccnu",
337
+ "中南财经政法大学",
338
+ "中南大",
339
+ "zhongnan university of economics & law",
340
+ "zuel",
341
+ "湖南师范大学",
342
+ "湖南师大",
343
+ "hunan normal university",
344
+ "hunnu",
345
+ "延边大学",
346
+ "延大",
347
+ "yanbian university",
348
+ "ybu",
349
+ "东北师范大学",
350
+ "东北师大",
351
+ "northeast normal university",
352
+ "nenu",
353
+ "苏州大学",
354
+ "苏大",
355
+ "soochow university",
356
+ "suda",
357
+ "南京航空航天大学",
358
+ "南航",
359
+ "nanjing aero-space university",
360
+ "nuaa",
361
+ "南京理工大学",
362
+ "南理工",
363
+ "institutes of technology of nanjing",
364
+ "njust",
365
+ "中国矿业大学",
366
+ "中国矿大",
367
+ "china mining university",
368
+ "cumt",
369
+ "南京邮电大学",
370
+ "南邮",
371
+ "nanjing university of posts and telecommunications",
372
+ "njupt",
373
+ "河海大学",
374
+ "河海",
375
+ "river sea university",
376
+ "hhu",
377
+ "江南大学",
378
+ "江南大",
379
+ "jiangnan university",
380
+ "jiangnan",
381
+ "南京林业大学",
382
+ "南林",
383
+ "nanjing forestry university",
384
+ "njfu",
385
+ "南京信息工程大学",
386
+ "南信大",
387
+ "nanjing university of information science and technology",
388
+ "nuist",
389
+ "南京农业大学",
390
+ "南农",
391
+ "南农大",
392
+ "南京农大",
393
+ "agricultural university of nanjing",
394
+ "njau",
395
+ "nau",
396
+ "南京中医药大学",
397
+ "南中医",
398
+ "nanjing university of chinese medicine",
399
+ "njucm",
400
+ "中国药科大学",
401
+ "中国药大",
402
+ "china medicine university",
403
+ "cpu",
404
+ "南京师范大学",
405
+ "南京师大",
406
+ "南师大",
407
+ "南师",
408
+ "nanjing normal university",
409
+ "nnu",
410
+ "南昌大学",
411
+ "昌大",
412
+ "university of nanchang","nanchang university",
413
+ "ncu",
414
+ "辽宁大学",
415
+ "辽大",
416
+ "liaoning university",
417
+ "lnu",
418
+ "大连海事大学",
419
+ "大连海大",
420
+ "海大",
421
+ "maritime affairs university of dalian",
422
+ "dmu",
423
+ "内蒙古大学",
424
+ "内大",
425
+ "university of the inner mongol","inner mongolia university",
426
+ "imu",
427
+ "宁夏大学",
428
+ "宁大",
429
+ "ningxia university",
430
+ "nxu",
431
+ "青海大学",
432
+ "清大",
433
+ "qinghai university",
434
+ "qhu",
435
+ "中国石油大学",
436
+ "中石大",
437
+ "china university of petroleum beijing",
438
+ "upc",
439
+ "太原理工大学",
440
+ "太原理工",
441
+ "institutes of technology of taiyuan","taiyuan university of technology",
442
+ "tyut",
443
+ "西北大学",
444
+ "西大",
445
+ "northwest university",
446
+ "nwu",
447
+ "西安电子科技大学",
448
+ "西电",
449
+ "xidian university",
450
+ "xdu",
451
+ "长安大学",
452
+ "长大",
453
+ "chang`an university",
454
+ "chu",
455
+ "陕西师范大学",
456
+ "陕西师大",
457
+ "陕师大",
458
+ "shaanxi normal university",
459
+ "snnu",
460
+ "第四军医大学",
461
+ "空军军医大学","四医大",
462
+ "air force medical university",
463
+ "fmmu",
464
+ "华东理工大学",
465
+ "华理",
466
+ "east china university of science",
467
+ "ecust",
468
+ "东华大学",
469
+ "东华",
470
+ "donghua university",
471
+ "dhu",
472
+ "上海海洋大学",
473
+ "上海海大",
474
+ "shanghai ocean university",
475
+ "shou",
476
+ "上海中医药大学",
477
+ "上中医",
478
+ "shanghai university of traditional chinese medicine",
479
+ "shutcm",
480
+ "上海外国语大学",
481
+ "上外",
482
+ "shanghai international studies university",
483
+ "sisu",
484
+ "上海财经大学",
485
+ "上海财大",
486
+ "上财",
487
+ "shanghai university of finance",
488
+ "sufe",
489
+ "上海体育学院",
490
+ "shanghai university of sport",
491
+ "上海音乐学院",
492
+ "上音",
493
+ "shanghai conservatory of music",
494
+ "shcm",
495
+ "上海大学",
496
+ "上大",
497
+ "shanghai university",
498
+ "第二军医大学",
499
+ "海军军医大学",
500
+ "naval medical university",
501
+ "西南交通大学",
502
+ "西南交大",
503
+ "southwest jiaotong university",
504
+ "swjtu",
505
+ "西南石油大学",
506
+ "西南石大",
507
+ "southwest petroleum university",
508
+ "swpu",
509
+ "成都理工大学",
510
+ "成都理工",
511
+ "chengdu university of technology",
512
+ "cdut",
513
+ "四川农业大学",
514
+ "川农",
515
+ "川农大",
516
+ "sichuan agricultural university",
517
+ "sicau",
518
+ "成都中医药大学",
519
+ "成中医",
520
+ "chengdu university of tcm",
521
+ "cdutcm",
522
+ "西南财经大学",
523
+ "西南财大",
524
+ "西财",
525
+ "southwestern university of finance and economics",
526
+ "swufe",
527
+ "天津工业大学",
528
+ "天工大",
529
+ "tianjin university of technology",
530
+ "tgu",
531
+ "天津医科大学",
532
+ "天津医大",
533
+ "medical university of tianjin",
534
+ "tmu",
535
+ "天津中医药大学",
536
+ "天中",
537
+ "tianjin university of traditional chinese medicine",
538
+ "tutcm",
539
+ "华北电力大学",
540
+ "华电",
541
+ "north china electric power university",
542
+ "ncepu",
543
+ "河北工业大学",
544
+ "河工大",
545
+ "hebei university of technology",
546
+ "hebut",
547
+ "西藏大学",
548
+ "藏大",
549
+ "tibet university",
550
+ "tu",
551
+ "石河子大学",
552
+ "石大",
553
+ "shihezi university",
554
+ "中国美术学院",
555
+ "中国美院",
556
+ "国美",
557
+ "china academy of art",
558
+ "caa",
559
+ "宁波大学",
560
+ "宁大",
561
+ "ningbo university",
562
+ "nbu",
563
+ "西南大学",
564
+ "西大",
565
+ "southwest university",
566
+ "swu",
567
+ "安徽大学",
568
+ "安大",
569
+ "university of anhui",
570
+ "ahu",
571
+ "合肥工业大学",
572
+ "合肥工大",
573
+ "合工大",
574
+ "hefei university of technology",
575
+ "hfut",
576
+ "中国地质大学",
577
+ "地大",
578
+ "china university of geosciences",
579
+ "cug",
580
+ "中国地质大学",
581
+ "地大",
582
+ "北京地大",
583
+ "cugb",
584
+ "中国矿业大学",
585
+ "中国矿大",
586
+ "china university of mining & technology",
587
+ "cumtb",
588
+ "中国石油大学",
589
+ "中石大",
590
+ "石大",
591
+ "china university of petroleum",
592
+ "cup",
593
+ "中国石油大学",
594
+ "中石大",
595
+ "cup"]
deepdoc/parser/resume/entities/res/school.rank.csv ADDED
@@ -0,0 +1,1627 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 清华大学,2,985,清华
2
+ 清华大学,2,985,Tsinghua University
3
+ 清华大学,2,985,THU
4
+ 北京大学,1,985,北大
5
+ 北京大学,1,985,Beijing University
6
+ 北京大学,1,985,PKU
7
+ 中国科学技术大学,14,985,中国科大
8
+ 中国科学技术大学,14,985,中科大
9
+ 中国科学技术大学,14,985,China Science & Technology University
10
+ 中国科学技术大学,14,985,USTC
11
+ 复旦大学,5,985,复旦
12
+ 复旦大学,5,985,Fudan University
13
+ 复旦大学,5,985,FDU
14
+ 中国人民大学,15,985,人大
15
+ 中国人民大学,15,985,人民大学
16
+ 中国人民大学,15,985,Renmin University Of China
17
+ 中国人民大学,15,985,RUC
18
+ 上海交通大学,4,985,上海交大
19
+ 上海交通大学,4,985,Shanghai Jiao Tong University
20
+ 上海交通大学,4,985,SJTU
21
+ 南京大学,11,985,南大
22
+ 南京大学,11,985,Nanjing University
23
+ 南京大学,11,985,NJU
24
+ 同济大学,17,985,同济
25
+ 同济大学,17,985,Tongji University
26
+ 同济大学,17,985,Tongji
27
+ 浙江大学,3,985,浙大
28
+ 浙江大学,3,985,Zhejiang University
29
+ 浙江大学,3,985,ZJU
30
+ 南开大学,27,985,南开
31
+ 南开大学,27,985,Nankai University
32
+ 南开大学,27,985,NKU
33
+ 北京航空航天大学,21,985,北航
34
+ 北京航空航天大学,21,985,Beihang University
35
+ 北京航空航天大学,21,985,BUAA
36
+ 北京师范大学,23,985,北师
37
+ 北京师范大学,23,985,北师大
38
+ 北京师范大学,23,985,Beijing Normal University
39
+ 北京师范大学,23,985,BNU
40
+ 武汉大学,6,985,武大
41
+ 武汉大学,6,985,Wuhan University
42
+ 武汉大学,6,985,WHU
43
+ 西安交通大学,13,985,西安交大
44
+ 西安交通大学,13,985,Xi’an Jiaotong University
45
+ 西安交通大学,13,985,XJTU
46
+ 天津大学,22,985,天大
47
+ 天津大学,22,985,University Of Tianjin
48
+ 天津大学,22,985,TJU
49
+ 华中科技大学,8,985,华中大
50
+ 华中科技大学,8,985,Central China University Science and Technology
51
+ 华中科技大学,8,985,HUST
52
+ 北京理工大学,24,985,北理
53
+ 北京理工大学,24,985,Beijing Institute of Technology
54
+ 北京理工大学,24,985,BIT
55
+ 东南大学,20,985,东大
56
+ 东南大学,20,985,Southeast China University
57
+ 东南大学,20,985,SEU
58
+ 中山大学,9,985,中大
59
+ 中山大学,9,985,Zhongshan University
60
+ 中山大学,9,985,SYSU
61
+ 华东师范大学,30,985,华师大
62
+ 华东师范大学,30,985,East China Normal University
63
+ 华东师范大学,30,985,ECNU
64
+ 哈尔滨工业大学,10,985,哈工大
65
+ 哈尔滨工业大学,10,985,Harbin Institute Of Technology
66
+ 哈尔滨工业大学,10,985,HIT
67
+ 厦门大学,28,985,厦大
68
+ 厦门大学,28,985,Xiamen University
69
+ 厦门大学,28,985,XMU
70
+ 西北工业大学,31,985,西工大
71
+ 西北工业大学,31,985,西北工大
72
+ 西北工业大学,31,985,Northwestern Polytechnical University
73
+ 西北工业大学,31,985,NPU
74
+ 中南大学,19,985,中南
75
+ 中南大学,19,985,Middle and Southern University
76
+ 中南大学,19,985,CSU
77
+ 大连理工大学,26,985,大工
78
+ 大连理工大学,26,985,Institute Of Technology Of Dalian
79
+ 大连理工大学,26,985,DUT
80
+ 四川大学,7,985,川大
81
+ 四川大学,7,985,Sichuan University
82
+ 四川大学,7,985,SCU
83
+ 电子科技大学,32,985,电子科大
84
+ 电子科技大学,32,985,University of Electronic Science and Technology of China
85
+ 电子科技大学,32,985,UESTC
86
+ 华南理工大学,25,985,华南理工
87
+ 华南理工大学,25,985,Institutes Of Technology Of South China
88
+ 华南理工大学,25,985,SCUT
89
+ 吉林大学,12,985,吉大
90
+ 吉林大学,12,985,Jilin University
91
+ 吉林大学,12,985,JLU
92
+ 湖南大学,38,985,湖大
93
+ 湖南大学,38,985,Hunan University
94
+ 湖南大学,38,985,HNU
95
+ 重庆大学,29,985,重大
96
+ 重庆大学,29,985,University Of Chongqing
97
+ 重庆大学,29,985,CQU
98
+ 山东大学,16,985,山大
99
+ 山东大学,16,985,Shandong University
100
+ 山东大学,16,985,SDU
101
+ 中国农业大学,37,985,中国农大
102
+ 中国农业大学,37,985,China Agricultural University
103
+ 中国农业大学,37,985,CAU
104
+ 中国海洋大学,60,985,中国海大
105
+ 中国海洋大学,60,985,Chinese Marine University
106
+ 中国海洋大学,60,985,OUC
107
+ 中央民族大学,106,985,中央民大
108
+ 中央民族大学,106,985,Central University For Nationalities
109
+ 中央民族大学,106,985,MUC
110
+ 东北大学,41,985,东北工学院
111
+ 东北大学,41,985,Northeastern University
112
+ 东北大学,41,985,NEU
113
+ 东北大学,41,985,NU
114
+ 兰州大学,39,985,兰大
115
+ 兰州大学,39,985,Lanzhou University
116
+ 兰州大学,39,985,LZU
117
+ 西北农林科技大学,78,985,西农、西北农大
118
+ 西北农林科技大学,78,985,Northwest A&F University
119
+ 西北农林科技大学,78,985,NWAFU
120
+ 中国人民解放军国防科技大学,89,985,国防科技大学、国防科大、国防科学技术大学
121
+ 中国人民解放军国防科技大学,89,985,National University of Defense Technology
122
+ 中国人民解放军国防科技大学,89,985,NUDT
123
+ 郑州大学,34,211,郑大
124
+ 郑州大学,34,211,Zhengzhou University
125
+ 郑州大学,34,211,ZZU
126
+ 云南大学,75,211,云大
127
+ 云南大学,75,211,Yunnan University
128
+ 云南大学,75,211,YNU
129
+ 新疆大学,114,211,新大
130
+ 新疆大学,114,211,Xinjiang University
131
+ 新疆大学,114,211,XJU
132
+ 北京交通大学,49,211,北京交大
133
+ 北京交通大学,49,211,Beijing Jiaotong University
134
+ 北京交通大学,49,211,BJTU
135
+ 北京工业大学,63,211,北工大
136
+ 北京工业大学,63,211,Beijing University Of Technology
137
+ 北京工业大学,63,211,BJUT
138
+ 北京科技大学,48,211,北科大、北京科大
139
+ 北京科技大学,48,211,University Of Science and Technology Beijing
140
+ 北京科技大学,48,211,USTB
141
+ 北京化工大学,82,211,北化
142
+ 北京化工大学,82,211,Beijing University of Chemical Technology
143
+ 北京化工大学,82,211,BUCT
144
+ 北京邮电大学,76,211,北邮
145
+ 北京邮电大学,76,211,Beijing University Of Posts and Telecommunications
146
+ 北京邮电大学,76,211,BUPT
147
+ 北京林业大学,88,211,北林
148
+ 北京林业大学,88,211,Beijing Forestry University
149
+ 北京林业大学,88,211,BFU
150
+ 北京协和医学院,,双一流,协和医学院
151
+ 北京协和医学院,,双一流,Peking Union Medical College
152
+ 北京协和医学院,,双一流,PUMC
153
+ 北京中医药大学,121,211,北中医
154
+ 北京中医药大学,121,211,Beijing University Of Chinese Medicine
155
+ 北京中医药大学,121,211,BUCM
156
+ 首都师范大学,,双一流,首师大
157
+ 首都师范大学,,双一流,Capital Normal University
158
+ 首都师范大学,,双一流,CNU
159
+ 北京外国语大学,124,211,北外
160
+ 北京外国语大学,124,211,Beijing Foreign Studies University
161
+ 北京外国语大学,124,211,BFSU
162
+ 中国传媒大学,96,211,中媒
163
+ 中国传媒大学,96,211,中传
164
+ 中国传媒大学,96,211,北京广播学院
165
+ 中国传媒大学,96,211,Communication University Of China
166
+ 中国传媒大学,96,211,CUC
167
+ 中央财经大学,79,211,中央财大
168
+ 中央财经大学,79,211,中财大
169
+ 中央财经大学,79,211,The Central University Of Finance and Economics
170
+ 中央财经大学,79,211,CUFE
171
+ 对外经济贸易大学,99,211,对外经贸大学
172
+ 对外经济贸易大学,99,211,贸大
173
+ 对外经济贸易大学,99,211,University Of International Business and Economics
174
+ 对外经济贸易大学,99,211,UIBE
175
+ 外交学院,,双一流,外院
176
+ 外交学院,,双一流,China Foreign Affairs University
177
+ 外交学院,,双一流,CFAU
178
+ 中国人民公安大学,,双一流,公安大学
179
+ 中国人民公安大学,,双一流,People's Public Security University of China
180
+ 中国人民公安大学,,双一流,PPSUC
181
+ 北京体育大学,122,211,北体大
182
+ 北京体育大学,122,211,Beijing Sport University
183
+ 北京体育大学,122,211,BSU
184
+ 中央音乐学院,103,211,央音
185
+ 中央音乐学院,103,211,中央院
186
+ 中央音乐学院,103,211,Central Conservatory Of Music
187
+ 中央音乐学院,103,211,CCOM
188
+ 中国音乐学院,,双一流,国音
189
+ 中国音乐学院,,双一流,中国院
190
+ 中国音乐学院,,双一流,China Conservatory of Music
191
+ 中国音乐学院,,双一流,CCMUSIC
192
+ 中央美术学院,104,双一流,央美
193
+ 中央美术学院,104,双一流,Central Academy of Fine Art
194
+ 中央美术学院,104,双一流,CAFA
195
+ 中央戏剧学院,133,双一流,中戏
196
+ 中央戏剧学院,133,双一流,The Central Academy Of Drama
197
+ 中央戏剧学院,133,双一流,TCAD
198
+ 中国政法大学,90,211,法大
199
+ 中国政法大学,90,211,China University Of Political Science and Law
200
+ 中国政法大学,90,211,ZUC
201
+ 中国政法大学,90,211,CUPL
202
+ 中国科学院大学,18,双一流,国科大
203
+ 中国科学院大学,18,双一流,科院大
204
+ 中国科学院大学,18,双一流,University of Chinese Academy of Sciences
205
+ 中国科学院大学,18,双一流,UCAS
206
+ 福州大学,72,211,福大
207
+ 福州大学,72,211,University Of Fuzhou
208
+ 福州大学,72,211,FZU
209
+ 暨南大学,44,211,暨大
210
+ 暨南大学,44,211,Ji'nan University
211
+ 暨南大学,44,211,JNU
212
+ 广州中医药大学,,双一流,广中医
213
+ 广州中医药大学,,双一流,Traditional Chinese Medicine University Of Guangzhou
214
+ 广州中医药大学,,双一流,GUCM
215
+ 华南师范大学,55,211,华南师大
216
+ 华南师范大学,55,211,South China Normal University
217
+ 华南师范大学,55,211,SCNU
218
+ 广西大学,71,211,广西大
219
+ 广西大学,71,211,Guangxi University
220
+ 广西大学,71,211,GXU
221
+ 贵州大学,94,211,贵大
222
+ 贵州大学,94,211,Guizhou University
223
+ 贵州大学,94,211,GZU
224
+ 海南大学,101,211,海大
225
+ 海南大学,101,211,University Of Hainan
226
+ 海南大学,101,211,HAINU
227
+ 河南大学,85,双一流,河大
228
+ 河南大学,85,双一流,He'nan University
229
+ 河南大学,85,双一流,HENU
230
+ 哈尔滨工程大学,65,211,哈工程
231
+ 哈尔滨工程大学,65,211,Harbin Engineering University
232
+ 哈尔滨工程大学,65,211,HEU
233
+ 东北农业大学,98,211,东北农大
234
+ 东北农业大学,98,211,Northeast Agricultural University
235
+ 东北农业大学,98,211,NEAU
236
+ 东北林业大学,93,211,东北林大
237
+ 东北林业大学,93,211,Northeast Forestry University
238
+ 东北林业大学,93,211,NEFU
239
+ 中国地质大学,80,211,地大
240
+ 中国地质大学,80,211,China University Of Geosciences
241
+ 中国地质大学,80,211,CUG
242
+ 武汉理工大学,40,211,武汉理工
243
+ 武汉理工大学,40,211,Wuhan University of Technology
244
+ 武汉理工大学,40,211,WUT
245
+ 华中农业大学,52,211,华中农大
246
+ 华中农业大学,52,211,华农
247
+ 华中农业大学,52,211,Central China Agricultural University
248
+ 华中农业大学,52,211,HZAU
249
+ 华中师范大学,58,211,华中师大
250
+ 华中师范大学,58,211,华大
251
+ 华中师范大学,58,211,Central China Normal University
252
+ 华中师范大学,58,211,CCNU
253
+ 中南财经政法大学,105,211,中南大
254
+ 中南财经政法大学,105,211,Zhongnan University Of Economics & Law
255
+ 中南财经政法大学,105,211,ZUEL
256
+ 湖南师范大学,68,211,湖南师大
257
+ 湖南师范大学,68,211,Hunan Normal University
258
+ 湖南师范大学,68,211,HUNNU
259
+ 延边大学,130,211,延大
260
+ 延边大学,130,211,Yanbian University
261
+ 延边大学,130,211,YBU
262
+ 东北师范大学,69,211,东北师大
263
+ 东北师范大学,69,211,Northeast Normal University
264
+ 东北师范大学,69,211,NENU
265
+ 苏州大学,35,211,苏大
266
+ 苏州大学,35,211,Soochow University
267
+ 苏州大学,35,211,SUDA
268
+ 南京航空航天大学,33,211,南航
269
+ 南京航空航天大学,33,211,Nanjing Aero-Space University
270
+ 南京航空航天大学,33,211,NUAA
271
+ 南京理工大学,42,211,南理工
272
+ 南京理工大学,42,211,Institutes Of Technology Of Nanjing
273
+ 南京理工大学,42,211,NJUST
274
+ 中国矿业大学,61,211,中国矿大
275
+ 中国矿业大学,61,211,China Mining University
276
+ 中国矿业大学,61,211,CUMT
277
+ 南京邮电大学,,双一流,南邮
278
+ 南京邮电大学,,双一流,Nanjing University of Posts and Telecommunications
279
+ 南京邮电大学,,双一流,NJUPT
280
+ 河海大学,54,211,河海
281
+ 河海大学,54,211,River Sea University
282
+ 河海大学,54,211,HHU
283
+ 江南大学,57,211,江南大
284
+ 江南大学,57,211,Jiangnan University
285
+ 江南大学,57,211,Jiangnan
286
+ 南京林业大学,,双一流,南林
287
+ 南京林业大学,,双一流,Nanjing Forestry University
288
+ 南京林业大学,,双一流,NJFU
289
+ 南京信息工程大学,91,双一流,南信大
290
+ 南京信息工程大学,91,双一流,Nanjing University of Information Science and Technology
291
+ 南京信息工程大学,91,双一流,NUIST
292
+ 南京农业大学,53,211,南农
293
+ 南京农业大学,53,211,南农大
294
+ 南京农业大学,53,211,南京农大
295
+ 南京农业大学,53,211,Agricultural University Of Nanjing
296
+ 南京农业大学,53,211,NJAU
297
+ 南京农业大学,53,211,NAU
298
+ 南京中医药大学,,双一流,南中医
299
+ 南京中医药大学,,双一流,Nanjing University Of Chinese Medicine
300
+ 南京中医药大学,,双一流,NJUCM
301
+ 中国药科大学,119,211,中国药大
302
+ 中国药科大学,119,211,China Medicine University
303
+ 中国药科大学,119,211,CPU
304
+ 南京师范大学,56,211,南京师大
305
+ 南京师范大学,56,211,南师大
306
+ 南京师范大学,56,211,南师
307
+ 南京师范大学,56,211,Nanjing Normal University
308
+ 南京师范大学,56,211,NNU
309
+ 南昌大学,47,211,昌大
310
+ 南昌大学,47,211,University Of Nanchang、Nanchang University
311
+ 南昌大学,47,211,NCU
312
+ 辽宁大学,118,211,辽大
313
+ 辽宁大学,118,211,Liaoning University
314
+ 辽宁大学,118,211,LNU
315
+ 大连海事大学,111,211,大连海大
316
+ 大连海事大学,111,211,Maritime Affairs University Of Dalian
317
+ 大连海事大学,111,211,DMU
318
+ 内蒙古大学,116,211,内大
319
+ 内蒙古大学,116,211,University Of The Inner Mongol、Inner Mongolia University
320
+ 内蒙古大学,116,211,IMU
321
+ 宁夏大学,125,211,Ningxia University
322
+ 宁夏大学,125,211,NXU
323
+ 青海大学,129,211,清大
324
+ 青海大学,129,211,Qinghai University
325
+ 青海大学,129,211,QHU
326
+ 中国石油大学,77,双一流,中石大
327
+ 中国石油大学,77,双一流,China University Of Petroleum Beijing
328
+ 中国石油大学,77,双一流,UPC
329
+ 太原理工大学,84,211,太原理工
330
+ 太原理工大学,84,211,Institutes Of Technology Of Taiyuan、Taiyuan University of Technology
331
+ 太原理工大学,84,211,TYUT
332
+ 西北大学,59,211,西北大
333
+ 西北大学,59,211,Northwest University
334
+ 西北大学,59,211,NWU
335
+ 西安电子科技大学,50,211,西电
336
+ 西安电子科技大学,50,211,Xidian University
337
+ 西安电子科技大学,50,211,XDU
338
+ 长安大学,83,211,长大
339
+ 长安大学,83,211,Chang`an University
340
+ 长安大学,83,211,CHU
341
+ 陕西师范大学,67,211,陕西师大
342
+ 陕西师范大学,67,211,陕师大
343
+ 陕西师范大学,67,211,Shaanxi Normal University
344
+ 陕西师范大学,67,211,SNNU
345
+ 第四军医大学,,211,空军军医大学、四医大
346
+ 第四军医大学,,211,Air Force Medical University
347
+ 第四军医大学,,211,FMMU
348
+ 华东理工大学,45,211,华理
349
+ 华东理工大学,45,211,East China University Of Science
350
+ 华东理工大学,45,211,ECUST
351
+ 东华大学,74,211,东华
352
+ 东华大学,74,211,Donghua University
353
+ 东华大学,74,211,DHU
354
+ 上海海洋大学,,双一流,上海海大
355
+ 上海海洋大学,,双一流,Shanghai Ocean University
356
+ 上海海洋大学,,双一流,SHOU
357
+ 上海中医药大学,,211,上中医
358
+ 上海中医药大学,,211,Shanghai University of Traditional Chinese Medicine
359
+ 上海中医药大学,,211,SHUTCM
360
+ 上海外国语大学,123,211,上外
361
+ 上海外国语大学,123,211,Shanghai International Studies University
362
+ 上海外国语大学,123,211,SISU
363
+ 上海财经大学,95,211,上海财大
364
+ 上海财经大学,95,211,上财
365
+ 上海财经大学,95,211,Shanghai University Of Finance
366
+ 上海财经大学,95,211,SUFE
367
+ 上海体育学院,,双一流,Shanghai University Of Sport
368
+ 上海音乐学院,,双一流,上音
369
+ 上海音乐学院,,双一流,Shanghai Conservatory Of Music
370
+ 上海音乐学院,,双一流,SHCM
371
+ 上海大学,43,211,上大
372
+ 上海大学,43,211,Shanghai University
373
+ 第二军医大学,,211,海军军医大学
374
+ 第二军医大学,,211,Naval Medical University
375
+ 西南交通大学,36,211,西南交大
376
+ 西南交通大学,36,211,Southwest Jiaotong University
377
+ 西南交通大学,36,211,SWJTU
378
+ 西南石油大学,,双一流,西南石大
379
+ 西南石油大学,,双一流,Southwest Petroleum University
380
+ 西南石油大学,,双一流,SWPU
381
+ 成都理工大学,,双一流,成都理工
382
+ 成都理工大学,,双一流,Chengdu University Of Technology
383
+ 成都理工大学,,双一流,CDUT
384
+ 四川农业大学,113,211,川农
385
+ 四川农业大学,113,211,川农大
386
+ 四川农业大学,113,211,Sichuan Agricultural University
387
+ 四川农业大学,113,211,SICAU
388
+ 成都中医药大学,,双一流,成中医
389
+ 成都中医药大学,,双一流,Chengdu University of TCM
390
+ 成都中医药大学,,双一流,CDUTCM
391
+ 西南财经大学,97,211,西南财大
392
+ 西南财经大学,97,211,西财
393
+ 西南财经大学,97,211,Southwestern University Of Finance And Economics
394
+ 西南财经大学,97,211,SWUFE
395
+ 天津工业大学,,双一流,天工大
396
+ 天津工业大学,,双一流,Tianjin University of Technology
397
+ 天津工业大学,,双一流,TGU
398
+ 天津医科大学,107,211,天津医大
399
+ 天津医科大学,107,211,Medical University Of Tianjin
400
+ 天津医科大学,107,211,TMU
401
+ 天津中医药大学,,双一流,天中
402
+ 天津中医药大学,,双一流,Tianjin University of Traditional Chinese Medicine
403
+ 天津中医药大学,,双一流,TUTCM
404
+ 华北电力大学,73,211,华电
405
+ 华北电力大学,73,211,North China Electric Power University
406
+ 华北电力大学,73,211,NCEPU
407
+ 河北工业大学,92,211,河工大
408
+ 河北工业大学,92,211,Hebei University of Technology
409
+ 河北工业大学,92,211,HEBUT
410
+ 西藏大学,135,211,藏大
411
+ 西藏大学,135,211,Tibet University
412
+ 西藏大学,135,211,TU
413
+ 石河子大学,117,211,石大
414
+ 石河子大学,117,211,Shihezi University
415
+ 中国美术学院,,双一流,中国美院
416
+ 中国美术学院,,双一流,国美
417
+ 中国美术学院,,双一流,China Academy of Art
418
+ 中国美术学院,,双一流,CAA
419
+ 宁波大学,70,双一流,Ningbo University
420
+ 宁波大学,70,双一流,NBU
421
+ 西南大学,46,985,西南大
422
+ 西南大学,46,985,Southwest University
423
+ 西南大学,46,985,SWU
424
+ 安徽大学,81,211,安大
425
+ 安徽大学,81,211,University Of Anhui
426
+ 安徽大学,81,211,AHU
427
+ 合肥工业大学,51,211,合肥工大
428
+ 合肥工业大学,51,211,合工大
429
+ 合肥工业大学,51,211,HeFei University of Technology
430
+ 合肥工业大学,51,211,HFUT
431
+ 麻省理工学院,1,海外名校,Massachusetts Institute of Technology
432
+ 麻省理工学院,1,海外名校,MIT
433
+ 麻省理工学院,1,海外名校,麻省
434
+ 麻省理工学院,1,海外名校,马萨诸塞理工学院
435
+ 牛津大学,2,海外名校,University of Oxford
436
+ 牛津大学,2,海外名校,Oxford
437
+ 牛津大学,2,海外名校,Oxon
438
+ 牛津大学,2,海外名校,牛津
439
+ 斯坦福大学,3,海外名校,Stanford University
440
+ 斯坦福大学,3,海外名校,Leland Stanford Junior University
441
+ 斯坦福大学,3,海外名校,斯坦福
442
+ 斯坦福大学,3,海外名校,Stanford
443
+ 剑桥大学,3,海外名校,University of Cambridge
444
+ 剑桥大学,3,海外名校,剑桥
445
+ 哈佛大学,5,海外名校,Harvard University
446
+ 哈佛大学,5,海外名校,哈佛
447
+ 加州理工学院,6,海外名校,California Institute of Technology
448
+ 加州理工学院,6,海外名校,Caltech
449
+ 加州理工学院,6,海外名校,加州理工
450
+ 帝国理工学院,,海外名校,Imperial College London
451
+ 帝国理工学院,,海外名校,帝国理工医学院
452
+ 帝国理工学院,,海外名校,Imperial College of Science
453
+ 帝国理工学院,,海外名校,Technology and Medicine
454
+ 帝国理工学院,,海外名校,帝国学院
455
+ 帝国理工学院,,海外名校,帝国理工
456
+ 帝国理工学院,,海外名校,Imperial
457
+ 帝国理工学院,,海外名校,IC
458
+ 苏黎世联邦理工学院,8,海外名校,苏黎世理工
459
+ 苏黎世联邦理工学院,8,海外名校,ETH
460
+ 苏黎世联邦理工学院,8,海外名校,Swiss Federal Institute of Technology in Zurich
461
+ 伦敦大学学院,8,海外名校,UCL
462
+ 伦敦大学学院,8,海外名校,University College London
463
+ 芝加哥大学,10,海外名校,University of Chicago
464
+ 芝加哥大学,10,海外名校,芝大
465
+ 芝加哥大学,10,海外名校,UChicago
466
+ 新加坡国立大学,11,海外名校,National University of Singapore
467
+ 新加坡国立大学,11,海外名校,NUS
468
+ 南洋理工大学,12,海外名校,Nanyang Technological University
469
+ 南洋理工大学,12,海外名校,Singapore
470
+ 南洋理工大学,12,海外名校,NTU
471
+ 宾夕法尼亚大学,13,海外名校,University of Pennsylvania
472
+ 宾夕法尼亚大学,13,海外名校,UPenn
473
+ 宾夕法尼亚大学,13,海外名校,宾大
474
+ 洛桑联邦理工学院,14,海外名校,EPFL
475
+ 洛桑联邦理工学院,14,海外名校,Swiss federal Institute of Technology in Lausanne
476
+ 耶鲁大学,14,海外名校,Yale University
477
+ 耶鲁大学,14,海外名校,耶鲁
478
+ 耶鲁大学,14,海外名校,Yale
479
+ 爱丁堡大学,16,海外名校,The University of Edinburgh
480
+ 爱丁堡大学,16,海外名校,Edin
481
+ 爱丁堡大学,16,海外名校,爱大
482
+ 哥伦比亚大学,19,海外名校,Columbia University
483
+ 哥伦比亚大学,19,海外名校,哥大
484
+ 哥伦比亚大学,19,海外名校,Columbia University in the City of New York
485
+ 普林斯顿大学,20,海外名校,Princeton University
486
+ 普林斯顿大学,20,海外名校,普林斯顿
487
+ 康奈尔大学,21,海外名校,Cornell University
488
+ 康奈尔大学,21,海外名校,Cornell
489
+ 康奈尔大学,21,海外名校,康奈尔
490
+ 香港大学,22,海外名校,The University of Hong Kong
491
+ 香港大学,22,海外名校,HKU
492
+ 香港大学,22,海外名校,港大
493
+ 东京大学,23,海外名校,The University of Tokyo
494
+ 东京大学,23,海外名校,東京大学
495
+ 东京大学,23,海外名校,UTokyo
496
+ 密歇根大学安娜堡分校,,海外名校,University of Michigan-Ann Arbor
497
+ 密歇根大学安娜堡分校,,海外名校,UMich
498
+ 约翰霍普金斯大学,25,海外名校,Johns Hopkins University
499
+ 约翰霍普金斯大学,25,海外名校,JHU
500
+ 约翰霍普金斯大学,25,海外名校,Hopkins
501
+ 约翰霍普金斯大学,25,海外名校,霍普金斯大学
502
+ 多伦多大学,26,海外名校,University of Toronto
503
+ 多伦多大学,26,海外名校,UofT
504
+ 多伦多大学,26,海外名校,UToronto
505
+ 麦吉尔大学,27,海外名校,McGill University
506
+ 麦吉尔大学,27,海外名校,McGill
507
+ 澳洲国立大学,,海外名校,The Australian National University
508
+ 澳洲国立大学,,海外名校,ANU
509
+ 曼彻斯特大学,27,海外名校,The University of Manchester
510
+ 曼彻斯特大学,27,海外名校,UoM
511
+ 京都大学,33,海外名校,Kyoto University
512
+ 香港科技大学,34,海外名校,The Hong Kong University of Science and Technology
513
+ 香港科技大学,34,海外名校,HKUST
514
+ 伦敦大学国王学院,,海外名校,King's College London
515
+ 伦敦大学国王学院,,海外名校,King's
516
+ 伦敦大学国王学院,,海外名校,KCL
517
+ 首尔国立大学,36,海外名校,Seoul National University
518
+ 首尔国立大学,36,海外名校,SNU
519
+ 墨尔本大学,37,海外名校,The University of Melbourne
520
+ 墨尔本大学,37,海外名校,UniMelb
521
+ 墨尔本大学,37,海外名校,Melb Uni
522
+ 悉尼大学,38,海外名校,The University of Sydney
523
+ 悉尼大学,38,海外名校,USYD
524
+ 悉尼大学,38,海外名校,Sydney U
525
+ 香港中文大学,39,海外名校,The Chinese University of Hong Kong
526
+ 香港中文大学,39,海外名校,CUHK
527
+ 韩国高等科技学院,,海外名校,Korea Advanced Institute of Science And Technology
528
+ 韩国高等科技学院,,海外名校,KAIST
529
+ 纽约大学,42,海外名校,New York University
530
+ 纽约大学,42,海外名校,NYU
531
+ 新南威尔士大学,43,海外名校,The University of New South Wales
532
+ 新南威尔士大学,43,海外名校,UNSW
533
+ 巴黎第九大学,,海外名校,University Paris Dauphine
534
+ 英属哥伦比亚大学,,海外名校,University of British Columbia
535
+ 英属哥伦比亚大学,,海外名校,UBC
536
+ 昆士兰大学,47,海外名校,The University of Queensland
537
+ 昆士兰大学,47,海外名校,UQ
538
+ 巴黎理工学院,49,海外名校,Institut Polytechnique de Paris
539
+ 巴黎理工学院,49,海外名校,IP-Paris
540
+ 伦敦政治经济学院,49,海外名校,The London School of Economics and Political Science
541
+ 伦敦政治经济学院,49,海外名校,LSE
542
+ 伦敦政治经济学院,49,海外名校,London School of Economics
543
+ 慕尼黑工业大学,,海外名校,Technical University of Munich
544
+ 杜克大学,52,海外名校,Duke University
545
+ 卡耐基梅隆大学,,海外名校,Carnegie Mellon University
546
+ 慕尼黑工业大学,,海外名校,TU München
547
+ 慕尼黑工业大学,,海外名校,TUM
548
+ 香港城市大学,53,海外名校,City University of Hong Kong
549
+ 香港城市大学,53,海外名校,CityU
550
+ 阿姆斯特丹大学,55,海外名校,University of Amsterdam
551
+ 阿姆斯特丹大学,55,海外名校,UvA
552
+ 东京工业大学,56,海外名校,Tokyo Institute of Technology
553
+ 东京工业大学,56,海外名校,Tokyo Tech
554
+ 代尔夫特理工大学,57,海外名校,Delft University of Technology
555
+ 莫纳什大学,,海外名校,Monash University
556
+ 莫纳什大学,,海外名校,Monash
557
+ 莫纳什大学,,海外名校,蒙纳士
558
+ 莫纳什大学,,海外名校,莫纳什
559
+ 莫纳什大学,,海外名校,莫大
560
+ 布朗大学,60,海外名校,Brown University
561
+ 布朗大学,60,海外名校,Brown
562
+ 布朗大学,60,海外名校,布朗
563
+ 华威大学,61,海外名校,The University of Warwick
564
+ 华威大学,61,海外名校,华威
565
+ 华威大学,61,海外名校,Warwick
566
+ 华威大学,61,海外名校,UoW
567
+ 布里斯托大学,62,海外名校,University of Bristol
568
+ 布里斯托大学,62,海外名校,布大
569
+ 海德堡大学,63,海外名校,Heidelberg University
570
+ 海德堡大学,63,海外名校,海德堡大学
571
+ 慕尼黑大学,64,海外名校,LMU
572
+ 慕尼黑大学,64,海外名校,University of Munich
573
+ 马来亚大学,65,海外名校,University of Malaya
574
+ 马来亚大学,65,海外名校,马大
575
+ 香港理工大学,66,海外名校,The Hong Kong Polytechnic University
576
+ 香港理工大学,66,海外名校,PolyU
577
+ 香港理工大学,66,海外名校,理大
578
+ 香港理工大学,66,海外名校,港理工
579
+ 德克萨斯大学奥斯汀分校,,海外名校,University of Texas at Austin
580
+ 德克萨斯大学奥斯汀分校,,海外名校,得州大学
581
+ 德克萨斯大学奥斯汀分校,,海外名校,UT-Austin
582
+ 国立台湾大学,68,海外名校,National Taiwan University
583
+ 国立台湾大学,68,海外名校,台大
584
+ 布宜诺斯艾利斯大学,69,海外名校,Universidad de Buenos Aires
585
+ 布宜诺斯艾利斯大学,69,海外名校,UBA
586
+ 鲁汶大学(荷语),,海外名校,University of Leuven
587
+ 鲁汶大学(荷语),,海外名校,KU Leuven
588
+ 苏黎世大学,70,海外名校,University of Zurich
589
+ 苏黎世大学,70,海外名校,UZH
590
+ 索邦大学,72,海外名校,Sorbonne University
591
+ 索邦大学,72,海外名校,索邦
592
+ 索邦大学,72,海外名校,Sorbonne
593
+ 格拉斯哥大学,73,海外名校,University of Glasgow
594
+ 格拉斯哥大学,73,海外名校,格大
595
+ 高丽大学,,海外名校,Korea University
596
+ 高丽大学,,海外名校,高丽大
597
+ 大阪大学,75,海外名校,Osaka University
598
+ 大阪大学,75,海外名校,阪大
599
+ 威斯康辛大学麦迪逊分校,,海外名校,University of Wisconsin-Madison
600
+ 威斯康辛大学麦迪逊分校,,海外名校,UW-Madison
601
+ 南安普敦大学,,海外名校,University of Southampton
602
+ 南安普敦大学,,海外名校,Soton
603
+ 莫斯科国立大学,,海外名校,Lomonosov Moscow State University
604
+ 莫斯科国立大学,,海外名校,莫斯科大学
605
+ 哥本哈根大学,79,海外名校,University of Copenhagen
606
+ 哥本哈根大学,79,海外名校,UCPH
607
+ 延世大学,79,海外名校,Yonsei University
608
+ 延世大学,79,海外名校,연세대학교
609
+ 浦项科技大学,81,海外名校,Pohang University of Science And Technology
610
+ 浦项科技大学,81,海外名校,POSTECH
611
+ 杜伦大学,,海外名校,Durham University
612
+ 杜伦大学,,海外名校,Durham
613
+ 伊利诺伊大学厄巴纳-香槟分校,,海外名校,University of Illinois at Urbana-Champaign
614
+ 伊利诺伊大学厄巴纳-香槟分校,,海外名校,UIUC
615
+ 奥克兰大学,85,海外名校,The University of Auckland
616
+ 奥克兰大学,85,海外名校,UoA
617
+ 华盛顿大学,85,海外名校,University of Washington
618
+ 华盛顿大学,85,海外名校,UWashington
619
+ 华盛顿大学,85,海外名校,UW
620
+ 巴黎萨克雷大学,86,海外名校,Université Paris-Saclay
621
+ 巴黎萨克雷大学,86,海外名校,UPSaclay
622
+ 巴黎萨克雷大学,86,海外名校,UPS
623
+ 隆德大学,87,海外名校,Lund University
624
+ 佐治亚理工学院,88,海外名校,Georgia Institute of Technology
625
+ 佐治亚理工学院,88,海外名校,Georgia Tech
626
+ 佐治亚理工学院,88,海外名校,Gatech
627
+ 佐治亚理工学院,88,海外名校,GT
628
+ 瑞典皇家理工学院,89,海外名校,KTH Royal Institute of Technology
629
+ 瑞典皇家理工学院,89,海外名校,皇家理工学院
630
+ 瑞典皇家理工学院,89,海外名校,KTH
631
+ 伯明翰大学,90,海外名校,University of Birmingham
632
+ 伯明翰大学,90,海外名校,伯大
633
+ 伯明翰大学,90,海外名校,UoB
634
+ 圣安德鲁斯大学,91,海外名校,University of St Andrews
635
+ 圣安德鲁斯大学,91,海外名校,St Andrews
636
+ 圣安德鲁斯大学,91,海外名校,St And
637
+ 圣安德鲁斯大学,91,海外名校,圣安
638
+ 利兹大学,92,海外名校,University of Leeds
639
+ 西澳大学,93,海外名校,The University of Western Australia
640
+ 西澳大学,93,海外名校,UWA
641
+ 莱斯大学,94,海外名校,Rice University
642
+ 莱斯大学,94,海外名校,Rice
643
+ 谢菲尔德大学,95,海外名校,The University of Sheffield
644
+ 谢菲尔德大学,95,海外名校,谢菲
645
+ 谢菲尔德大学,95,海外名校,谢大
646
+ 谢菲尔德大学,95,海外名校,TUoS
647
+ 宾州州立大学公园分校,,海外名校,Pennsylvania State University
648
+ 宾州州立大学公园分校,,海外名校,University Park
649
+ 成均馆大学,97,海外名校,Sungkyunkwan University(SKKU)
650
+ 成均馆大学,97,海外名校,成均馆大
651
+ 成均馆大学,97,海外名校,SKKU
652
+ 丹麦技术大学,99,海外名校,Technical University of Denmark
653
+ 丹麦技术大学,99,海外名校,DTU
654
+ 北卡罗来纳大学教堂山分校,100,海外名校,University of North Carolina at Chapel Hill
655
+ 北卡罗来纳大学教堂山分校,100,海外名校,UNC
656
+ 都柏林三一学院,,海外名校,Trinity College Dublin
657
+ 都柏林三一学院,,海外名校,The University of Dublin
658
+ 都柏林三一学院,,海外名校,TCD
659
+ 奥斯陆大学,102,海外名校,University of Oslo
660
+ 奥斯陆大学,102,海外名校,UiO
661
+ 奥斯陆大学,102,海外名校,奥大
662
+ 诺丁汉大学,103,海外名校,University of Nottingham
663
+ 诺丁汉大学,103,海外名校,UoN
664
+ 诺丁汉大学,103,海外名校,诺大
665
+ 赫尔辛基大学,104,海外名校,University of Helsinki
666
+ 赫尔辛基大学,104,海外名校,UH
667
+ 墨西哥国立自治大学,105,海外名校,Universidad Nacional Autónoma de México
668
+ 墨西哥国立自治大学,105,海外名校,墨国大
669
+ 墨西哥国立自治大学,105,海外名校,UNAM
670
+ 日内瓦大学,105,海外名校,University of Geneva
671
+ 日内瓦大学,105,海外名校,UNIGE
672
+ 圣路易斯华盛顿大学,107,海外名校,Washington University in St. Louis
673
+ 圣路易斯华盛顿大学,107,海外名校,WashU
674
+ 圣路易斯华盛顿大学,107,海外名校,WUSTL
675
+ 阿德雷德大学,,海外名校,The University of Adelaide
676
+ 阿德雷德大学,,海外名校,阿大
677
+ 阿卜杜勒阿齐兹国王大学,109,海外名校,King Abdulaziz University (KAU)
678
+ 乌得勒支大学,,海外名校,Utrecht University
679
+ 乌得勒支大学,,海外名校,UU
680
+ 蒙特利尔大学,111,海外名校,Université de Montréal
681
+ 蒙特利尔大学,111,海外名校,UdeM
682
+ 蒙特利尔大学,111,海外名校,蒙大
683
+ 阿尔托大学,112,海外名校,Aalto University
684
+ 阿尔托大学,112,海外名校,Aalto
685
+ 波士顿大学,112,海外名校,Boston University
686
+ 波士顿大学,112,海外名校,BU
687
+ 莱顿大学,112,海外名校,Leiden University
688
+ 南加州大学,11,海外名校,University of Southern California
689
+ 南加州大学,11,海外名校,南加州大学
690
+ 南加州大学,11,海外名校,南加大
691
+ 南加州大学,11,海外名校,USC
692
+ 普渡大学西拉法叶分校,116,海外名校,Purdue University
693
+ 普渡大学西拉法叶分校,116,海外名校,普渡大学
694
+ 伦敦大学玛丽女王学院,,海外名校,Queen Mary University of London
695
+ 伦敦大学玛丽女王学院,,海外名校,QMUL
696
+ 伦敦大学玛丽女王学院,,海外名校,Queen Mary
697
+ 名古屋大学,118,海外名校,Nagoya University
698
+ 名古屋大学,118,海外名校,名大
699
+ 伯尔尼大学,119,海外名校,University of Bern
700
+ 伯尔尼大学,119,海外名校,UniBe
701
+ 俄亥俄州立大学,120,海外名校,The Ohio State University
702
+ 俄亥俄州立大学,120,海外名校,Ohio State
703
+ 俄亥俄州立大学,120,海外名校,OSU
704
+ 查尔姆斯理工大学,121,海外名校,Chalmers University of Technology
705
+ 查尔姆斯理工大学,121,海外名校,Chalmers
706
+ 圣保罗大学,121,海外名校,Universidade de São Paulo
707
+ 圣保罗大学,121,海外名校,USP
708
+ 圣保罗大学,121,海外名校,圣大
709
+ 瓦格宁根大学,,海外名校,Wageningen University & Research
710
+ 瓦格宁根大学,,海外名校,Wageningen UR
711
+ 乌普萨拉大学,124,海外名校,Uppsala University
712
+ 乌普萨拉大学,124,海外名校,乌大
713
+ 埃因霍温理工大学,,海外名校,Eindhoven University of Technology
714
+ 埃因霍温理工大学,,海外名校,TU/e
715
+ 埃因霍温理工大学,,海外名校,TU Eindhoven
716
+ 柏林自由大学,127,海外名校,Freie Universitaet Berlin
717
+ 柏林自由大学,127,海外名校,FU Berlin
718
+ 柏林洪堡大学,128,海外名校,Humboldt-Universität zu Berlin
719
+ 柏林洪堡大学,128,海外名校,洪堡大学
720
+ 柏林洪堡大学,128,海外名校,柏林大学
721
+ 格罗宁根大学,128,海外名校,University of Groningen
722
+ 格罗宁根大学,128,海外名校,RuG
723
+ 里昂高等师范学院,,海外名校,École Normale Supérieure de Lyon
724
+ 里昂高等师范学院,,海外名校,ENS Lyon
725
+ 兰卡斯特大学,,海外名校,Lancaster University
726
+ 兰卡斯特大学,,海外名校,兰卡
727
+ 悉尼科技大学,133,海外名校,University of Technology Sydney
728
+ 悉尼科技大学,133,海外名校,UTS
729
+ 悉尼科技大学,133,海外名校,悉尼科大
730
+ 纽卡斯尔大学,134,海外名校,Newcastle University
731
+ 纽卡斯尔大学,134,海外名校,NCL
732
+ 纽卡斯尔大学,134,海外名校,纽大
733
+ 智利天主大学,,海外名校,Pontificia Universidad Católica de Chile (UC)
734
+ 卡尔斯鲁厄理工学院,136,海外名校,KIT
735
+ 卡尔斯鲁厄理工学院,136,海外名校,Karlsruhe Institute of Technology
736
+ 九州大学,137,海外名校,Kyushu University
737
+ 九州大学,137,海外名校,九大
738
+ 巴塞尔大学,138,海外名校,University of Basel
739
+ 巴塞尔大学,138,海外名校,Unibasel
740
+ 麦克马斯特大学,140,海外名校,McMaster University
741
+ 麦克马斯特大学,140,海外名校,麦马
742
+ 根特大学,141,海外名校,Ghent University
743
+ 根特大学,141,海外名校,UGent
744
+ 米兰理工大学,142,海外名校,Politecnico di Milano
745
+ 米兰理工大学,142,海外名校,POLIMI
746
+ 米兰理工大学,142,海外名校,米理
747
+ 米兰理工大学,142,海外名校,米兰理工
748
+ 马来西亚博特拉大学,143,海外名校,Universiti Putra Malaysia (UPM)
749
+ 马来西亚博特拉大学,143,海外名校,UPM
750
+ 马来西亚国民大学,144,海外名校,Universiti Kebangsaan Malaysia (UKM)
751
+ 马来西亚国民大学,144,海外名校,UKM
752
+ 北海道大学,145,海外名校,Hokkaido University
753
+ 北海道大学,145,海外名校,北大(ほくだい)
754
+ 马来西亚理科大学,147,海外名校,Universiti Sains Malaysia (USM)
755
+ 马来西亚理科大学,147,海外名校,USM
756
+ 马来西亚理科大学,147,海外名校,槟城理大
757
+ 斯德哥尔摩大学,148,海外名校,Stockholm University
758
+ 斯德哥尔摩大学,148,海外名校,SU
759
+ 埃克塞特大学,149,海外名校,The University of Exeter
760
+ 埃克塞特大学,149,海外名校,Exon
761
+ 滑铁卢大学,149,海外名校,University of Waterloo
762
+ 滑铁卢大学,149,海外名校,Waterloo(UW)
763
+ 卡迪夫大学,151,海外名校,Cardiff University
764
+ 卡迪夫大学,151,海外名校,卡大
765
+ 维也纳大学,151,海外名校,University of Vienna
766
+ 维也纳大学,151,海外名校,univie
767
+ 约克大学(英国),,海外名校,University of York
768
+ 约克大学(英国),,海外名校,York
769
+ 罗切斯特大学,,海外名校,University of Rochester
770
+ 罗切斯特大学,,海外名校,U of R
771
+ 奥胡斯大学,155,海外名校,Aarhus University
772
+ 奥胡斯大学,155,海外名校,AU
773
+ 汉阳大学,156,海外名校,Hanyang University
774
+ 汉阳大学,156,海外名校,汉阳大
775
+ 密歇根州立大学,157,海外名校,Michigan State University
776
+ 密歇根州立大学,157,海外名校,MSU
777
+ 马里兰大学学院公园分校,,海外名校,University of Maryland
778
+ 马里兰大学学院公园分校,,海外名校,College Park
779
+ 马里兰大学学院公园分校,,海外名校,UMD
780
+ 马里兰大学学院公园分校,,海外名校,UMCP
781
+ 柏林工业大学,159,海外名校,Technische Universität Berlin (TU Berlin)
782
+ 柏林工业大学,159,海外名校,TUB
783
+ 柏林工业大学,159,海外名校,TU Berlin
784
+ 埃默里大学,160,海外名校,Emory University
785
+ 埃默里大学,160,海外名校,Emory
786
+ 凯斯西储大学,161,海外名校,Case Western Reserve University
787
+ 凯斯西储大学,161,海外名校,CWRU
788
+ 凯斯西储大学,161,海外名校,CASE
789
+ 蒙特雷理工学院,,海外名校,Tecnológico de Monterrey
790
+ 法赫德国王石油与矿产大学,,海外名校,King Fahd University of Petroleum & Minerals
791
+ 法赫德国王石油与矿产大学,,海外名校,KFUPM
792
+ 匹兹堡大学,163,海外名校,University of Pittsburgh
793
+ 匹兹堡大学,163,海外名校,PITT
794
+ 匹兹堡大学,163,海外名校,匹大
795
+ 亚琛工业大学,165,海外名校,RWTH Aachen University
796
+ 亚琛工业大学,165,海外名校,RWTH Aachen
797
+ 亚琛工业大学,165,海外名校,RWTH
798
+ 博洛尼亚大学,166,海外名校,Alma Mater Studiorum - University of Bologna
799
+ 博洛尼亚大学,166,海外名校,Unibo
800
+ 博洛尼亚大学,166,海外名校,博大
801
+ 巴斯大学,166,海外名校,University of Bath
802
+ 德州农工大学,168,海外名校,Texas A&M University
803
+ 德州农工大学,168,海外名校,TAMU
804
+ 巴塞罗那大学,168,海外名校,Universitat de Barcelona
805
+ 西安大略大学,,海外名校,Western University
806
+ 西安大略大学,,海外名校,韦仕敦大学
807
+ 西安大略大学,,海外名校,UWO
808
+ 罗马第一大学,,海外名校,Sapienza University of Rome
809
+ 罗马第一大学,,海外名校,罗马一大
810
+ 弗莱堡大学,,海外名校,Albert-Ludwigs-Universitaet Freiburg
811
+ 都柏林大学学院,,海外名校,University College Dublin
812
+ 都柏林大学学院,,海外名校,UCD
813
+ 都柏林大学学院,,海外名校,UC Dublin
814
+ 佛罗里达大学,173,海外名校,University of Florida
815
+ 佛罗里达大学,173,海外名校,UF
816
+ 佛罗里达大学,173,海外名校,UFL
817
+ 国立哈萨克大学,,海外名校,Al-Farabi Kazakh National University
818
+ 洛桑大学,176,海外名校,University of Lausanne
819
+ 蒂宾根大学,177,海外名校,Eberhard Karls Universität Tübingen
820
+ 印度理工学院孟买分校,,海外名校,Indian Institute of Technology Bombay
821
+ 印度理工学院孟买分校,,海外名校,IITB
822
+ 伊拉斯姆斯大学,,海外名校,Erasmus University Rotterdam
823
+ 国立清华大学,180,海外名校,National Tsing Hua University
824
+ 维也纳技术大学,,海外名校,Technische Universität Wien
825
+ 哥德堡大学,180,海外名校,University of Gothenburg
826
+ 哈里发大学,,海外名校,Khalifa University
827
+ 智利大学,183,海外名校,Universidad de Chile
828
+ 印度理工学院德里分校,,海外名校,Indian Institute of Technology Delhi
829
+ 印度理工学院德里分校,,海外名校,IITD
830
+ 印度科学学院,,海外名校,Indian Institute of Science
831
+ 明尼苏达大学双城分校,186,海外名校,University of Minnesota Twin Cities
832
+ 鲁汶大学(法语),,海外名校,Université catholique de Louvain
833
+ 鲁汶大学(法语),,海外名校,UCLouvain
834
+ 利物浦大学,189,海外名校,University of Liverpool
835
+ 特文特大学,,海外名校,University of Twente
836
+ 达特茅斯学院,191,海外名校,Dartmouth College
837
+ 马来西亚理工大学,,海外名校,Universiti Teknologi Malaysia
838
+ 卧龙岗大学,193,海外名校,University of Wollongong
839
+ 科廷大学,194,海外名校,Curtin University
840
+ 德累斯顿工业大学,194,海外名校,Technische Universität Dresden
841
+ 奥塔戈大学,,海外名校,University of Otago
842
+ 纽卡斯尔大学(澳洲),,海外名校,The University of Newcastle
843
+ 纽卡斯尔大学(澳洲),,海外名校,Australia
844
+ 纽卡斯尔大学(澳洲),,海外名校,UON
845
+ 耶路撒冷希伯来大学,198,海外名校,The Hebrew University of Jerusalem
846
+ 卑尔根大学,199,海外名校,University of Bergen
847
+ 麦考瑞大学,200,海外名校,Macquarie University
848
+ 加州大学伯克利分校,32,海外名校,"University of California, Berkeley"
849
+ 加州大学伯克利分校,32,海外名校,UC Berkeley
850
+ 加州大学伯克利分校,32,海外名校,Cal
851
+ 加州大学圣地亚哥分校,48,海外名校,"University of California, San Diego"
852
+ 加州大学圣地亚哥分校,48,海外名校,UCSD
853
+ 加州大学圣地亚哥分校,48,海外名校,UC San Diego
854
+ 加州大学戴维斯分校,138,海外名校,"University of California, Davis"
855
+ 加州大学戴维斯分校,138,海外名校,UC Davis
856
+ 加州大学圣塔芭芭拉分校,146,海外名校,"University of California, Santa Barbara"
857
+ 加州大学圣塔芭芭拉分校,146,海外名校,UCSB
858
+ 加州大学圣塔芭芭拉分校,146,海外名校,UC Santa Barbara
859
+ 上海体育学院,,双一流,SUS
860
+ 密歇根大学-安娜堡,23,海外名校,University of Michigan - Ann Arbor
861
+ 伦敦国王学院,35,海外名校,King’s College London
862
+ 加州大学洛杉矶分校,40,海外名校,"University of California, Los Angeles"
863
+ 韩国科学技术院,41,海外名校,KAIST - Korea Advanced Institute of Science & Technology
864
+ 巴黎文理研究大学,44,海外名校,PSL University
865
+ 德克萨斯州大学奥斯汀分校,67,海外名校,The University of Texas at Austin
866
+ 威斯康星大学-麦迪逊,75,海外名校,University of Wisconsin - Madison
867
+ 宾夕法尼亚州立大学,96,海外名校,Penn State (Main campus)
868
+ 里昂高等师范学校,130,海外名校,Ecole Normale Superieure - Lyon
869
+ 智利天主教大学,135,海外名校,Pontifical Catholic University of Chile
870
+ 约克大学,494,海外名校,York University
871
+ 马里兰大学帕克分校,158,海外名校,"University of Maryland, College Park"
872
+ 蒙特雷技术学院,161,海外名校,Tecnologico de Monterrey
873
+ 天主教鲁汶大学(法语区),188,海外名校,Université Catholique de Louvain
874
+ 澳大利亚纽卡斯尔大学,197,海外名校,"The University of Newcastle, Australia"
875
+ 庆应义塾大学,201,海外名校,Keio University
876
+ 雷丁大学,202,海外名校,University of Reading
877
+ 早稻田大学,203,海外名校,Waseda University
878
+ 哥廷根大学,204,海外名校,University of Göttingen
879
+ 阿伯丁大学,205,海外名校,University of Aberdeen
880
+ 墨尔本皇家理工大学,206,海外名校,RMIT University
881
+ 马德里自治大学,207,海外名校,Autonomous University of Madrid
882
+ 布鲁塞尔自由大学,207,海外名校,Universite libre de Bruxelles
883
+ 阿姆斯特丹自由大学,209,海外名校,Vrije Universiteit Amsterdam
884
+ 巴塞罗那自治大学,209,海外名校,Autonomous University of Barcelona
885
+ 蔚山国立科学技术研究所,212,海外名校,Ulsan National Institute of Science & Technology
886
+ 昆士兰科技大学,213,海外名校,Queensland University of Technology
887
+ 汉堡大学,214,海外名校,Universität Hamburg
888
+ 朱拉隆功大学,215,海外名校,Chulalongkorn University
889
+ 亚利桑那州立大学,216,海外名校,Arizona State University
890
+ 贝尔法斯特女王大学,216,海外名校,Queen’s University Belfast
891
+ 范德堡大学,218,海外名校,Vanderbilt University
892
+ 坎皮纳斯州立大学,219,海外名校,State University of Campinas
893
+ 拉德堡德大学,220,海外名校,Radboud University
894
+ 布鲁塞尔大学,221,海外名校,Vrije Universiteit Brussel
895
+ 圣母大学,701,海外名校,Notre Dame University-Louaize NDU
896
+ 马德里康普顿斯大学,223,海外名校,Complutense University of Madrid
897
+ 卡塔尔大学,224,海外名校,Qatar University
898
+ 波恩大学,226,海外名校,Rheinische Friedrich-Wilhelms-Universität Bonn
899
+ 开普敦大学,226,海外名校,University of Cape Town
900
+ 萨塞克斯大学,226,海外名校,University of Sussex
901
+ 弗吉尼亚大学,226,海外名校,University of Virginia
902
+ 渥太华大学,230,海外名校,University of Ottawa
903
+ 拉夫堡大学,231,海外名校,Loughborough University
904
+ 加州大学欧文分校,232,海外名校,"University of California, Irvine"
905
+ 安特卫普大学,233,海外名校,University of Antwerp
906
+ 马斯特里赫特大学,233,海外名校,Maastricht University
907
+ 卡尔加里大学,235,海外名校,University of Calgary
908
+ 安德斯大学,236,海外名校,Universidad de los Andes
909
+ 莱斯特大学,236,海外名校,University of Leicester
910
+ 维多利亚大学,601,海外名校,Victoria University
911
+ 金斯敦女王大学,240,海外名校,Queen's University at Kingston
912
+ 瑞士提契诺大学,240,海外名校,USI - Università della Svizzera italiana
913
+ 贝鲁特美国大学,242,海外名校,American University of Beirut
914
+ 帕多瓦大学,242,海外名校,Università di Padova
915
+ 圣彼得堡国立大学,242,海外名校,Saint Petersburg State University
916
+ 巴黎高科桥梁学院,245,海外名校,Ecole des Ponts ParisTech
917
+ 新西伯利亚州立大学,246,海外名校,Novosibirsk State University
918
+ 马萨诸塞大学安姆斯特分校,246,海外名校,University of Massachusetts
919
+ 乔治敦大学,248,海外名校,Georgetown University
920
+ 庞培法布拉大学,248,海外名校,Pompeu Fabra University
921
+ 文莱达鲁萨兰国大学,250,海外名校,Universiti Brunei Darussalam (UBD)
922
+ 科罗拉多大学-玻尔得,251,海外名校,University of Colorado at Boulder
923
+ 国立成功大学,252,海外名校,National Cheng Kung University
924
+ 叶史瓦大学,252,海外名校,Yeshiva University
925
+ 加查马达大学,254,海外名校,Gadjah Mada University
926
+ 印度马德拉斯技术学院,255,海外名校,Indian Institute of Technology Madras
927
+ 玛希隆大学,255,海外名校,Mahidol University
928
+ 特拉维夫大学,255,海外名校,Tel Aviv University
929
+ 哥伦比亚国立大学,258,海外名校,National University of Colombia
930
+ 爱尔兰国立大学戈尔韦分校,258,海外名校,National University of Ireland Galway
931
+ 坎特伯雷大学,258,海外名校,University of Canterbury
932
+ 科学宝,261,海外名校,Sciences Po
933
+ 巴黎大学,261,海外名校,Université de Paris
934
+ 庆熙大学,264,海外名校,Kyung Hee University
935
+ 罗格斯大学,264,海外名校,Rutgers University–New Brunswick
936
+ 纳瓦拉大学,266,海外名校,University of Navarra
937
+ 布拉格查理大学,266,海外名校,Charles University
938
+ 国立阳明大学,268,海外名校,National Yang Ming Chiao Tung University
939
+ 亚利桑那大学,268,海外名校,The University of Arizona
940
+ 达姆施塔特工业大学,269,海外名校,Technical University Darmstadt
941
+ 赫瑞瓦特大学,270,海外名校,Heriot Watt University
942
+ 达尔豪斯大学,272,海外名校,Dalhousie University
943
+ 托木斯克州立大学,272,海外名校,Tomsk State University
944
+ 萨里大学,272,海外名校,University of Surrey
945
+ 南方科技大学,275,海外名校,Southern University of Science & Technology
946
+ 塔夫茨大学,275,海外名校,Tufts University
947
+ 格拉茨科技大学,277,海外名校,Graz University of Technology
948
+ 印度理工学院坎普尔,277,海外名校,Indian Institute of Technology Kanpur
949
+ 沙特国王大学,277,海外名校,King Saud University
950
+ 印度理工学院卡拉格普尔分校,280,海外名校,Indian Institute of Technology Kharagpur
951
+ 鲍曼莫斯科国立技术大学,281,海外名校,Bauman Moscow State Technical University
952
+ 因斯布鲁克大学,281,海外名校,University of Innsbruck
953
+ 迪肯大学,283,海外名校,Deakin University
954
+ 梅西大学,284,海外名校,Massey University
955
+ 伊利诺伊大学芝加哥分校,285,海外名校,University of Illinois at Chicago
956
+ 筑波大学,285,海外名校,University of Tsukuba
957
+ 香港浸会大学,287,海外名校,Hong Kong Baptist University
958
+ 阿联酋大学,288,海外名校,United Arab Emirates University
959
+ 印度尼西亚大学,290,海外名校,University of Indonesia
960
+ 格里菲斯大学,290,海外名校,Griffith University
961
+ 莫斯科物理科学与技术学院,290,海外名校,Moscow Institute of Physics and Technology
962
+ 巴黎第一大学,290,海外名校,Panthéon-Sorbonne University – Paris 1
963
+ 白俄罗斯国立大学,295,海外名校,Belarusian State University
964
+ 波尔图大学,295,海外名校,University of Porto
965
+ 图尔库大学,295,海外名校,University of Turku
966
+ 科克大学,298,海外名校,University College Cork
967
+ 西蒙弗雷泽大学,298,海外名校,Simon Fraser University
968
+ 北卡罗来纳州立大学,300,海外名校,North Carolina State University
969
+ 塔尔图大学,300,海外名校,University of Tartu
970
+ 斯特拉斯克莱德大学,302,海外名校,University of Strathclyde
971
+ 塔斯马尼亚大学,303,海外名校,University of Tasmania
972
+ 万隆理工学院(ITB),303,海外名校,Bandung Institute of Technology (ITB)
973
+ 光州科学技术学院,305,海外名校,Gwangju Institute of Science and Technology
974
+ 俄罗斯国立高等经济大学,305,海外名校,HSE University
975
+ 东英吉利大学,307,海外名校,University of East Anglia
976
+ 华沙大学,308,海外名校,University of Warsaw
977
+ 南丹麦大学,309,海外名校,University of Southern Denmark
978
+ 雅盖隆大学,309,海外名校,Jagiellonian University
979
+ 印第安纳大学伯明顿分校,311,海外名校,Indiana University
980
+ 科隆大学,311,海外名校,University of Cologne
981
+ 迈阿密大学,311,海外名校,University of Miami
982
+ 格勒诺布尔阿尔卑斯大学,314,海外名校,Grenoble Alpes University
983
+ 国立台湾科技大学,314,海外名校,National Taiwan University of Science and Technology
984
+ 米兰大学,316,海外名校,University of Milan
985
+ 俄罗斯人民友谊大学,317,海外名校,RUDN University
986
+ IE大学,317,海外名校,IE University
987
+ 国立核研究大学-莫斯科工程物理学院,319,海外名校,National Research Nuclear University MEPhI
988
+ 加泰罗尼亚理工大学,319,海外名校,Polytechnic University of Catalonia
989
+ 斯威本科技大学,321,海外名校,Swinburne University of Technology
990
+ 阿根廷天主教大学,322,海外名校,Pontificia Universidad Católica Argentina
991
+ 邓迪大学,322,海外名校,University of Dundee
992
+ 澳门大学,322,海外名校,University of Macau
993
+ 埃尔朗根-纽伦堡大学,322,海外名校,Friedrich-Alexander-Universität Erlangen-Nürnberg
994
+ 奥尔堡大学,326,海外名校,Aalborg University
995
+ 南澳大利亚大学,326,海外名校,University of South Australia
996
+ 国立欧亚大学,328,海外名校,L.N. Gumilyov Eurasian National University
997
+ 林雪平大学,329,海外名校,Linkoping University
998
+ 以色列理工学院,330,海外名校,Technion - Israel Institute of Technology
999
+ 伦敦大学城市学院,330,海外名校,City University London
1000
+ 伦敦大学伯贝克学院,332,海外名校,Birkbeck University London
1001
+ 圣光机大学,365,海外名校,ITMO University
1002
+ 国立台湾师范大学,334,海外名校,National Taiwan Normal University
1003
+ 都灵理工大学,334,海外名校,Politecnico di Torino
1004
+ 伦敦大学皇家霍洛威学院,334,海外名校,Royal Holloway University of London
1005
+ 法兰克福大学,340,海外名校,Goethe University Frankfurt
1006
+ 夏威夷大学马诺阿分校,340,海外名校,University of Hawai’i at Mānoa
1007
+ 广岛大学,343,海外名校,Hiroshima University
1008
+ 耶拿大学,344,海外名校,Universität Jena
1009
+ 文莱大学,344,海外名校,Universiti Teknologi Brunei
1010
+ 弗吉尼亚理工,346,海外名校,Virginia Polytechnic Institute and State University
1011
+ 斯图加特大学,347,海外名校,Universität Stuttgart
1012
+ 加州大学圣克鲁兹分校,347,海外名校,"University of California, Santa Cruz"
1013
+ 喀山联邦大学,347,海外名校,Kazan Federal University
1014
+ 斯特雅大学,347,海外名校,UCSI University
1015
+ 马德里卡洛斯三世大学,351,海外名校,Universidad Carlos III de Madrid
1016
+ 乌拉尔联邦大学,351,海外名校,Ural Federal University
1017
+ 伦敦布鲁内尔大学,351,海外名校,Brunel University London
1018
+ 约翰内斯开普勒大学林茨,354,海外名校,Johannes Kepler University Linz
1019
+ 乔治华盛顿大学,355,海外名校,George Washington University
1020
+ 里斯本大学,356,海外名校,University of Lisbon
1021
+ 蒂尔堡大学,356,海外名校,Tilburg University
1022
+ 国立科技大学-巴基斯坦,358,海外名校,National University of Sciences & Technology - Pakistan
1023
+ 于韦斯屈莱大学,358,海外名校,University of Jyvaskyla
1024
+ 犹他大学,358,海外名校,University of Utah
1025
+ 拉筹伯大学,362,海外名校,La Trobe University
1026
+ 莫斯科国立国际关系学院,362,海外名校,MGIMO University
1027
+ 梨花女子大学,362,海外名校,Ewha Womans University
1028
+ 乌尔姆大学,365,海外名校,Ulm University
1029
+ 于默奥大学,365,海外名校,Umea University
1030
+ 苏丹卡布斯大学,368,海外名校,Sultan Qaboos University
1031
+ 挪威科技大学,369,海外名校,Norwegian University of Science and Technology
1032
+ 里约热内卢联邦大学,369,海外名校,Federal University of Rio de Janeiro
1033
+ 瓦伦西亚理工大学,371,海外名校,Polytechnic University of Valencia
1034
+ 林肯大学,801,海外名校,University of Lincoln
1035
+ 布拉格化工大学,373,海外名校,"University of Chemistry & Technology, Prague"
1036
+ 康涅狄格大学,373,海外名校,University of Connecticut
1037
+ 怀卡托大学,373,海外名校,University of Waikato
1038
+ 奥卢大学,377,海外名校,University of Oulu
1039
+ 维尔茨堡大学,378,海外名校,Julius-Maximilians-Universität Würzburg
1040
+ 真纳大学,378,海外名校,Quaid I Azam University
1041
+ 纽约州立大学石溪分校,378,海外名校,Stony Brook University
1042
+ 谢里夫科技大学,381,海外名校,Sharif University of Technology
1043
+ 东京医科齿科大学,381,海外名校,Tokyo Medical and Dental University
1044
+ 美国沙迦大学,383,海外名校,American University of Sharjah
1045
+ 肯特大学,383,海外名校,University of Kent
1046
+ 神户大学,386,海外名校,Kobe University
1047
+ 堪萨斯大学,387,海外名校,University of Kansas
1048
+ 纽约州立大学布法罗分校,388,海外名校,University at Buffalo
1049
+ 比萨大学,388,海外名校,University of Pisa
1050
+ 圣拉斐尔生命健康大学,390,海外名校,Università Vita-Salute San Raffaele
1051
+ 伦敦大学亚非学院,391,海外名校,SOAS University of London
1052
+ 巴勒莫大学,801,海外名校,University of Palermo
1053
+ 彼得大帝圣彼得堡理工大学,393,海外名校,Peter the Great St Petersburg Polytechnic University
1054
+ 波鸿大学,393,海外名校,University of Bochum
1055
+ 国立研究托木斯克理工大学,395,海外名校,National Research Tomsk Polytechnic University
1056
+ 秘鲁天主教大学,395,海外名校,Pontifical Catholic University of Peru
1057
+ 印度理工学院古瓦哈提,395,海外名校,Indian Institute of Technology Guwahati
1058
+ 巴基斯坦工程与应用科学研究所,398,海外名校,Pakistan Institute of Engineering and Applied Sciences (PIEAS)
1059
+ 菲律宾大学,399,海外名校,University of the Philippines
1060
+ 南方大学,400,海外名校,Universidad Austral
1061
+ 印度理工学院鲁尔基校区,400,海外名校,Indian Institute of Technology Roorkee
1062
+ 维尔纽斯大学,400,海外名校,Vilnius University
1063
+ 加州大学河滨分校,403,海外名校,"University of California, Riverside"
1064
+ 布拉格捷克技术大学,403,海外名校,Czech Technical University in Prague
1065
+ 弗林德斯大学,407,海外名校,Flinders University
1066
+ 台北医科大学,407,海外名校,Taipei Medical University
1067
+ 明斯特大学,411,海外名校,University of Muenster
1068
+ 国立中山大学,412,海外名校,National Sun Yat-Sen University
1069
+ 哈韦里亚纳大学,412,海外名校,Pontificia Universidad Javeriana
1070
+ 拉彭兰塔理工大学,414,海外名校,Lappeenranta University of Technology
1071
+ 拉瓦尔大学,414,海外名校,Université Laval
1072
+ 国油大学,414,海外名校,Universiti Teknologi Petronas
1073
+ 科罗拉多大学-丹佛,414,海外名校,University of Colorado at Denver
1074
+ 坦佩雷大学,414,海外名校,University of Tampere
1075
+ 中央大学,414,海外名校,Chung-Ang University
1076
+ 斯特拉斯堡大学,421,海外名校,Université de Strasbourg
1077
+ 雅典国立技术大学,421,海外名校,National Technical University of Athens
1078
+ 曼海姆大学,423,海外名校,Universität Mannheim
1079
+ 那不勒斯菲里德里克第二大学,424,海外名校,University of Naples - Federico II
1080
+ 威特沃特斯兰德大学,424,海外名校,University of the Witwatersrand
1081
+ 詹姆斯库克大学,424,海外名校,James Cook University
1082
+ 约翰内斯古腾堡美因茨大学,427,海外名校,Johannes Gutenberg University of Mainz
1083
+ 华盛顿州立大学,427,海外名校,Washington State University
1084
+ 维克森林大学,429,海外名校,Wake Forest University
1085
+ 牛津布鲁克斯大学,429,海外名校,Oxford Brookes University
1086
+ 新里斯本大学,431,海外名校,NOVA University Lisbon
1087
+ 伦斯勒理工学院,431,海外名校,Rensselaer Polytechnic Institute
1088
+ 科罗拉多州立大学,431,海外名校,Colorado State University
1089
+ 圣保罗联邦大学,434,海外名校,Federal University of Sao Paulo
1090
+ 约翰内斯堡大学,434,海外名校,University of Johannesburg
1091
+ 堪培拉大学,436,海外名校,University of Canberra
1092
+ 杜兰大学,436,海外名校,Tulane University
1093
+ 埃塞克斯大学,439,海外名校,University of Essex
1094
+ 塞浦路斯大学,440,海外名校,University of Cyprus
1095
+ 特伦托大学,440,海外名校,University of Trento
1096
+ 特罗姆瑟大学挪威北极大学,440,海外名校,University of Tromsø The Arctic University of Norway
1097
+ 斯旺西大学,440,海外名校,Swansea University
1098
+ 伊利诺伊理工学院,444,海外名校,Illinois Institute of Technology
1099
+ 韩国外国语大学,445,海外名校,HUFS - Hankuk (Korea) University of Foreign Studies
1100
+ 开罗美国大学,445,海外名校,The American University in Cairo
1101
+ 萨尔大学,447,海外名校,Saarland University
1102
+ 乌姆古拉大学,447,海外名校,Umm Al-Qura University
1103
+ 莱比锡大学,447,海外名校,Universität Leipzig
1104
+ 米兰 - 比可卡大学,450,海外名校,University of Milan-Bicocca
1105
+ 佛罗伦萨大学,451,海外名校,University of Florence
1106
+ 奥克兰理工大学,451,海外名校,Auckland University of Technology
1107
+ 邦德大学,451,海外名校,Bond University
1108
+ 科英布拉大学,455,海外名校,University of Coimbra
1109
+ 爱荷华大学,455,海外名校,University of Iowa
1110
+ 布兰迪斯大学,455,海外名校,Brandeis University
1111
+ 萨斯喀彻温大学,458,海外名校,University of Saskatchewan
1112
+ 马德里理工大学,459,海外名校,Universidad Politécnica de Madrid
1113
+ 圣加仑大学,459,海外名校,University of St Gallen
1114
+ 马丁·路德·哈勒-威登堡大学,461,海外名校,Martin Luther University of Halle-Wittenberg
1115
+ 科罗拉多矿业大学,461,海外名校,Colorado School of Mines
1116
+ 远东联邦大学,461,海外名校,Far Eastern Federal University
1117
+ 伦敦大学金史密斯学院,461,海外名校,Goldsmiths University London
1118
+ 爱兰加大学,465,海外名校,Airlangga University
1119
+ 阿米尔卡比尔理工大学,465,海外名校,Amirkabir University of Technology
1120
+ 贝尔格拉诺大学,465,海外名校,Universidad de Belgrano
1121
+ 国立台北科技大学,469,海外名校,National Taipei University of Technology
1122
+ 乌拉圭蒙得维的亚大学,469,海外名校,Universidad de Montevideo (UM)
1123
+ 乌拉圭大学,471,海外名校,Universidad ORT Uruguay
1124
+ 斯特灵大学,471,海外名校,University of Stirling
1125
+ 内盖夫本古里安大学,471,海外名校,Ben-Gurion University of the Negev
1126
+ 哈瑟尔特大学,471,海外名校,Hasselt University
1127
+ 佛罗里达州立大学,475,海外名校,Florida State University
1128
+ 密苏里大学,476,海外名校,Mizzou - University of Missouri
1129
+ 德克萨斯大学-达拉斯,477,海外名校,The University of Texas at Dallas
1130
+ 巴伊兰大学,477,海外名校,Bar-Ilan University
1131
+ 千叶大学,477,海外名校,Chiba University
1132
+ 长庚大学,480,海外名校,Chang Gung University
1133
+ 列日大学,480,海外名校,University of Liege
1134
+ 南哈萨克斯坦大学,482,海外名校,Auezov South Kazakhstan University (SKU)
1135
+ 斯泰伦博斯大学,482,海外名校,Stellenbosch University
1136
+ 韩国天主教大学,482,海外名校,The Catholic University of Korea
1137
+ 都灵大学,485,海外名校,University of Turin
1138
+ 阿斯顿大学,485,海外名校,Aston University
1139
+ 俄罗斯国立科技大学,487,海外名校,The National University of Science and Technology MISIS
1140
+ 横滨市立大学,487,海外名校,Yokohama City University
1141
+ 智利圣地亚哥大学,487,海外名校,Universidad de Santiago de Chile
1142
+ 都柏林城市大学,490,海外名校,Dublin City University
1143
+ 贾森·利比希大学,490,海外名校,Justus Liebig University Giessen
1144
+ 圣保罗州立大学,492,海外名校,UNESP
1145
+ 格拉纳达大学,492,海外名校,University of Granada
1146
+ 罗马第二大学,494,海外名校,University of Roma - Tor Vergata
1147
+ 西悉尼大学,494,海外名校,University of Western Sydney
1148
+ 波士顿学院,494,海外名校,Boston College
1149
+ 基尔大学,751,海外名校,Keele University
1150
+ 东国大学,494,海外名校,Dongguk University
1151
+ 爱荷华州立大学,494,海外名校,Iowa State University
1152
+ 西江大学,494,海外名校,Sogang University
1153
+ 华沙工业大学,501,海外名校,Warsaw University of Technology
1154
+ 艾克斯 - 马赛大学,501,海外名校,Aix-Marseille University
1155
+ 哈瓦那大学,501,海外名校,Universidad de La Habana
1156
+ 康斯坦茨大学,501,海外名校,Universität Konstanz
1157
+ 波尔多大学,501,海外名校,Universite de Bordeaux
1158
+ 德里大学,501,海外名校,University of Delhi
1159
+ 克拉根福大学,501,海外名校,University of Klagenfurt
1160
+ 利默里克大学,501,海外名校,University of Limerick
1161
+ 萨特巴耶夫大学,501,海外名校,Satbayev University
1162
+ 密苏里科技大学,511,海外名校,Missouri University of Science & Technology
1163
+ 新加坡管理大学,511,海外名校,Singapore Management University
1164
+ 茂物农业大学,511,海外名校,IPB University
1165
+ 科奇大学,511,海外名校,Koç University
1166
+ 哈尔科夫大学,511,海外名校,V. N. Karazin Kharkiv National University
1167
+ 韦恩州立大学,511,海外名校,Wayne State University
1168
+ 阿尔卡拉大学,511,海外名校,Universidad de Alcala
1169
+ 马来西亚北方大学,511,海外名校,Universiti Utara Malaysia
1170
+ 巴拉曼德大学,511,海外名校,University of Balamand
1171
+ 中央大学“阿伯勒玛塔”德拉斯别墅,521,海外名校,"Universidad Central ""Marta Abreu"" de Las Villas"
1172
+ 拜罗伊特大学,521,海外名校,University of Bayreuth
1173
+ 东芬兰大学,521,海外名校,University of Eastern Finland
1174
+ 德黑兰大学,521,海外名校,University of Tehran
1175
+ 康考迪亚大学,521,海外名校,Concordia University
1176
+ 阿卜杜勒拉曼费萨尔大学,521,海外名校,Imam Abdulrahman Bin Faisal University
1177
+ 蒙彼利埃大学,521,海外名校,Montpellier University
1178
+ 国立中央大学,521,海外名校,National Central University
1179
+ 理海大学,531,海外名校,Lehigh University
1180
+ 长崎大学,531,海外名校,Nagasaki University
1181
+ 国家研究萨拉托夫州立大学,531,海外名校,National Research Saratov State University
1182
+ 俄勒冈州立大学,531,海外名校,Oregon State University
1183
+ 贝鲁特圣约瑟夫大学,531,海外名校,Saint Joseph University of Beirut (USJ)
1184
+ 南方联邦大学,531,海外名校,Southern Federal University
1185
+ 韩国亚洲大学,531,海外名校,Ajou University
1186
+ 一桥大学,531,海外名校,Hitotsubashi University
1187
+ 里昂国家科学研究院,531,海外名校,Institut National des Sciences Appliquées de Lyon (INSA)
1188
+ 哥斯达黎加大学,531,海外名校,Universidad de Costa Rica
1189
+ 萨拉戈萨大学,531,海外名校,Universidad de Zaragoza
1190
+ 特拉华大学,531,海外名校,University of Delaware
1191
+ 蔚山大学,541,海外名校,University of Ulsan
1192
+ 阿伯里斯特威斯大学,541,海外名校,Aberystwyth University
1193
+ 伊朗科技大学,541,海外名校,Iran University of Science & Technology
1194
+ 庆北国立大学,541,海外名校,Kyungpook National University
1195
+ 新泻大学,541,海外名校,Niigata University
1196
+ 萨班哲大学,541,海外名校,Sabanci University
1197
+ 佐治亚大学,541,海外名校,The University of Georgia
1198
+ 田纳西大学,541,海外名校,"The University of Tennessee, Knoxville"
1199
+ 马萨里克大学,551,海外名校,Masaryk University
1200
+ 中东技术大学,551,海外名校,Middle East Technical University
1201
+ 图卢兹第三大学,551,海外名校,Paul Sabatier University (Toulouse 3)
1202
+ 阿拜哈萨克斯坦国立师范大学,551,海外名校,Abai Kazakh National Pedagogical University
1203
+ 塞萨洛尼基亚里士多德大学,551,海外名校,Aristotle University of Thessaloniki
1204
+ 哈萨克国立农业大学,551,海外名校,Kazakh National Agrarian University KazNAU
1205
+ 塞维利亚大学,551,海外名校,Universidad de Sevilla
1206
+ 泛美大学,551,海外名校,Universidad Panamericana (UP)
1207
+ 米兰圣心天主教大学,551,海外名校,Università Cattolica del Sacro Cuore
1208
+ 塞格德大学,551,海外名校,University of Szeged
1209
+ 帕维亚大学研究学院,561,海外名校,Università degli Studi di Pavia
1210
+ 弗里堡大学,561,海外名校,Université de Fribourg
1211
+ 阿尔泰州立大学,561,海外名校,Altai State University
1212
+ 哈林大学,561,海外名校,Hallym University
1213
+ 仁荷大学,561,海外名校,Inha University
1214
+ 贾瓦哈拉尔尼赫鲁大学,561,海外名校,Jawaharlal Nehru University
1215
+ 布伦瑞克工业大学,561,海外名校,Technical University of Braunschweig
1216
+ 新学院大学,561,海外名校,The New School
1217
+ 大阪市立大学,571,海外名校,Osaka City University
1218
+ 开罗大学,571,海外名校,Cairo University
1219
+ 卡斯里克圣灵大学,571,海外名校,Holy Spirit University of Kaslik
1220
+ 伊万第比利斯国立大学,571,海外名校,Ivane Javakhishvili Tbilisi State University
1221
+ 全北国立大学,571,海外名校,Jeonbuk National University
1222
+ 不来梅大学,571,海外名校,Universität Bremen
1223
+ 瓦伦西亚大学,571,海外名校,Universitat de Valencia
1224
+ 米尼奥大学,571,海外名校,University of Minho
1225
+ 内布拉斯加大学-林肯,571,海外名校,University of Nebraska - Lincoln
1226
+ 拉普拉塔国立大学,581,海外名校,Universidad Nacional de La Plata (UNLP)
1227
+ 圭尔夫大学,581,海外名校,University of Guelph
1228
+ 南佛罗里达大学,581,海外名校,University of South Florida
1229
+ 黎巴嫩美国大学,581,海外名校,Lebanese American University
1230
+ 香港岭南大学,581,海外名校,"Lingnan University, Hong Kong"
1231
+ 默多克大学,581,海外名校,Murdoch University
1232
+ 冈山大学,581,海外名校,Okayama University
1233
+ 萨马拉国立研究大学(Samara University),581,海外名校,Samara National Research University (Samara University)
1234
+ 国立政治大学,591,海外名校,National Chengchi University
1235
+ Savitribai普鲁大学,591,海外名校,Savitribai Phule Pune University
1236
+ 索非亚大学,591,海外名校,Sofia University
1237
+ 哥伦比亚外事大学,591,海外名校,Universidad Externado de Colombia
1238
+ 雷根斯堡大学,591,海外名校,Universität Regensburg
1239
+ 魁北克大学,591,海外名校,Université du Québec
1240
+ 德布勒森大学,591,海外名校,University of Debrecen
1241
+ 卢布尔雅那大学,591,海外名校,University of Ljubljana
1242
+ 巴林应用科学大学,591,海外名校,Applied Science University - Bahrain
1243
+ 比尔肯特大学,591,海外名校,Bilkent University
1244
+ 印度理工学院海德拉巴,591,海外名校,Indian Institute of Technology Hyderabad
1245
+ 熊本大学,591,海外名校,Kumamoto University
1246
+ 阿博大学,601,海外名校,Abo Akademi University
1247
+ 迪拜美国大学,601,海外名校,American University in Dubai
1248
+ 马尼拉雅典大学,601,海外名校,Ateneo de Manila University
1249
+ 班戈大学,601,海外名校,Bangor University
1250
+ 迪拜加拿大大学,601,海外名校,Canadian University Dubai
1251
+ 卡尔顿大学,601,海外名校,Carleton University
1252
+ 中央昆士兰大学,601,海外名校,Central Queensland University
1253
+ 清迈大学,601,海外名校,Chiang Mai University
1254
+ 克拉克大学,601,海外名校,Clark University
1255
+ 科米利亚斯宗座大学,601,海外名校,Comillas Pontifical University
1256
+ 考文垂大学,601,海外名校,Coventry University
1257
+ 岐阜大学,601,海外名校,Gifu University
1258
+ 金泽大学,601,海外名校,Kanazawa University
1259
+ 金斯顿大学,601,海外名校,Kingston University
1260
+ 建国大学,601,海外名校,Konkuk University
1261
+ 墨西哥阿那瓦克大学,601,海外名校,Universidad Anáhuac México
1262
+ 康塞普西翁大学,601,海外名校,Universidad de Concepción
1263
+ 布宜诺斯艾利斯省中央大学 (UNICEN),601,海外名校,Universidad Nacional del Centro de la Provincia de Buenos Aires (UNICEN)
1264
+ 拉蒙鲁尔大学,601,海外名校,Universitat Ramon Llull
1265
+ 霍恩海姆大学,601,海外名校,University Hohenheim
1266
+ 阿威罗大学,601,海外名校,University of Aveiro
1267
+ 辛辛那提大学,601,海外名校,University of Cincinnati
1268
+ 热那亚大学,601,海外名校,University of Genoa
1269
+ 曼尼托巴大学,601,海外名校,University of Manitoba
1270
+ 普利茅斯大学,601,海外名校,University of Plymouth
1271
+ 比勒陀利亚大学,601,海外名校,University of Pretoria
1272
+ 沙迦大学,601,海外名校,University of Sharjah
1273
+ 锡耶纳大学,601,海外名校,University of Siena
1274
+ 南卡罗来纳大学,601,海外名校,University of South Carolina
1275
+ 汉诺威莱布尼兹大学,601,海外名校,Leibniz University Hannover
1276
+ 管理科学大学,601,海外名校,Management and Science University
1277
+ 南京科技大学,601,海外名校,Nanjing University of Science & Technology
1278
+ 帕拉茨基大学,601,海外名校,Palacky University Olomouc
1279
+ Pavol JozefŠafárikUniversity位于科希策,601,海外名校,Pavol Jozef Šafárik University in Košice
1280
+ 釜山国立大学,601,海外名校,Pusan National University
1281
+ S.D.阿斯芬迪亚罗夫哈萨克国立医科大学,601,海外名校,S.D. Asfendiyarov Kazakh National Medical University
1282
+ 世宗大学,601,海外名校,Sejong University
1283
+ 深圳大学,601,海外名校,Shenzhen University
1284
+ 史密斯学院,601,海外名校,Smith College
1285
+ 基辅国立大学,601,海外名校,Taras Shevchenko National University Kiev
1286
+ 法政大学,601,海外名校,Thammasat University
1287
+ 约旦大学,601,海外名校,The University of Jordan
1288
+ 东京农工大学,601,海外名校,Tokyo University of Agriculture and Technology
1289
+ 巴黎第二大学,601,海外名校,University Paris 2 Panthéon-Assas
1290
+ 扎耶德大学,651,海外名校,Zayed University
1291
+ 安蒂奥基亚大学,651,海外名校,Universidad de Antioquia
1292
+ ICESI大学,651,海外名校,Universidad ICESI
1293
+ 秘鲁秘鲁卡耶塔诺大学,651,海外名校,Universidad Peruana Cayetano Heredia
1294
+ 玻利瓦尔纳天主教大学,651,海外名校,Universidad Pontificia Bolivariana
1295
+ 罗斯托克大学,651,海外名校,Universität Rostock
1296
+ 克劳德·伯纳德·里昂大学,651,海外名校,Université Claude Bernard Lyon 1
1297
+ 玛拉工艺大学,651,海外名校,Universiti Teknologi MARA - UiTM
1298
+ 克里特岛大学,651,海外名校,University of Crete
1299
+ 赫尔大学,651,海外名校,University of Hull
1300
+ 海德拉巴大学,651,海外名校,University of Hyderabad
1301
+ 肯塔基大学,651,海外名校,University of Kentucky
1302
+ 马萨诸塞大学波士顿分校,651,海外名校,University of Massachusetts Boston
1303
+ 蒙斯大学,651,海外名校,University of Mons
1304
+ 新不伦瑞克大学,651,海外名校,University of New Brunswick
1305
+ 俄克拉荷马大学,651,海外名校,University of Oklahoma
1306
+ 俄勒冈大学,651,海外名校,University of Oregon
1307
+ 佩奇大学,651,海外名校,University of Pecs
1308
+ 朴茨茅斯大学,651,海外名校,University of Portsmouth
1309
+ 萨拉曼卡大学,651,海外名校,University of Salamanca
1310
+ 的里雅斯特大学,651,海外名校,University of Trieste
1311
+ 佛蒙特大学,651,海外名校,University of Vermont
1312
+ 美利坚大学,651,海外名校,American University
1313
+ 威廉与玛丽学院,651,海外名校,College of William and Mary
1314
+ 布拉迪斯拉发的夸美纽斯大学,651,海外名校,Comenius University in Bratislava
1315
+ 德雷塞尔大学,651,海外名校,Drexel University
1316
+ 伊迪丝科文大学,651,海外名校,Edith Cowan University
1317
+ 罗兰大学,651,海外名校,Eotvos Lorand University
1318
+ 米纳斯吉拉斯州联邦大学,651,海外名校,Universidade Federal de Minas Gerais
1319
+ 博尔扎诺自由大学,651,海外名校,Free University of Bozen-Bolzano
1320
+ 霍华德大学,651,海外名校,Howard University
1321
+ 伊曼纽尔·康德波罗的海联邦大学,651,海外名校,Immanuel Kant Baltic Federal University
1322
+ 马来西亚国际伊斯兰大学,651,海外名校,International Islamic University Malaysia
1323
+ 迦达浦大学,651,海外名校,Jadavpur University
1324
+ 鹿儿岛大学,651,海外名校,Kagoshima University
1325
+ 卡尔·弗朗岑斯大学格拉茨大学,651,海外名校,Karl-Franzens-Universitaet Graz
1326
+ 哈立德国王大学,651,海外名校,King Khalid University
1327
+ 拉合尔管理科学大学,651,海外名校,Lahore University of Management Sciences
1328
+ 洛巴切夫斯基大学,651,海外名校,Lobachevsky University
1329
+ 澳门科技大学,651,海外名校,Macau University of Science & Technology
1330
+ 密歇根理工大学,651,海外名校,Michigan Technological University
1331
+ 雅典大学,651,海外名校,National and Kapodistrian University of Athens
1332
+ 国立中兴大学,651,海外名校,National Chung Hsing University
1333
+ 国立哈尔科夫工业大学,651,海外名校,"National Technical University ""Kharkiv Polytechnic Institute"""
1334
+ 里约热内卢天主教大学,651,海外名校,Pontifícia Universidade Católica do Rio de Janeiro
1335
+ 谢切诺夫大学,651,海外名校,Sechenov University
1336
+ 双威大学,651,海外名校,Sunway University
1337
+ 雪城大学,651,海外名校,Syracuse University
1338
+ 新墨西哥大学-阿尔伯克基,651,海外名校,The University of New Mexico - Albuquerque
1339
+ 德岛大学,651,海外名校,Tokushima University
1340
+ 阿尔斯特大学,651,海外名校,Ulster University
1341
+ 黎巴嫩大学,701,海外名校,Lebanese University
1342
+ 米德尔塞克斯大学,701,海外名校,Middlesex University
1343
+ 乌克兰国立技术大学 – 伊戈尔·西科斯基基辅理工学院,701,海外名校,National Technical University of Ukraine – Igor Sikorsky Kyiv Polytechnic Institute
1344
+ 新泽西理工大学,701,海外名校,New Jersey Institute of Technology
1345
+ 诺桑比亚大学,701,海外名校,Northumbria University
1346
+ 印度金德而全球大学,701,海外名校,O.P. Jindal Global University
1347
+ 大阪府大学,701,海外名校,Osaka Prefecture University
1348
+ 马尔堡菲尔兹大学,701,海外名校,Philipps University of Marburg
1349
+ 普列汉诺夫俄罗斯经济大学,701,海外名校,Plekhanov Russian University of Economics
1350
+ 巴里理工大学,701,海外名校,Politecnico di Bari
1351
+ 穆罕默德·本·法赫德王子大学,701,海外名校,Prince Mohammad Bin Fahd university
1352
+ 罗维拉-威尔吉利大学,701,海外名校,Rovira i Virgili University
1353
+ 圣彼得堡国立电子科技大学,701,海外名校,Saint Petersburg Electrotechnical University ETU-LETI
1354
+ 史蒂文斯理工学院,701,海外名校,Stevens Institute of Technology
1355
+ 苏美州立大学,701,海外名校,Sumy State University
1356
+ 东京都立大学,701,海外名校,Tokyo Metropolitan University
1357
+ 乌法国立航空技术大学,701,海外名校,Ufa State Aviation Technical University
1358
+ 委内瑞拉中央大学,701,海外名校,Universidad Central de Venezuela
1359
+ 萨瓦纳大学,701,海外名校,Universidad de La Sabana
1360
+ 圣安德烈斯大学,701,海外名校,Universidad de San Andrés - UdeSA
1361
+ 伊比利亚美洲大学,701,海外名校,Universidad Iberoamericana IBERO
1362
+ 圣地亚哥德孔波斯特拉大学,701,海外名校,Universidade de Santiago de Compostela
1363
+ 波茨坦大学,701,海外名校,Universität Potsdam
1364
+ 蔚蓝海岸大学,701,海外名校,Université Côte d'Azur
1365
+ 苏塞大学,701,海外名校,Université de Sousse
1366
+ 布拉德福德大学,701,海外名校,University of Bradford
1367
+ 海法大学,701,海外名校,University of Haifa
1368
+ 休斯敦大学,701,海外名校,University of Houston
1369
+ 哈德斯菲尔德大学,701,海外名校,University of Huddersfield
1370
+ 摩德纳大学和雷焦艾米利亚,701,海外名校,University of Modena and Reggio Emilia
1371
+ 南昆士兰大学,701,海外名校,University of Southern Queensland
1372
+ 巴斯克大学,701,海外名校,University of the Basque Country
1373
+ 威斯敏斯特大学,701,海外名校,University of Westminster
1374
+ 温莎大学,701,海外名校,University of Windsor
1375
+ 弗吉尼亚联邦大学,701,海外名校,Virginia Commonwealth University
1376
+ 阿布扎比大学,701,海外名校,Abu Dhabi University
1377
+ 阿治曼大学,701,海外名校,Ajman University
1378
+ 艾因大学,701,海外名校,Al Ain University
1379
+ 海峡大学,701,海外名校,Bogaziçi Üniversitesi
1380
+ 布尔诺工业大学,701,海外名校,Brno University of Technology
1381
+ 查尔斯达尔文大学,701,海外名校,Charles Darwin University
1382
+ 纽约城市大学,701,海外名校,City University of New York
1383
+ 群马大学,701,海外名校,Gunma University
1384
+ 印度理工学院布巴内斯瓦尔,701,海外名校,Indian Institute of Technology Bhubaneswar
1385
+ 布宜诺斯艾利斯技术大学,701,海外名校,Instituto Tecnológico de Buenos Aires (ITBA)
1386
+ 伊斯坦布尔理工大学,701,海外名校,Istanbul Technical University
1387
+ 约夫大学,701,海外名校,Jouf University
1388
+ 阿道夫·伊巴涅斯大学,751,海外名校,Adolfo Ibáñez University
1389
+ 中东美国大学,751,海外名校,American University of the Middle East
1390
+ 白俄罗斯国立技术大学(BNTU),751,海外名校,Belarusian National Technical University (BNTU)
1391
+ 布雷西亚大学,751,海外名校,Brescia University
1392
+ 全南国立大学,751,海外名校,Chonnam National University
1393
+ 忠南国立大学,751,海外名校,Chungnam National University
1394
+ 克拉克森大学,751,海外名校,Clarkson University
1395
+ 檀国大学,751,海外名校,Dankook University
1396
+ 南里奥格兰德联邦大学,751,海外名校,Federal University of Rio Grande do Sul
1397
+ 佐治亚州立大学,751,海外名校,Georgia State University
1398
+ 杜塞尔多夫大学,751,海外名校,Heinrich Heine University Duesseldorf
1399
+ 印尼泗水理工大学,751,海外名校,Institut Teknologi Sepuluh Nopember
1400
+ 国立理工学院 (IPN),751,海外名校,Instituto Politécnico Nacional (IPN)
1401
+ 墨西哥自治技术学院,751,海外名校,Instituto Tecnológico Autónomo de México (ITAM)
1402
+ 印度新德里国立伊斯兰大学,751,海外名校,Jamia Millia Islamia
1403
+ 卡拉干达国立技术大学,751,海外名校,Karaganda State Technical University
1404
+ República大学(乌德拉尔),751,海外名校,Universidad de la República (Udelar)
1405
+ 罗萨里奥大学,751,海外名校,Universidad del Rosario
1406
+ 基多圣弗朗西斯科大学,751,海外名校,Universidad San Francisco de Quito
1407
+ 托拉卡托迪泰拉大学,751,海外名校,Universidad Torcuato Di Tella
1408
+ 佩鲁贾研究大学,751,海外名校,Università degli Studi di Perugia
1409
+ 舍布鲁克大学,751,海外名校,Université de Sherbrooke
1410
+ 国能大学,751,海外名校,Universiti Tenaga Nasional (UNITEN)
1411
+ 中佛罗里达大学,751,海外名校,University of Central Florida
1412
+ 丹佛大学,751,海外名校,University of Denver
1413
+ 格林威治大学,751,海外名校,University of Greenwich
1414
+ 马里兰大学巴尔的摩,751,海外名校,"University of Maryland, Baltimore County"
1415
+ 萨格勒布大学,751,海外名校,University of Zagreb
1416
+ 曼尼珀尔高等教育学院,751,海外名校,Manipal Academy of Higher Education
1417
+ 梅努斯大学,751,海外名校,Maynooth University
1418
+ 纽芬兰纪念大学,751,海外名校,Memorial University of Newfoundland
1419
+ 瓦尔帕莱索天主教大学,751,海外名校,Pontificia Universidad Catolica de Valparaiso
1420
+ 努拉·本·公主阿卜杜勒拉曼大学,751,海外名校,Princess Nourah bint Abdulrahman University
1421
+ 里加技术大学,751,海外名校,Riga Technical University
1422
+ 立命馆大学,751,海外名校,Ritsumeikan University
1423
+ 设拉子大学,751,海外名校,Shiraz University
1424
+ 南十字大学,751,海外名校,Southern Cross University
1425
+ 纽约州立大学奥尔巴尼分校,751,海外名校,State University of New York Albany
1426
+ 塔林科技大学,751,海外名校,Tallinn University of Technology
1427
+ 天普大学,751,海外名校,Temple University
1428
+ 维尔纽斯Gediminas技术大学,751,海外名校,Vilnius Gediminas Technical University
1429
+ 伍斯特理工学院,751,海外名校,Worcester Polytechnic Institute
1430
+ 越南国立大学胡志明市,801,海外名校,Viet Nam National University Ho Chi Minh City (VNU-HCM)
1431
+ 越南河内国立大学,801,海外名校,Vietnam National University Hanoi
1432
+ 维陶塔斯·马格努斯大学,801,海外名校,Vytautas Magnus University
1433
+ 西弗吉尼亚大学,801,海外名校,West Virginia University
1434
+ 弗罗茨瓦夫科技大学,801,海外名校,Wroclaw University of Science and Technology
1435
+ 西交利物浦大学,801,海外名校,Xi'an Jiaotong-Liverpool University
1436
+ 山口大学,801,海外名校,Yamaguchi University
1437
+ 埃里温州立大学,801,海外名校,Yerevan State University
1438
+ 岭南大学,801,海外名校,Yeungnam University
1439
+ 横滨国立大学,801,海外名校,Yokohama National University
1440
+ 查平戈自治大学,801,海外名校,Universidad Autónoma de Chapingo
1441
+ 伊达尔戈自治大学,801,海外名校,Universidad Autónoma del Estado de Hidalgo
1442
+ 墨西哥自治大学,801,海外名校,Universidad Autonoma del Estado de Mexico
1443
+ 大都会大学(UAM),801,海外名校,Universidad Autónoma Metropolitana (UAM)
1444
+ 卡塔利卡大学安德烈斯·贝洛大学,801,海外名校,Universidad Católica Andres Bello
1445
+ 乌拉圭卡托利卡大学(UCU),801,海外名校,Universidad Católica del Uruguay (UCU)
1446
+ 瓜达拉哈拉大学,801,海外名校,Universidad de Guadalajara
1447
+ 美洲普埃布拉大学(UDLAP),801,海外名校,Universidad de las Américas Puebla (UDLAP)
1448
+ 洛斯安第斯大学-(ULA)梅里达,801,海外名校,Universidad de Los Andes - (ULA) Mérida
1449
+ 洛斯安第斯大学-智利,801,海外名校,Universidad de los Andes - Chile
1450
+ 奥维耶多大学,801,海外名校,Universidad de Oviedo
1451
+ 瓦莱大学,801,海外名校,Universidad del Valle
1452
+ 圣地亚哥大学门户网站,801,海外名校,University Diego Portales
1453
+ EAFIT大学,801,海外名校,Universidad EAFIT
1454
+ 国立圣路易斯大学,801,海外名校,Universidad Nacional de San Luis
1455
+ 圣马科斯国立大学市长,801,海外名校,Universidad Nacional Mayor de San Marcos
1456
+ 西蒙玻利瓦尔大学(USB),801,海外名校,Universidad Simón Bolívar (USB)
1457
+ 巴拿马巴拿马技术大学(UTP),801,海外名校,Universidad Tecnológica de Panamá (UTP)
1458
+ 全国技术大学(UTN),801,海外名校,Universidad Tecnológica Nacional (UTN)
1459
+ 卡图里卡葡萄牙大学-UCP,801,海外名校,Universidade Católica Portuguesa - UCP
1460
+ 拉科鲁尼亚大学,801,海外名校,Universidade da Coruna
1461
+ 巴西利亚大学,801,海外名校,Universidade de Brasília
1462
+ 费拉拉大学,801,海外名校,Universita' degli Studi di Ferrara
1463
+ 乌迪内大学研究,801,海外名校,Università degli Studi di Udine
1464
+ 意大利马尔凯理工大学,801,海外名校,Universita' Politecnica delle Marche
1465
+ 帕贾扎兰大学,801,海外名校,Universitas Padjadjaran
1466
+ 杜伊斯堡 - 埃森大学,801,海外名校,University of Duisburg-Essen
1467
+ 里尔大学,801,海外名校,Université de Lille
1468
+ 洛林大学,801,海外名校,Universite de Lorraine
1469
+ 南特大学,801,海外名校,Université de Nantes
1470
+ 雷恩第一大学,801,海外名校,Université de Rennes 1
1471
+ 图卢兹大学1 Capitole,801,海外名校,Université Toulouse 1 Capitole
1472
+ 吉隆坡大学,801,海外名校,Universiti Kuala Lumpur
1473
+ 马来西亚彭亨大学,801,海外名校,Universiti Malaysia Pahang
1474
+ 马来西亚玻璃市大学,801,海外名校,Universiti Malaysia Perlis
1475
+ 拉曼大学,801,海外名校,Universiti Tunku Abdul Rahman (UTAR)
1476
+ 阿利坎特大学,801,海外名校,University of Alicante
1477
+ 巴格达大学,801,海外名校,University of Baghdad
1478
+ 巴林大学,801,海外名校,University of Bahrain
1479
+ 巴里大学,801,海外名校,University of Bari
1480
+ 布莱顿大学,801,海外名校,University of Brighton
1481
+ 加尔各答大学,801,海外名校,University of Calcutta
1482
+ 中央兰开夏大学,801,海外名校,University of Central Lancashire
1483
+ 达卡大学,801,海外名校,University of Dhaka
1484
+ 东伦敦大学,801,海外名校,University of East London
1485
+ 拉合尔工程技术大学(UET),801,海外名校,University of Engineering & Technology (UET) Lahore
1486
+ 格但斯克大学,801,海外名校,University of Gdansk
1487
+ 哈特福德大学,801,海外名校,University of Hartford
1488
+ 赫特福德大学,801,海外名校,University of Hertfordshire
1489
+ 赫拉德茨克拉洛夫大学,801,海外名校,University of Hradec Kralove
1490
+ 夸祖鲁纳塔尔大学,801,海外名校,University of KwaZulu Natal
1491
+ 罗兹大学,801,海外名校,University of Lodz
1492
+ 马耳他大学,801,海外名校,University of Malta
1493
+ 马里博尔大学,801,海外名校,University of Maribor
1494
+ 墨西拿大学,801,海外名校,University of Messina
1495
+ 密西西比大学,801,海外名校,University of Mississippi
1496
+ 密苏里大学堪萨斯城分校,801,海外名校,"University of Missouri, Kansas City"
1497
+ 穆尔西亚大学,801,海外名校,University of Murcia
1498
+ 新英格兰大学,801,海外名校,University of New England
1499
+ 新罕布什尔大学,801,海外名校,University of New Hampshire
1500
+ 帕尔马大学,801,海外名校,University of Parma
1501
+ 帕特雷大学,801,海外名校,University of Patras
1502
+ 萨勒诺大学,801,海外名校,University of Salerno
1503
+ 索尔福德大学,801,海外名校,University of Salford
1504
+ 首尔大学,801,海外名校,University of Seoul
1505
+ 旁遮普大学,801,海外名校,University of the Punjab
1506
+ 英格兰西部大学,801,海外名校,University of the West of England
1507
+ 塔尔萨大学,801,海外名校,University of Tulsa
1508
+ 秋明大学,801,海外名校,University of Tyumen
1509
+ 维罗纳大学,801,海外名校,University of Verona
1510
+ 威斯康星大学-密尔沃基,801,海外名校,University of Wisconsin - Milwaukee
1511
+ 弗罗茨瓦夫大学,801,海外名校,University of Wroclaw
1512
+ 怀俄明大学,801,海外名校,University of Wyoming
1513
+ 齐里纳大学,801,海外名校,University of Žilina
1514
+ Y.A.院士布克托夫卡拉干达大学,801,海外名校,Academician Y.A. Buketov Karaganda University
1515
+ 亚当·米基维奇大学,801,海外名校,Adam Mickiewicz University
1516
+ AGH科技大学,801,海外名校,AGH University of Science & Technology
1517
+ 艾恩夏姆斯大学,801,海外名校,Ain Shams University
1518
+ 安卡拉大学,801,海外名校,Ankara Üniversitesi
1519
+ 安娜大学,801,海外名校,Anna University
1520
+ 奥本大学,801,海外名校,Auburn University
1521
+ 澳洲天主教大学,801,海外名校,Australian Catholic University
1522
+ 孟加拉国工程技术大学,801,海外名校,Bangladesh University of Engineering and Technology
1523
+ 贝鲁特阿拉伯大学,801,海外名校,Beirut Arab University
1524
+ 比勒费尔德大学,801,海外名校,Bielefeld University
1525
+ 纽约州立大学宾汉姆顿大学,801,海外名校,Binghamton University
1526
+ 布达佩斯科技经济大学,801,海外名校,Budapest University of Technology & Economics
1527
+ 威尼斯福斯卡里宫大学,801,海外名校,Ca’ Foscari University of Venice
1528
+ 卡塔尼亚大学,801,海外名校,Catania University
1529
+ 查尔斯斯特大学,801,海外名校,Charles Sturt University
1530
+ 克莱姆森大学,801,海外名校,Clemson University
1531
+ 布达佩斯科维努斯大学,801,海外名校,Corvinus University of Budapest
1532
+ 克拉科夫工业大学,801,海外名校,Cracow University of Technology
1533
+ CY塞尔吉巴黎大学,801,海外名校,CY Cergy Paris Université
1534
+ 布拉格捷克生命科学大学,801,海外名校,Czech University of Life Sciences in Prague
1535
+ 德拉萨大学,801,海外名校,De La Salle University
1536
+ 德蒙福特大学,801,海外名校,De Montfort University
1537
+ 多特蒙德工业大学,801,海外名校,Technical University of Dortmund
1538
+ 爱丁堡纳皮尔大学,801,海外名校,Edinburgh Napier University
1539
+ 巴拉那联邦大学,801,海外名校,Federal University of Parana
1540
+ 伯南布哥联邦大学,801,海外名校,Federal University of Pernambuco
1541
+ 圣卡塔琳娜联邦大学,801,海外名校,Federal University of Santa Catarina
1542
+ 圣卡洛斯联邦大学,801,海外名校,Federal University of Sao Carlos
1543
+ 费德里科圣玛丽亚技术大学,801,海外名校,Federico Santa María Technical University
1544
+ 俄罗斯联邦政府金融大学,801,海外名校,Financial University under the Government of the Russian Federation
1545
+ 佛罗里达国际大学,801,海外名校,Florida International University
1546
+ 福特汉姆大学,801,海外名校,Fordham University
1547
+ 格但斯克工业大学,801,海外名校,Gdansk University of Technology
1548
+ 乔治梅森大学,801,海外名校,George Mason University
1549
+ 德国约旦大学,801,海外名校,German Jordanian University
1550
+ 海湾科技大学,801,海外名校,Gulf University for Science and Technology
1551
+ 哈斯特帕大学,801,海外名校,Hacettepe University
1552
+ 印第安纳大学-印第安纳波利斯,801,海外名校,Indiana University-Purdue University at Indianapolis
1553
+ 国际基督教大学,801,海外名校,International Christian University
1554
+ 麦地那伊斯兰大学,801,海外名校,Islamic University of Madinah
1555
+ 伊斯坦布尔大学,801,海外名校,Istanbul University
1556
+ 约旦科技大学,801,海外名校,Jordan University of Science & Technology
1557
+ 堪萨斯州立大学,801,海外名校,Kansas State University
1558
+ 卡塞萨特大学,801,海外名校,Kasetsart University
1559
+ 考纳斯工业大学,801,海外名校,Kaunas University of Technology
1560
+ 哈萨克阿布赖汗国际关系与世界语言大学,801,海外名校,Kazakh Ablai Khan University of International Relations and World Languages
1561
+ 哈萨克斯坦-英国技术大学,801,海外名校,Kazakh-British Technical University
1562
+ 喀山国立研究技术大学,801,海外名校,Kazan National Research Technical University
1563
+ 孔敬大学,801,海外名校,Khon Kaen University
1564
+ 费萨尔国王大学,801,海外名校,King Faisal University
1565
+ 泰国国王科技大学,801,海外名校,King Mongkut’s University of Technology Thonburi
1566
+ 京都工业大学,801,海外名校,Kyoto Institute of Technology
1567
+ 九州工业大学,801,海外名校,Kyushu Institute of Technology
1568
+ 利物浦约翰摩尔斯大学,801,海外名校,Liverpool John Moores University
1569
+ 罗兹工业大学,801,海外名校,Lodz University of Technology
1570
+ 伦敦都会大学,801,海外名校,London Metropolitan University
1571
+ 伦敦南岸大学,801,海外名校,London South Bank University
1572
+ 路易斯安那州立大学,801,海外名校,Louisiana State University
1573
+ 芝加哥洛约拉大学,801,海外名校,Loyola University Chicago
1574
+ 利沃夫理工大学,801,海外名校,Lviv Polytechnic National University
1575
+ 曼彻斯特城市大学,801,海外名校,Manchester Metropolitan University
1576
+ 布尔诺的孟德尔大学,801,海外名校,Mendel University Brno
1577
+ 门捷列夫化工大学,801,海外名校,Mendeleev University of Chemical Technology
1578
+ 国立中正大学,801,海外名校,National Chung Cheng University
1579
+ 科尔多瓦国立大学,801,海外名校,National University of Cordoba
1580
+ 罗萨里奥国立大学,801,海外名校,National University of Rosario
1581
+ 尼古拉·哥白尼大学,801,海外名校,Nicolaus Copernicus University
1582
+ NJSC KIMEP大学,801,海外名校,NJSC KIMEP University
1583
+ 诺丁汉特伦特大学,801,海外名校,Nottingham Trent University
1584
+ 新西伯利亚国立技术大学,801,海外名校,Novosibirsk State Technical University
1585
+ 俄克拉荷马州立大学,801,海外名校,Oklahoma State University
1586
+ 萨尔茨堡巴黎罗德隆大学,801,海外名校,Paris Lodron University of Salzburg
1587
+ 那不勒斯帕斯诺普大学,801,海外名校,Parthenope University Naples
1588
+ 彼尔姆国立研究大学,801,海外名校,Perm State National Research University
1589
+ 本地治里大学,801,海外名校,Pondicherry University
1590
+ 圣保罗天主教大学,801,海外名校,Pontifícia Universidade Católica de São Paulo
1591
+ 波兹南理工大学,801,海外名校,Poznan University of Technology
1592
+ 宋卡王子大学,801,海外名校,Prince of Songkla University
1593
+ 苏美亚公主科技大学,801,海外名校,Princess Sumaya University for Technology
1594
+ 卡西姆大学,801,海外名校,Qassim University
1595
+ 英国爱丁堡玛格丽特女王大学,801,海外名校,"Queen Margaret University , Edinburgh"
1596
+ 罗得大学,801,海外名校,Rhodes University
1597
+ 里加斯特拉丁斯大学,801,海外名校,Riga Stradins University
1598
+ 罗伯特·戈登大学,801,海外名校,Robert Gordon University
1599
+ 罗马特雷大学,801,海外名校,Roma Tre University
1600
+ 俄罗斯总统国民经济与公共行政学院,801,海外名校,Russian Presidential Academy of National Economy and Public Administration
1601
+ 罗格斯大学纽瓦克分校,801,海外名校,Rutgers University - Newark
1602
+ 瑞尔森大学,801,海外名校,Ryerson University
1603
+ 西雅图大学,801,海外名校,Seattle University
1604
+ 信州大学,801,海外名校,Shinshu University
1605
+ Siksha'O'Anusandhan,801,海外名校,Siksha ‘O’ Anusandhan
1606
+ 西里西亚工业大学,801,海外名校,Silesian University of Technology
1607
+ 斯洛伐克工业大学布拉迪斯拉发,801,海外名校,Slovak University of Technology Bratislava
1608
+ 上智大学,801,海外名校,Sophia University
1609
+ 南乌拉尔州立大学,801,海外名校,South Ural State University
1610
+ 南卫理公会大学,801,海外名校,Southern Methodist University
1611
+ 塞切尼伊斯特万大学,801,海外名校,Széchenyi István University
1612
+ 圣伊斯特万大学,801,海外名校,Szent Istvan University
1613
+ 科希策技术大学,801,海外名校,Technical University of Kosice
1614
+ 利贝雷茨技术大学,801,海外名校,Technical University of Liberec
1615
+ 都柏林科技大学,801,海外名校,Technological University Dublin
1616
+ 哥斯达黎加技术中心(TECN),801,海外名校,Tecnológico de Costa Rica -TEC
1617
+ 德克萨斯理工大学,801,海外名校,Texas Tech University
1618
+ 阿拉巴马大学,801,海外名校,The University of Alabama
1619
+ 阿尔伯塔大学,126,海外名校,University of Alberta
1620
+ 加州艺术学院,1,海外名校,California Institute of the Arts
1621
+ 萨凡纳艺术学院,2,海外名校,Savannah College of Art and Design
1622
+ 瑞格林艺术设计学院,3,海外名校,Ringling College of Art and Design
1623
+ 纽约视觉艺术学院,4,海外名校,School of Visual Arts
1624
+ 谢尔丹学院,6,海外名校,Sheridan College
1625
+ 伯恩茅斯大学,7,海外名校,Bournemouth University
1626
+ 提赛德大学,8,海外名校,Teesside University
1627
+ 高布兰学院,9,海外名校,Gobelins
deepdoc/parser/resume/entities/res/schools.csv ADDED
The diff for this file is too large to render. See raw diff
 
deepdoc/parser/resume/entities/schools.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: UTF-8 -*-
2
+ import os, json,re,copy
3
+ import pandas as pd
4
# Directory containing this module; the bundled resources are read from "res/" below it.
current_file_path = os.path.dirname(os.path.abspath(__file__))

# School table: tab-separated with a header row; missing cells become "" so the
# string operations below never meet NaN.
TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
# English names are lower-cased and trimmed once here so lookups can compare directly.
TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())

# Read the JSON list inside a context manager so the handle is closed, and decode
# it as UTF-8 explicitly (the same encoding loadRank uses for its resource file)
# instead of depending on the platform default.
with open(os.path.join(current_file_path, "res/good_sch.json"), "r", encoding="utf-8") as _good_sch_file:
    GOOD_SCH = json.load(_good_sch_file)
# Names are stored with commas, dots, spaces, ampersands and parentheses removed.
# NOTE(review): the doubled ASCII parentheses in this class look like full-width
# forms that were flattened somewhere — confirm against the upstream file.
GOOD_SCH = {re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH}
9
+
10
+
11
def loadRank(fnm):
    """Fill the ``rank`` column of ``TBL`` from the ranking file ``fnm``.

    Every row of ``TBL`` first receives the sentinel rank 1000000. The file is
    read as UTF-8 text, one record per line, split on commas: the first field
    (trimmed) is the school name and the second is its integer rank. Rows of
    ``TBL`` whose ``name_cn`` or ``name_en`` equals that name get the rank.

    Lines with fewer than two fields or a non-integer second field are
    skipped. Errors from the table update itself are no longer swallowed.
    """
    global TBL
    TBL["rank"] = 1000000
    with open(fnm, "r", encoding='UTF-8') as f:
        for line in f:
            fields = line.strip("\n").split(",")
            try:
                nm, rk = fields[0].strip(), int(fields[1])
            except (IndexError, ValueError):
                # Malformed row (too short, or a non-numeric rank such as a header).
                continue
            TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
25
+
26
+
27
+ loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
28
+
29
+
30
def split(txt):
    """Tokenise ``txt`` on spaces/tabs, keeping Latin-letter phrases together.

    Runs of spaces and tabs are collapsed to one space before splitting. A
    piece is glued (with a single space) onto the previous token when that
    token ends with an ASCII letter and the piece starts with one; any other
    piece starts a new token. Returns the list of tokens.
    """
    pieces = []
    for word in re.sub(r"[ \t]+", " ", txt).split(" "):
        extends_previous = (
            bool(pieces)
            and re.match(r".*[a-zA-Z]$", pieces[-1])
            and re.match(r"[a-zA-Z]", word)
        )
        if extends_previous:
            pieces[-1] = pieces[-1] + " " + word
            continue
        pieces.append(word)
    return pieces
38
+
39
+
40
def select(nm):
    """Look up a school in ``TBL`` and return its first matching row as a dict.

    ``nm`` may be a string or a list (only its first element is used). The
    name is reduced to its first token from ``split``, lower-cased, stripped
    of parenthesised parts, a leading "the ", listed punctuation and a leading
    英国/美国/瑞士, and "大学…学院" is shortened to "大学". A row matches when
    the result equals ``name_cn``, ``name_en`` or one of the "+"-separated
    entries of ``alias``.

    Returns the first matching row as plain JSON-compatible values, including
    a ``hit_alias`` flag, or ``None`` when ``nm`` is empty, normalises to an
    empty string, or matches nothing.
    """
    global TBL
    if not nm:
        return None
    if isinstance(nm, list):
        nm = str(nm[0])
    if isinstance(nm, str):
        # A leading space would otherwise make the first token "".
        nm = nm.strip()
    nm = split(nm)[0]
    nm = str(nm).lower().strip()
    # NOTE(review): the doubled ASCII parentheses/semicolons in these classes look
    # like full-width forms that were flattened somewhere — confirm against upstream.
    nm = re.sub(r"[((][^()()]+[))]", "", nm)
    nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
    nm = re.sub(r"大学.*学院", "大学", nm)
    if not nm:
        # An empty key would equal every blank name_en/alias cell and return
        # an arbitrary school.
        return None

    # Build the masks on TBL directly instead of deep-copying the table per call.
    hit_alias = TBL["alias"].map(lambda x: nm in x.split("+")).astype(bool)
    matched = (TBL.name_cn == nm) | (TBL.name_en == nm) | hit_alias
    if not matched.any():
        return None

    # Only the first match is returned; hit_alias stays in the record so the
    # returned keys are unchanged for callers.
    first = TBL[matched].iloc[:1].assign(hit_alias=bool(hit_alias[matched].iloc[0]))
    return json.loads(first.to_json(orient="records"))[0]
55
+
56
+
57
def is_good(nm):
    """Return True when the normalised school name ``nm`` is in ``GOOD_SCH``.

    The name is lower-cased, parenthesised parts are removed, and quotes,
    commas, dots, spaces, ampersands, parentheses and semicolons are dropped
    before the membership test.
    """
    # NOTE(review): the input is lower-cased here, while the visible loader does
    # not lower-case GOOD_SCH entries — confirm the stored names are lower-case.
    without_brackets = re.sub(r"[((][^()()]+[))]", "", nm.lower())
    lookup_key = re.sub(r"[''`‘’“”,. &()();;]+", "", without_brackets)
    return lookup_key in GOOD_SCH
62
+
deepdoc/parser/resume/step_one.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import json
3
+ from deepdoc.parser.resume.entities import degrees, regions, industries
4
+
5
# Output schema of refactor(): one "name TYPE" entry per key of the returned dict.
# refactor() reads only the part before the first space and pairs these names
# POSITIONALLY with its alphabetically sorted output columns, so the order here
# has to stay in step with that sort (e.g. "work_obj" sits where the "work"
# column sorts, ahead of "work_experience").
FIELDS = [
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    "education_obj string",
    "email STRING",
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    "industry_name STRING",
    "industry_names STRING",
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT"
]
58
+
59
def refactor(df):
    """Flatten the JSON in ``df["resume_content"]`` into a flat record.

    Each ``resume_content`` cell is parsed as JSON (unparseable cells become
    ``{}``). Whole sections (education, work, ...) are re-serialised as JSON
    strings, and scalar fields are pulled from the "contact" and "basic"
    sub-objects. Degree, address and industry ids are mapped to names, M/F and
    Y/N codes are mapped to Chinese labels, and an empty phone falls back to
    "tel". Tabs become spaces and line breaks become a literal "\\n" in every
    output cell.

    ``df`` is modified in place while the columns are built.

    Returns:
        dict for the FIRST row only. Keys are the names in ``FIELDS``; values
        are taken positionally from the alphabetically sorted output columns.

    Raises:
        IndexError: if ``df`` has no rows.
    """
    def deal_obj(obj, k, kk):
        # Read obj[k][kk], giving "" whenever either level is not a dict.
        if not isinstance(obj, dict):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, dict):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        # Deliberately best-effort: any parse failure yields an empty object.
        try:
            return json.loads(line)
        except Exception:
            return {}

    def dump_section(obj, key):
        # Serialise obj[key] when it is a dict or falsy; otherwise fall back to
        # the string form of the WHOLE object with "None" removed.
        if isinstance(obj, dict) and (isinstance(obj.get(key), dict) or not obj.get(key)):
            return json.dumps(obj.get(key, {}), ensure_ascii=False)
        return str(obj).replace("None", "")

    df["obj"] = df["resume_content"].map(loadjson)
    df.fillna("", inplace=True)

    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        # Add one column per name: from sub-object ``cc`` when given, else the
        # whole top-level section serialised as JSON.
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
            else:
                df[c] = df["obj"].map(lambda x: dump_section(x, c))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(
        lambda x: " ".join([" ".join(industries.get_names(i)) for i in str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        # Lists are space-joined; commas in the result become spaces.
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(
        lambda x: arr2str(x))
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else (
            "女" if x == 'F' else ""))
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else (
                '否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)
    # Positional access: the fallback must not depend on the frame having a
    # default 0..n-1 index (label lookups with range(len(df)) would).
    phone_pos = df.columns.get_loc("phone")
    tel_pos = df.columns.get_loc("tel")
    for row in range(len(df)):
        if not df.iat[row, phone_pos].strip() and df.iat[row, tel_pos].strip():
            df.iat[row, phone_pos] = df.iat[row, tel_pos].strip()

    # Helper columns that are not part of the output; duplicates collapse too.
    dropped = {"industry_ids", "management_experience", "marital", "tel"}
    clms = sorted(set(clms) - dropped)

    df = df.reindex(clms, axis=1)
    for c in clms:
        df[c] = df[c].map(
            lambda s: str(s).replace("\t", " ").replace("\n", "\\n").replace("\r", "\\n"))
    # FIELDS is paired by position with the sorted columns.
    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
deepdoc/parser/resume/step_two.py ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re, copy, time, datetime, demjson3, \
3
+ traceback, signal
4
+ import numpy as np
5
+ from deepdoc.parser.resume.entities import degrees, schools, corporations
6
+ from rag.nlp import rag_tokenizer, surname
7
+ from xpinyin import Pinyin
8
+ from contextlib import contextmanager
9
+
10
+
11
class TimeoutException(Exception):
    """Raised by the SIGALRM handler that time_limit installs when the alarm fires."""
12
+
13
+
14
@contextmanager
def time_limit(seconds):
    """Context manager that raises ``TimeoutException`` after ``seconds``.

    A SIGALRM handler is installed and ``signal.alarm(seconds)`` is armed for
    the duration of the ``with`` body. On exit the alarm is cancelled and the
    handler that was active beforehand is put back, so the timeout handler
    does not stay installed after the block.

    Args:
        seconds: delay passed to ``signal.alarm``.

    Raises:
        TimeoutException: from the signal handler, if the alarm fires.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        # signal.signal() returns None when the old handler was not installed
        # from Python; there is nothing to restore in that case.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
25
+
26
+
27
# Module-level placeholder, initialised to None.
ENV = None
# Single xpinyin converter instance, created once at import time.
PY = Pinyin()
29
+
30
+
31
def rmHtmlTag(line):
    """Replace simple HTML-like tags in ``line`` with a single space.

    A tag is ``<`` followed by one or more letters, digits or the characters
    ``. " = ' ; , : + _ /``, space or ``-``, then ``>``. Matching is
    case-insensitive and at most 100000 tags are replaced.
    """
    # count/flags are passed by keyword: positional use is deprecated since 3.13.
    return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, count=100000, flags=re.IGNORECASE)
33
+
34
+
35
def highest_degree(dg):
    """Return the highest-ranked degree label in ``dg``.

    ``dg`` may be a single label or an iterable of labels. Labels are ranked
    初中 < 高中 < 中专 < 大专 < 专升本 < 本科 < 硕士 < 博士 < 博士后; unknown
    labels rank below all of these. On a tie the earliest label wins. Returns
    "" for empty input.
    """
    if not dg:
        return ""
    # isinstance also covers str subclasses (e.g. numpy.str_), which would
    # otherwise be iterated character by character.
    if isinstance(dg, str):
        dg = [dg]
    m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
    # max() returns the first maximal element, matching the stable descending sort.
    return max(dg, key=lambda d: m.get(d, -1))
40
+
41
+
42
+ def forEdu(cv):
43
+ if not cv.get("education_obj"):
44
+ cv["integerity_flt"] *= 0.8
45
+ return cv
46
+
47
+ first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], []
48
+ edu_nst = []
49
+ edu_end_dt = ""
50
+ cv["school_rank_int"] = 1000000
51
+ for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
52
+ e = {}
53
+ if n.get("end_time"):
54
+ if n["end_time"] > edu_end_dt: edu_end_dt = n["end_time"]
55
+ try:
56
+ dt = n["end_time"]
57
+ if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
58
+ y, m, d = getYMD(dt)
59
+ ed_dt.append(str(y))
60
+ e["end_dt_kwd"] = str(y)
61
+ except Exception as e:
62
+ pass
63
+ if n.get("start_time"):
64
+ try:
65
+ dt = n["start_time"]
66
+ if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
67
+ y, m, d = getYMD(dt)
68
+ st_dt.append(str(y))
69
+ e["start_dt_kwd"] = str(y)
70
+ except Exception as e:
71
+ pass
72
+
73
+ r = schools.select(n.get("school_name", ""))
74
+ if r:
75
+ if str(r.get("type", "")) == "1": fea.append("211")
76
+ if str(r.get("type", "")) == "2": fea.append("211")
77
+ if str(r.get("is_abroad", "")) == "1": fea.append("留学")
78
+ if str(r.get("is_double_first", "")) == "1": fea.append("双一流")
79
+ if str(r.get("is_985", "")) == "1": fea.append("985")
80
+ if str(r.get("is_world_known", "")) == "1": fea.append("海外知名")
81
+ if r.get("rank") and cv["school_rank_int"] > r["rank"]: cv["school_rank_int"] = r["rank"]
82
+
83
+ if n.get("school_name") and isinstance(n["school_name"], str):
84
+ sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
85
+ e["sch_nm_kwd"] = sch[-1]
86
+ fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
87
+
88
+ if n.get("discipline_name") and isinstance(n["discipline_name"], str):
89
+ maj.append(n["discipline_name"])
90
+ e["major_kwd"] = n["discipline_name"]
91
+
92
+ if not n.get("degree") and "985" in fea and not first_fea: n["degree"] = "1"
93
+
94
+ if n.get("degree"):
95
+ d = degrees.get_name(n["degree"])
96
+ if d: e["degree_kwd"] = d
97
+ if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)",
98
+ n.get(
99
+ "school_name",
100
+ ""))): d = "专升本"
101
+ if d: deg.append(d)
102
+
103
+ # for first degree
104
+ if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
105
+ fdeg = [d]
106
+ if n.get("school_name"): fsch = [n["school_name"]]
107
+ if n.get("discipline_name"): fmaj = [n["discipline_name"]]
108
+ first_fea = copy.deepcopy(fea)
109
+
110
+ edu_nst.append(e)
111
+
112
+ cv["sch_rank_kwd"] = []
113
+ if cv["school_rank_int"] <= 20 \
114
+ or ("海外名校" in fea and cv["school_rank_int"] <= 200):
115
+ cv["sch_rank_kwd"].append("顶尖学校")
116
+ elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \
117
+ or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \
118
+ cv["school_rank_int"] > 200):
119
+ cv["sch_rank_kwd"].append("精英学校")
120
+ elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \
121
+ or ("海外名校" in fea and cv["school_rank_int"] > 500):
122
+ cv["sch_rank_kwd"].append("优质学校")
123
+ else:
124
+ cv["sch_rank_kwd"].append("一般学校")
125
+
126
+ if edu_nst: cv["edu_nst"] = edu_nst
127
+ if fea: cv["edu_fea_kwd"] = list(set(fea))
128
+ if first_fea: cv["edu_first_fea_kwd"] = list(set(first_fea))
129
+ if maj: cv["major_kwd"] = maj
130
+ if fsch: cv["first_school_name_kwd"] = fsch
131
+ if fdeg: cv["first_degree_kwd"] = fdeg
132
+ if fmaj: cv["first_major_kwd"] = fmaj
133
+ if st_dt: cv["edu_start_kwd"] = st_dt
134
+ if ed_dt: cv["edu_end_kwd"] = ed_dt
135
+ if ed_dt: cv["edu_end_int"] = max([int(t) for t in ed_dt])
136
+ if deg:
137
+ if "本科" in deg and "专科" in deg:
138
+ deg.append("专升本")
139
+ deg = [d for d in deg if d != '本科']
140
+ cv["degree_kwd"] = deg
141
+ cv["highest_degree_kwd"] = highest_degree(deg)
142
+ if edu_end_dt:
143
+ try:
144
+ if re.match(r"[0-9]{9,}", edu_end_dt): edu_end_dt = turnTm2Dt(edu_end_dt)
145
+ if edu_end_dt.strip("\n") == "至今": edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
146
+ y, m, d = getYMD(edu_end_dt)
147
+ cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
148
+ except Exception as e:
149
+ print("EXCEPTION: ", e, edu_end_dt, cv.get("work_exp_flt"))
150
+ if sch:
151
+ cv["school_name_kwd"] = sch
152
+ if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \
153
+ or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \
154
+ or not cv.get("degree_kwd"):
155
+ for c in sch:
156
+ if schools.is_good(c):
157
+ if "tag_kwd" not in cv: cv["tag_kwd"] = []
158
+ cv["tag_kwd"].append("好学校")
159
+ cv["tag_kwd"].append("好学历")
160
+ break
161
+ if (len(cv.get("degree_kwd", [])) >= 1 and \
162
+ "本科" in cv["degree_kwd"] and \
163
+ any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
164
+ or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
165
+ or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
166
+ if "tag_kwd" not in cv: cv["tag_kwd"] = []
167
+ if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
168
+
169
+ if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
170
+ if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
171
+ if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
172
+ if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
173
+
174
+ return cv
175
+
176
+
177
def forProj(cv):
    """Tokenize project names and descriptions found in ``cv["project_obj"]``.

    Projects are ordered by their ``updated_at`` value (as a string, newest
    first). The first project's name and first description fragment feed the
    ``project_*`` fields; all description fragments together feed
    ``pro_desc_ltks``.

    :param cv: resume dict; mutated in place.
    :return: the same ``cv`` dict.
    """
    if not cv.get("project_obj"):
        return cv

    # Entries that are not dicts carry no usable fields. The sort key used to
    # tolerate them, but the loop then failed on `.get`, so drop them up front.
    projects = sorted(
        (p for p in cv["project_obj"] if isinstance(p, dict)),
        key=lambda p: str(p.get("updated_at", "")),
        reverse=True)

    pro_nms, desc = [], []
    for n in projects:
        if n.get("name"):
            pro_nms.append(n["name"])
        # Keep the original fragment order: describe, responsibilities, achivement.
        for fld in ("describe", "responsibilities", "achivement"):
            if n.get(fld):
                desc.append(str(n[fld]))

    if pro_nms:
        cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
    if desc:
        cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
        cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))

    return cv
197
+
198
+
199
def json_loads(line):
    """Decode a JSON-like string with demjson3.

    Bare ``True``/``False`` values that follow a colon are wrapped in single
    quotes before decoding, so they come back as the strings ``'True'`` /
    ``'False'``.
    """
    quoted = re.sub(r": *(True|False)", r": '\1'", line)
    return demjson3.decode(quoted)
201
+
202
+
203
def forWork(cv):
    """Derive work-experience features from ``cv["work_obj"]``.

    Work entries are visited newest first (sorted by ``start_time`` as a
    string, descending). Index 0 is therefore the most recent job and feeds
    the "current" fields; the remaining entries feed the "history" fields.

    :param cv: resume dict; mutated in place. Must already contain
        ``integerity_flt`` when ``work_obj`` is missing or empty.
    :return: the same ``cv`` dict.
    """
    if not cv.get("work_obj"):
        cv["integerity_flt"] *= 0.7
        return cv

    flds = ["position_name", "corporation_name", "corporation_id", "responsibilities",
            "industry_name", "subordinates_count"]
    duas = []
    scales = []
    fea = {c: [] for c in flds}
    latest_job_tm = ""
    goodcorp = False
    goodcorp_ = False
    work_st_tm = ""
    corp_tags = []
    ordered = sorted(
        cv.get("work_obj", []),
        key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
        reverse=True)
    for i, n in enumerate(ordered):
        if isinstance(n, str):
            try:
                n = json_loads(n)
            except Exception:
                continue
        # Anything that is still not a dict has no fields to read.
        if not isinstance(n, dict):
            continue

        if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
            work_st_tm = n["start_time"]
        for c in flds:
            if not n.get(c) or str(n[c]) == '0':
                fea[c].append("")
                continue
            if c == "corporation_name":
                n[c] = corporations.corpNorm(n[c], False)
                if corporations.is_good(n[c]):
                    if i == 0:
                        goodcorp = True
                    else:
                        goodcorp_ = True
                ct = corporations.corp_tag(n[c])
                if i == 0:
                    corp_tags.extend(ct)
                elif ct and ct[0] != "软外":
                    corp_tags.extend([f"{t}(曾)" for t in ct])

            fea[c].append(rmHtmlTag(str(n[c]).lower()))

        y, m, d = getYMD(n.get("start_time"))
        if not y or not m:
            continue
        st = "%s-%02d-%02d" % (y, int(m), int(d))
        latest_job_tm = st

        y, m, d = getYMD(n.get("end_time"))
        if (not y or not m) and i > 0:
            continue
        # NOTE(review): 2022 is a hard-coded cut-off; later end years are
        # replaced by `updated_at`. Presumably meant "end date in the future" - confirm.
        if not y or not m or int(y) > 2022:
            y, m, d = getYMD(str(n.get("updated_at", "")))
        if not y or not m:
            continue
        ed = "%s-%02d-%02d" % (y, int(m), int(d))

        try:
            duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days)
        except Exception:
            print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time"))

        if n.get("scale"):
            r = re.search(r"^([0-9]+)", str(n["scale"]))
            if r:
                scales.append(int(r.group(1)))

    if goodcorp:
        cv.setdefault("tag_kwd", []).append("好公司")
    if goodcorp_:
        cv.setdefault("tag_kwd", []).append("好公司(曾)")

    if corp_tags:
        cv.setdefault("tag_kwd", []).extend(corp_tags)
        cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]

    if latest_job_tm:
        cv["latest_job_dt"] = latest_job_tm
    if fea["corporation_id"]:
        cv["corporation_id"] = fea["corporation_id"]

    if fea["position_name"]:
        cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
        cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
        cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))

    if fea["industry_name"]:
        cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
        cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
        cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))

    if fea["corporation_name"]:
        cv["corporation_name_kwd"] = fea["corporation_name"][0]
        cv["corp_nm_kwd"] = fea["corporation_name"]
        cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
        cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
        cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))

    if fea["responsibilities"]:
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
        cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))

    # Keep only purely numeric head-counts. The previous pattern `[^0-9]+$`
    # selected the NON-numeric strings and then failed in int().
    if fea["subordinates_count"]:
        fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"]
                                     if re.match(r"[0-9]+$", str(i))]
    if fea["subordinates_count"]:
        cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])

    if type(cv.get("corporation_id")) is int:
        cv["corporation_id"] = [str(cv["corporation_id"])]
    if not cv.get("corporation_id"):
        cv["corporation_id"] = []
    for i in cv.get("corporation_id", []):
        cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)

    if work_st_tm:
        try:
            if re.match(r"[0-9]{9,}", work_st_tm):
                work_st_tm = turnTm2Dt(work_st_tm)
            y, m, d = getYMD(work_st_tm)
            cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
        except Exception as e:
            print("EXCEPTION: ", e, work_st_tm, cv.get("work_exp_flt"))

    cv["job_num_int"] = 0
    if duas:
        cv["dua_flt"] = np.mean(duas)
        cv["cur_dua_int"] = duas[0]
        cv["job_num_int"] = len(duas)
    if scales:
        cv["scale_flt"] = np.max(scales)
    return cv
327
+
328
+
329
def turnTm2Dt(b):
    """Normalize a timestamp-or-date value to a string.

    A value that starts with ten or more digits is read as a Unix timestamp
    (only the first ten digits, i.e. seconds) and rendered in local time as
    ``YYYY-mm-dd HH:MM:SS``. Any other value is returned as its stripped
    string form. Falsy input returns ``None``.
    """
    if not b:
        return
    b = str(b).strip()
    if re.match(r"[0-9]{10,}", b):
        b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
    return b


def getYMD(b):
    """Extract ``(year, month, day)`` from a date-like value.

    :param b: date string or Unix timestamp (anything ``turnTm2Dt`` accepts).
    :return: ``(y, m, d)``. ``y`` is an ``int`` when the text starts with four
        digits, otherwise ``""``. ``m`` and ``d`` are digit strings; a missing
        or out-of-range month becomes ``"1"``, a missing day stays ``"01"``
        and an out-of-range day becomes ``"1"``. Falsy input returns
        ``("", "", "01")``.
    """
    y, m, d = "", "", "01"
    if not b:
        return (y, m, d)
    b = turnTm2Dt(b)
    if re.match(r"[0-9]{4}", b):
        y = int(b[:4])
    # One pass for month and (optional) day. Separators must be non-digits so
    # that "20200513" reads as 05/13, and the day group is optional so that
    # "2020-05" no longer borrows the month's last digit as a day.
    found = re.search(r"[0-9]{4}[^0-9]?([0-9]{1,2})(?:[^0-9]?([0-9]{1,2}))?", b)
    if found:
        m = found.group(1)
        if found.group(2):
            d = found.group(2)
    if not d or int(d) == 0 or int(d) > 31:
        d = "1"
    if not m or int(m) > 12 or int(m) < 1:
        m = "1"
    return (y, m, d)
348
+
349
+
350
def birth(cv):
    """Fill birth-date fields on ``cv`` from ``cv["birth"]``.

    Without a birth value the completeness score ``integerity_flt`` is scaled
    by 0.9 and nothing else changes. Otherwise ``birth_dt``, ``birthday_kwd``
    (month and day, zero padded) and ``age_int`` (current year minus birth
    year) are set when both a year and a month could be parsed.

    :return: the same ``cv`` dict, mutated in place.
    """
    raw = cv.get("birth")
    if not raw:
        cv["integerity_flt"] *= 0.9
        return cv

    year, month, day = getYMD(cv["birth"])
    if year and month:
        month_no, day_no = int(month), int(day)
        cv["birth_dt"] = "%s-%02d-%02d" % (year, month_no, day_no)
        cv["birthday_kwd"] = "%02d%02d" % (month_no, day_no)
        cv["age_int"] = datetime.datetime.now().year - int(year)
    return cv
362
+
363
+
364
def parse(cv):
    """Turn a raw resume record into the flat, suffix-typed index document.

    The record is mutated in place: placeholder/empty values are dropped, a
    completeness score (``integerity_flt``) is computed, selected fields are
    normalized and tokenized, the education/project/work/birth helpers are
    applied, and finally every key that does not end in one of the known
    type suffixes (``_tks``, ``_kwd``, ``_int``, ``_flt``, ``_dt``, ...) is
    removed.

    :param cv: dict for one resume; must contain ``tob_resume_id``.
    :return: the cleaned dict with numpy integers converted to ``int``.
    """
    for k in cv.keys():
        if cv[k] == '\\N':
            cv[k] = ''
    tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
               "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",
               "position_name", "school_name", "self_remark", "title_name"]
    small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"]
    kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email",
               "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name",
               "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"]
    num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from",
               "expect_salary_to", "salary_month"]

    # (field, tag when the value is '是', tag when the value is '否')
    is_fld = [
        ("is_fertility", "已育", "未育"),
        ("is_house", "有房", "没房"),
        ("is_management_experience", "有管理经验", "无管理经验"),
        ("is_marital", "已婚", "未婚"),
        ("is_oversea", "有海外经验", "无海外经验")
    ]

    # Drop keys whose value is None or an empty list/string.
    rmkeys = []
    for k in cv.keys():
        if cv[k] is None:
            rmkeys.append(k)
        if isinstance(cv[k], (list, str)) and len(cv[k]) == 0:
            rmkeys.append(k)
    for k in rmkeys:
        del cv[k]

    integerity = 0.
    flds_num = 0.

    def hasValues(flds):
        # Count how many of `flds` hold a meaningful value.
        nonlocal integerity, flds_num
        flds_num += len(flds)
        for f in flds:
            v = str(cv.get(f, ""))
            if len(v) > 0 and v != '0' and v != '[]':
                integerity += 1

    hasValues(tks_fld)
    hasValues(small_tks_fld)
    hasValues(kwd_fld)
    hasValues(num_fld)
    cv["integerity_flt"] = integerity / flds_num

    if cv.get("corporation_type"):
        for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""),
                     (r"[//.· <\((]+.*", ""),
                     (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"),
                     (r".*(机关|事业).*", "机关"),
                     (r".*(非盈利|Non-profit).*", "非盈利"),
                     (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"),
                     (r".*国有.*", "国企"),
                     (r"[ ()\(\)人/·0-9-]+", ""),
                     (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
            # count/flags are passed by keyword: positional use is deprecated.
            cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], count=1000, flags=re.IGNORECASE)
        if len(cv["corporation_type"]) < 2:
            del cv["corporation_type"]

    if cv.get("political_status"):
        for p, r in [
            (r".*党员.*", "党员"),
            (r".*(无党派|公民).*", "群众"),
            (r".*团员.*", "团员")]:
            cv["political_status"] = re.sub(p, r, cv["political_status"])
        if not re.search(r"[党团群]", cv["political_status"]):
            del cv["political_status"]

    if cv.get("phone"):
        cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))

    keys = list(cv.keys())
    for k in keys:
        # deal with json objects
        if k.find("_obj") > 0:
            try:
                cv[k] = json_loads(cv[k])
                cv[k] = [a for _, a in cv[k].items()]
                nms = []
                for n in cv[k]:
                    if not isinstance(n, dict) or "name" not in n or not n.get("name"):
                        continue
                    n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
                    if not n["name"]:
                        continue
                    nms.append(n["name"])
                if nms:
                    t = k[:-4]
                    cv[f"{t}_kwd"] = nms
                    cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
            except Exception:
                print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
                cv[k] = []

        # tokenize fields
        if k in tks_fld:
            cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
            if k in small_tks_fld:
                cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])

        # keyword fields
        if k in kwd_fld:
            separated = re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
            cv[f"{k}_kwd"] = [part.lower() for part in re.split(r"[\t,,;;. ]", separated) if part]

        if k in num_fld and cv.get(k):
            cv[f"{k}_int"] = cv[k]

    cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
    # for name field
    if cv.get("name"):
        nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip())
        # NOTE(review): the original class listed a second blank after \t,
        # presumably U+3000 (ideographic space); spelled as an escape so it is
        # visible - confirm against the upstream file.
        nm = re.sub(r"[ \t\u3000]+", " ", nm)
        if re.match(r"[a-zA-Z ]+$", nm):
            if len(nm.split(" ")) > 1:
                cv["name"] = nm
            else:
                nm = ""
        elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])):
            nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5])
        else:
            nm = ""
        cv["name"] = nm.strip()
        name = cv["name"]

        # name pinyin and its prefixes
        cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' '))
        cv["name_py_pref0_tks"] = ""
        cv["name_py_pref_tks"] = ""
        for py in PY.get_pinyins(nm[:20], ''):
            for i in range(2, len(py) + 1):
                cv["name_py_pref_tks"] += " " + py[:i]
        for py in PY.get_pinyins(nm[:20], ' '):
            py = py.split(" ")
            for i in range(1, len(py) + 1):
                cv["name_py_pref0_tks"] += " " + "".join(py[:i])

        cv["name_kwd"] = name
        cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
        cv["name_tks"] = (
            rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
        ) if name else ""
    else:
        cv["integerity_flt"] /= 2.

    if cv.get("phone"):
        r = re.search(r"(1[3456789][0-9]{9})", cv["phone"])
        if not r:
            cv["phone"] = ""
        else:
            cv["phone"] = r.group(1)

    # deal with date fields
    if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime):
        cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
    else:
        y, m, d = getYMD(str(cv.get("updated_at", "")))
        if not y:
            y = "2012"
        if not m:
            m = "01"
        if not d:
            d = "01"
        cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))

    # long text tokenize
    if cv.get("responsibilities"):
        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))

    # for yes or no field
    fea = []
    for f, y, n in is_fld:
        if f not in cv:
            continue
        if cv[f] == '是':
            fea.append(y)
        if cv[f] == '否':
            fea.append(n)

    if fea:
        cv["tag_kwd"] = fea

    cv = forEdu(cv)
    cv = forProj(cv)
    cv = forWork(cv)
    cv = birth(cv)

    # Combine each corp tag with the school rank tag(s) and the highest degree.
    cv["corp_proj_sch_deg_kwd"] = list(cv.get("corp_tag_kwd", []))
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        for j in cv.get("sch_rank_kwd", []):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + j
    for i in range(len(cv["corp_proj_sch_deg_kwd"])):
        if cv.get("highest_degree_kwd"):
            cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]

    try:
        if not cv.get("work_exp_flt") and cv.get("work_start_time"):
            if re.match(r"[0-9]{9,}", str(cv["work_start_time"])):
                cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"])
                cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365.
            elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])):
                y, m, d = getYMD(str(cv["work_start_time"]))
                cv["work_start_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
                cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
    except Exception as e:
        print("【EXCEPTION】", e, "==>", cv.get("work_start_time"))
    if "work_exp_flt" not in cv and cv.get("work_experience", 0):
        cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

    # Keep only keys carrying a known type suffix.
    keys = list(cv.keys())
    for k in keys:
        if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
            del cv[k]
    # De-duplicate list-valued keyword/id fields and strip a trailing '市'.
    for k in cv.keys():
        if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
            continue
        cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
    keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
    for k in keys:
        if cv[k] <= 0:
            del cv[k]

    cv["tob_resume_id"] = str(cv["tob_resume_id"])
    cv["id"] = cv["tob_resume_id"]

    return dealWithInt64(cv)
568
+
569
+
570
def dealWithInt64(d):
    """Recursively replace numpy integers with built-in ``int``.

    Dicts are updated in place (and returned); lists are rebuilt as new
    lists; a numpy integer scalar becomes an ``int``; anything else is
    returned unchanged.
    """
    if isinstance(d, np.integer):
        return int(d)
    if isinstance(d, list):
        return [dealWithInt64(item) for item in d]
    if isinstance(d, dict):
        for key in d:
            d[key] = dealWithInt64(d[key])
    return d
580
+
deepdoc/utils/__init__.py ADDED
File without changes
deepdoc/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (143 Bytes). View file
 
deepdoc/utils/__pycache__/file_utils.cpython-310.pyc ADDED
Binary file (5.67 kB). View file
 
deepdoc/utils/__pycache__/log_utils.cpython-310.pyc ADDED
Binary file (7.41 kB). View file
 
deepdoc/utils/__pycache__/rag_tokenizer.cpython-310.pyc ADDED
Binary file (12.5 kB). View file
 
deepdoc/utils/file_utils.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import base64
17
+ import json
18
+ import os
19
+ import re
20
+ from io import BytesIO
21
+
22
+ import pdfplumber
23
+ from PIL import Image
24
+ from cachetools import LRUCache, cached
25
+ from ruamel.yaml import YAML
26
+ from strenum import StrEnum
27
+
28
class FileType(StrEnum):
    """Coarse file categories; each member's value is its lowercase name."""

    PDF = "pdf"
    DOC = "doc"
    VISUAL = "visual"
    AURAL = "aural"
    VIRTUAL = "virtual"
    FOLDER = "folder"
    OTHER = "other"
36
+
37
+
38
# Base directories, optionally pre-set through the environment. When unset
# (None) they are computed lazily by the directory helpers in this module.
PROJECT_BASE = os.environ.get("RAG_PROJECT_BASE") or os.environ.get("RAG_DEPLOY_BASE")
RAG_BASE = os.environ.get("RAG_BASE")
40
+
41
+
42
def get_project_base_directory(*args):
    """Return the project base directory, optionally joined with ``args``.

    On first use without a preset ``PROJECT_BASE``, the base is resolved to
    the directory two levels above this file and stored in the module global.
    """
    global PROJECT_BASE
    if PROJECT_BASE is None:
        here = os.path.dirname(os.path.realpath(__file__))
        PROJECT_BASE = os.path.abspath(os.path.join(here, os.pardir, os.pardir))

    if not args:
        return PROJECT_BASE
    return os.path.join(PROJECT_BASE, *args)
56
+
57
+
58
def get_rag_directory(*args):
    """Return the RAG base directory, optionally joined with ``args``.

    On first use without a preset ``RAG_BASE``, the base is resolved to the
    directory three levels above this file and stored in the module global.
    """
    global RAG_BASE
    if RAG_BASE is None:
        here = os.path.dirname(os.path.realpath(__file__))
        RAG_BASE = os.path.abspath(os.path.join(here, os.pardir, os.pardir, os.pardir))

    if not args:
        return RAG_BASE
    return os.path.join(RAG_BASE, *args)
72
+
73
+
74
def get_rag_python_directory(*args):
    """Return the ``python`` directory under the RAG base, joined with ``args``."""
    sub_path = ("python",) + args
    return get_rag_directory(*sub_path)
76
+
77
+
78
def get_home_cache_dir():
    """Return ``~/.ragflow``, trying to create it first.

    Creation is best-effort: any ``OSError`` from ``os.mkdir`` (including
    "already exists") is ignored and the path is returned regardless.
    """
    cache_dir = os.path.join(os.path.expanduser('~'), ".ragflow")
    try:
        os.mkdir(cache_dir)
    except OSError:
        # Deliberately ignored; callers only need the path.
        pass
    return cache_dir
85
+
86
+
87
@cached(cache=LRUCache(maxsize=10))
def load_json_conf(conf_path):
    """Load a JSON config file, memoized per ``conf_path``.

    Relative paths are resolved against the project base directory. The
    decorator keeps results in an LRU cache of size 10, so repeated calls
    with the same path return the cached object without re-reading the file.

    :raises EnvironmentError: if the file cannot be opened or parsed; the
        underlying error is attached as ``__cause__``.
    """
    if os.path.isabs(conf_path):
        json_conf_path = conf_path
    else:
        json_conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(json_conf_path) as f:
            return json.load(f)
    # Exception, not BaseException: KeyboardInterrupt/SystemExit must not be
    # rewritten into a config error.
    except Exception as e:
        raise EnvironmentError(
            "loading json file config from '{}' failed!".format(json_conf_path)
        ) from e
100
+
101
+
102
def dump_json_conf(config_data, conf_path):
    """Write ``config_data`` as indented JSON to ``conf_path``.

    Relative paths are resolved against the project base directory.

    :raises EnvironmentError: if the file cannot be written or the data
        cannot be serialized; the underlying error is attached as
        ``__cause__``.
    """
    if os.path.isabs(conf_path):
        json_conf_path = conf_path
    else:
        json_conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(json_conf_path, "w") as f:
            json.dump(config_data, f, indent=4)
    # Exception, not BaseException: do not swallow KeyboardInterrupt/SystemExit.
    except Exception as e:
        # The message previously said "loading", which misreported the operation.
        raise EnvironmentError(
            "dumping json file config to '{}' failed!".format(json_conf_path)
        ) from e
114
+
115
+
116
def load_json_conf_real_time(conf_path):
    """Load a JSON config file from disk on every call (no caching).

    Relative paths are resolved against the project base directory.

    :raises EnvironmentError: if the file cannot be opened or parsed; the
        underlying error is attached as ``__cause__``.
    """
    if os.path.isabs(conf_path):
        json_conf_path = conf_path
    else:
        json_conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(json_conf_path) as f:
            return json.load(f)
    # Exception, not BaseException: do not swallow KeyboardInterrupt/SystemExit.
    except Exception as e:
        raise EnvironmentError(
            "loading json file config from '{}' failed!".format(json_conf_path)
        ) from e
128
+
129
+
130
def load_yaml_conf(conf_path):
    """Load a YAML config file with ruamel's safe, pure-Python loader.

    Relative paths are resolved against the project base directory.

    :raises EnvironmentError: if the file cannot be opened or parsed; the
        underlying error is attached as ``__cause__``.
    """
    if not os.path.isabs(conf_path):
        conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(conf_path) as f:
            yaml = YAML(typ='safe', pure=True)
            return yaml.load(f)
    except Exception as e:
        # A single message argument: EnvironmentError(msg, e) is the
        # (errno, strerror) form of OSError and garbled the report.
        raise EnvironmentError(
            "loading yaml file config from {} failed: {}".format(conf_path, e)
        ) from e
141
+
142
+
143
def rewrite_yaml_conf(conf_path, config):
    """Overwrite a YAML config file with ``config`` using ruamel's safe dumper.

    Relative paths are resolved against the project base directory.

    :raises EnvironmentError: if the file cannot be written or the data
        cannot be dumped; the underlying error is attached as ``__cause__``.
    """
    if not os.path.isabs(conf_path):
        conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(conf_path, "w") as f:
            yaml = YAML(typ="safe")
            yaml.dump(config, f)
    except Exception as e:
        # A single message argument: EnvironmentError(msg, e) is the
        # (errno, strerror) form of OSError and garbled the report.
        raise EnvironmentError(
            "rewrite yaml file config {} failed: {}".format(conf_path, e)
        ) from e
154
+
155
+
156
def rewrite_json_file(filepath, json_data):
    """Overwrite ``filepath`` with ``json_data`` as 4-space indented JSON."""
    # The context manager closes the file; no explicit close() is required.
    with open(filepath, "w") as out:
        json.dump(json_data, out, indent=4, separators=(",", ": "))
160
+
161
+
162
def filename_type(filename):
    """Classify a file by its extension.

    The name is lowercased first, so every extension in the patterns below
    must be lowercase too.

    :return: the ``value`` of the matching ``FileType`` member
        (PDF, DOC, AURAL, VISUAL), or ``FileType.OTHER.value``.
    """
    filename = filename.lower()
    if re.match(r".*\.pdf$", filename):
        return FileType.PDF.value

    if re.match(
            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename):
        return FileType.DOC.value

    # "mp3" was listed twice; one entry is enough.
    if re.match(
            r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename):
        return FileType.AURAL.value

    # "wmf" must be lowercase: the uppercase "WMF" could never match the
    # lowercased filename, so .wmf files fell through to OTHER.
    if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|wmf|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
        return FileType.VISUAL.value

    return FileType.OTHER.value
179
+
180
+
181
def thumbnail(filename, blob):
    """Build a small PNG preview of a file as a base64 ``data:`` URI.

    :param filename: file name; only its (case-insensitive) extension is used.
    :param blob: raw file content as bytes.
    :return: ``"data:image/png;base64,..."`` for PDFs (first page), common
        image formats and PowerPoint files (first slide); ``None`` for any
        other extension or when the PowerPoint rendering fails.
    """
    filename = filename.lower()
    if re.match(r".*\.pdf$", filename):
        buffered = BytesIO()
        # Close the parsed document once the first page has been rendered.
        with pdfplumber.open(BytesIO(blob)) as pdf:
            pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png")
        return "data:image/png;base64," + \
            base64.b64encode(buffered.getvalue()).decode("utf-8")

    if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
        buffered = BytesIO()
        with Image.open(BytesIO(blob)) as image:
            image.thumbnail((30, 30))
            image.save(buffered, format="png")
        return "data:image/png;base64," + \
            base64.b64encode(buffered.getvalue()).decode("utf-8")

    if re.match(r".*\.(ppt|pptx)$", filename):
        # Imported lazily so the dependency is only needed for slide decks.
        import aspose.slides as slides
        import aspose.pydrawing as drawing
        try:
            with slides.Presentation(BytesIO(blob)) as presentation:
                buffered = BytesIO()
                presentation.slides[0].get_thumbnail(0.03, 0.03).save(
                    buffered, drawing.imaging.ImageFormat.png)
                return "data:image/png;base64," + \
                    base64.b64encode(buffered.getvalue()).decode("utf-8")
        except Exception:
            # Best-effort: a deck that cannot be rendered has no thumbnail.
            return None

    return None
210
+
211
+
212
def traversal_files(base):
    """Yield the path of every file found by walking ``base`` with ``os.walk``."""
    for dirpath, _dirnames, filenames in os.walk(base):
        yield from (os.path.join(dirpath, name) for name in filenames)
deepdoc/utils/log_utils.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import os
17
+ import typing
18
+ import traceback
19
+ import logging
20
+ import inspect
21
+ from logging.handlers import TimedRotatingFileHandler
22
+ from threading import RLock
23
+
24
+ from deepdoc.utils import file_utils
25
+
26
+
27
+ class LoggerFactory(object):
28
+ TYPE = "FILE"
29
+ LOG_FORMAT = "[%(levelname)s] [%(asctime)s] [%(module)s.%(funcName)s] [line:%(lineno)d]: %(message)s"
30
+ logging.basicConfig(format=LOG_FORMAT)
31
+ LEVEL = logging.DEBUG
32
+ logger_dict = {}
33
+ global_handler_dict = {}
34
+
35
+ LOG_DIR = None
36
+ PARENT_LOG_DIR = None
37
+ log_share = True
38
+
39
+ append_to_parent_log = None
40
+
41
+ lock = RLock()
42
+ # CRITICAL = 50
43
+ # FATAL = CRITICAL
44
+ # ERROR = 40
45
+ # WARNING = 30
46
+ # WARN = WARNING
47
+ # INFO = 20
48
+ # DEBUG = 10
49
+ # NOTSET = 0
50
+ levels = (10, 20, 30, 40)
51
+ schedule_logger_dict = {}
52
+
53
+ @staticmethod
54
+ def set_directory(directory=None, parent_log_dir=None,
55
+ append_to_parent_log=None, force=False):
56
+ if parent_log_dir:
57
+ LoggerFactory.PARENT_LOG_DIR = parent_log_dir
58
+ if append_to_parent_log:
59
+ LoggerFactory.append_to_parent_log = append_to_parent_log
60
+ with LoggerFactory.lock:
61
+ if not directory:
62
+ directory = file_utils.get_project_base_directory("logs")
63
+ if not LoggerFactory.LOG_DIR or force:
64
+ LoggerFactory.LOG_DIR = directory
65
+ if LoggerFactory.log_share:
66
+ oldmask = os.umask(000)
67
+ os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True)
68
+ os.umask(oldmask)
69
+ else:
70
+ os.makedirs(LoggerFactory.LOG_DIR, exist_ok=True)
71
+ for loggerName, ghandler in LoggerFactory.global_handler_dict.items():
72
+ for className, (logger,
73
+ handler) in LoggerFactory.logger_dict.items():
74
+ logger.removeHandler(ghandler)
75
+ ghandler.close()
76
+ LoggerFactory.global_handler_dict = {}
77
+ for className, (logger,
78
+ handler) in LoggerFactory.logger_dict.items():
79
+ logger.removeHandler(handler)
80
+ _handler = None
81
+ if handler:
82
+ handler.close()
83
+ if className != "default":
84
+ _handler = LoggerFactory.get_handler(className)
85
+ logger.addHandler(_handler)
86
+ LoggerFactory.assemble_global_handler(logger)
87
+ LoggerFactory.logger_dict[className] = logger, _handler
88
+
89
+ @staticmethod
90
+ def new_logger(name):
91
+ logger = logging.getLogger(name)
92
+ logger.propagate = False
93
+ logger.setLevel(LoggerFactory.LEVEL)
94
+ return logger
95
+
96
+ @staticmethod
97
+ def get_logger(class_name=None):
98
+ with LoggerFactory.lock:
99
+ if class_name in LoggerFactory.logger_dict.keys():
100
+ logger, handler = LoggerFactory.logger_dict[class_name]
101
+ if not logger:
102
+ logger, handler = LoggerFactory.init_logger(class_name)
103
+ else:
104
+ logger, handler = LoggerFactory.init_logger(class_name)
105
+ return logger
106
+
107
+ @staticmethod
108
+ def get_global_handler(logger_name, level=None, log_dir=None):
109
+ if not LoggerFactory.LOG_DIR:
110
+ return logging.StreamHandler()
111
+ if log_dir:
112
+ logger_name_key = logger_name + "_" + log_dir
113
+ else:
114
+ logger_name_key = logger_name + "_" + LoggerFactory.LOG_DIR
115
+ # if loggerName not in LoggerFactory.globalHandlerDict:
116
+ if logger_name_key not in LoggerFactory.global_handler_dict:
117
+ with LoggerFactory.lock:
118
+ if logger_name_key not in LoggerFactory.global_handler_dict:
119
+ handler = LoggerFactory.get_handler(
120
+ logger_name, level, log_dir)
121
+ LoggerFactory.global_handler_dict[logger_name_key] = handler
122
+ return LoggerFactory.global_handler_dict[logger_name_key]
123
+
124
@staticmethod
def get_handler(class_name, level=None, log_dir=None,
                log_type=None, job_id=None):
    """Create a log handler for a class logger or for a typed job log.

    Without ``log_type``: returns a ``StreamHandler`` when
    ``LoggerFactory.LOG_DIR`` or ``class_name`` is falsy, otherwise the
    target file is ``<log_dir or LOG_DIR>/<class_name>.log``.

    With ``log_type``: the target file is ``<log_dir>/rag_flow_<log_type>.log``
    when ``level`` equals ``LoggerFactory.LEVEL`` and
    ``<log_dir>/rag_flow_<log_type>_error.log`` otherwise.

    The file handler rotates daily and keeps 14 backups; ``ROpenHandler`` is
    used when ``LoggerFactory.log_share`` is truthy. ``job_id`` is accepted
    but not used in this function.
    """
    if log_type:
        if level == LoggerFactory.LEVEL:
            file_name = "rag_flow_{}.log".format(log_type)
        else:
            file_name = 'rag_flow_{}_error.log'.format(log_type)
        log_file = os.path.join(log_dir, file_name)
    else:
        if not (LoggerFactory.LOG_DIR and class_name):
            return logging.StreamHandler()
        target_dir = log_dir if log_dir else LoggerFactory.LOG_DIR
        log_file = os.path.join(target_dir, "{}.log".format(class_name))

    os.makedirs(os.path.dirname(log_file), exist_ok=True)

    if LoggerFactory.log_share:
        handler_cls = ROpenHandler
    else:
        handler_cls = TimedRotatingFileHandler
    handler = handler_cls(log_file,
                          when='D',
                          interval=1,
                          backupCount=14,
                          delay=True)
    if level:
        handler.level = level
    return handler
159
+
160
@staticmethod
def init_logger(class_name):
    """Create, register and return ``(logger, handler)`` for ``class_name``.

    With a truthy ``class_name`` a dedicated handler from ``get_handler`` is
    attached and the pair is stored under that name. Otherwise no dedicated
    handler is created (``handler`` is ``None``) and the pair is stored under
    the key ``"default"``. The global handlers are attached in both cases.
    """
    with LoggerFactory.lock:
        logger = LoggerFactory.new_logger(class_name)
        if not class_name:
            handler = None
            registry_key = "default"
        else:
            handler = LoggerFactory.get_handler(class_name)
            logger.addHandler(handler)
            registry_key = class_name
        LoggerFactory.logger_dict[registry_key] = (logger, handler)

        LoggerFactory.assemble_global_handler(logger)
        return logger, handler
175
+
176
@staticmethod
def assemble_global_handler(logger):
    """Attach the shared per-level handlers to ``logger``.

    For every level in ``LoggerFactory.levels`` that is at least
    ``LoggerFactory.LEVEL`` a global handler named after the level is added:
    first those for ``LOG_DIR`` (when set), then those for ``PARENT_LOG_DIR``
    (when ``append_to_parent_log`` and ``PARENT_LOG_DIR`` are both truthy).
    """
    def _attach(*target_dir):
        # target_dir is empty for LOG_DIR, or holds the explicit directory.
        for lvl in LoggerFactory.levels:
            if lvl < LoggerFactory.LEVEL:
                continue
            lvl_name = logging._levelToName[lvl]
            logger.addHandler(
                LoggerFactory.get_global_handler(lvl_name, lvl, *target_dir))

    if LoggerFactory.LOG_DIR:
        _attach()
    if LoggerFactory.append_to_parent_log and LoggerFactory.PARENT_LOG_DIR:
        _attach(LoggerFactory.PARENT_LOG_DIR)
191
+
192
+
193
def setDirectory(directory=None):
    """Module-level shortcut that forwards ``directory`` to
    ``LoggerFactory.set_directory``."""
    LoggerFactory.set_directory(directory)
195
+
196
+
197
def setLevel(level):
    """Store ``level`` as ``LoggerFactory.LEVEL``.

    This function only assigns the class attribute; it does not itself touch
    loggers or handlers that already exist.
    """
    LoggerFactory.LEVEL = level
199
+
200
+
201
def getLogger(className=None, useLevelFile=False):
    """Return the factory logger for ``className``.

    When ``className`` is ``None`` the logger named ``'stat'`` is used.
    ``useLevelFile`` is accepted for backward compatibility and is not used.

    The previous implementation walked the call stack with
    ``inspect.stack()`` / ``inspect.getmodule()`` and then discarded the
    result; that dead (and comparatively expensive) work has been removed.
    """
    if className is None:
        className = 'stat'
    return LoggerFactory.get_logger(className)
207
+
208
+
209
def exception_to_trace_string(ex):
    """Render exception ``ex`` (with its traceback, if any) as one string."""
    rendered = traceback.TracebackException.from_exception(ex)
    return "".join(rendered.format())
211
+
212
+
213
class ROpenHandler(TimedRotatingFileHandler):
    """Timed rotating file handler that opens its file with umask 0.

    The process umask is cleared only for the duration of the open call so
    that the permissions of a newly created log file are not reduced by it.
    """

    def _open(self):
        """Open the log stream with the umask temporarily set to 0.

        The previous umask is restored in a ``finally`` clause, so a failing
        open can no longer leave the whole process running with umask 0.
        """
        prevumask = os.umask(0)
        try:
            return super()._open()
        finally:
            os.umask(prevumask)
219
+
220
+
221
def sql_logger(job_id='', log_type='sql'):
    """Return the logger cached under ``job_id + log_type``.

    When ``LoggerFactory.schedule_logger_dict`` has no entry for that key the
    logger is obtained from ``get_job_logger``.
    """
    cache = LoggerFactory.schedule_logger_dict
    cache_key = job_id + log_type
    if cache_key not in cache.keys():
        return get_job_logger(job_id=job_id, log_type=log_type)
    return cache[cache_key]
226
+
227
+
228
def ready_log(msg, job=None, task=None, role=None, party_id=None, detail=None):
    """Format ``"<prefix><msg> ready<suffix>"``.

    Prefix and suffix are produced by ``base_msg`` from the same
    job/task/role/party_id/detail arguments.
    """
    head, tail = base_msg(job, task, role, party_id, detail)
    return "{}{} ready{}".format(head, msg, tail)
231
+
232
+
233
def start_log(msg, job=None, task=None, role=None, party_id=None, detail=None):
    """Format ``"<prefix>start to <msg><suffix>"``.

    Prefix and suffix are produced by ``base_msg`` from the same
    job/task/role/party_id/detail arguments.
    """
    head, tail = base_msg(job, task, role, party_id, detail)
    return "{}start to {}{}".format(head, msg, tail)
236
+
237
+
238
def successful_log(msg, job=None, task=None, role=None,
                   party_id=None, detail=None):
    """Format ``"<prefix><msg> successfully<suffix>"``.

    Prefix and suffix are produced by ``base_msg`` from the same
    job/task/role/party_id/detail arguments.
    """
    head, tail = base_msg(job, task, role, party_id, detail)
    return "{}{} successfully{}".format(head, msg, tail)
242
+
243
+
244
def warning_log(msg, job=None, task=None, role=None,
                party_id=None, detail=None):
    """Format ``"<prefix><msg> is not effective<suffix>"``.

    Prefix and suffix are produced by ``base_msg`` from the same
    job/task/role/party_id/detail arguments.
    """
    head, tail = base_msg(job, task, role, party_id, detail)
    return "{}{} is not effective{}".format(head, msg, tail)
248
+
249
+
250
def failed_log(msg, job=None, task=None, role=None,
               party_id=None, detail=None):
    """Format ``"<prefix>failed to <msg><suffix>"``.

    Prefix and suffix are produced by ``base_msg`` from the same
    job/task/role/party_id/detail arguments.
    """
    head, tail = base_msg(job, task, role, party_id, detail)
    return "{}failed to {}{}".format(head, msg, tail)
254
+
255
+
256
def base_msg(job=None, task=None, role: str = None,
             party_id: typing.Union[str, int] = None, detail=None):
    """Build the ``(prefix, suffix)`` pair shared by the ``*_log`` helpers.

    Precedence is ``task``, then ``job``, then the ``role``/``party_id``
    pair (used only when both are truthy). A truthy ``detail`` is appended
    to the suffix as ``" detail: \\n<detail>"``. Only the ``task`` form
    yields a non-empty prefix.
    """
    detail_msg = f" detail: \n{detail}" if detail else ""

    if task is not None:
        prefix = f"task {task.f_task_id} {task.f_task_version} "
        suffix = f" on {task.f_role} {task.f_party_id}{detail_msg}"
        return prefix, suffix
    if job is not None:
        return "", f" on {job.f_role} {job.f_party_id}{detail_msg}"
    if role and party_id:
        return "", f" on {role} {party_id}{detail_msg}"
    return "", f"{detail_msg}"
270
+
271
+
272
# NOTE: this repeats, with an identical body, the definition of
# exception_to_trace_string that appears earlier in this module.
def exception_to_trace_string(ex):
    """Render exception ``ex`` (with its traceback, if any) as one string."""
    rendered = traceback.TracebackException.from_exception(ex)
    return "".join(rendered.format())
274
+
275
+
276
def get_logger_base_dir():
    """Return what ``file_utils.get_rag_flow_directory('logs')`` yields."""
    return file_utils.get_rag_flow_directory('logs')
279
+
280
+
281
def get_job_logger(job_id, log_type):
    """Create, cache and return the logger for ``job_id`` / ``log_type``.

    Target directories:
      * no ``job_id``            -> the shared ``logs/rag_flow`` directory;
      * ``log_type == 'audit'``  -> the job directory and the shared one;
      * otherwise                -> the job directory only.

    Each target directory gets two handlers from ``LoggerFactory.get_handler``:
    one at ``LoggerFactory.LEVEL`` and one at ``logging.ERROR``. The logger is
    stored in ``LoggerFactory.schedule_logger_dict`` under
    ``job_id + log_type``.

    Changes from the previous version: the umask is restored in a ``finally``
    clause when ``log_share`` is on (a failing ``makedirs`` used to leave the
    process with umask 0), and the loop no longer reuses the name
    ``job_log_dir`` for its iteration variable.
    """
    rag_flow_log_dir = file_utils.get_rag_flow_directory('logs', 'rag_flow')
    job_log_dir = file_utils.get_rag_flow_directory('logs', job_id)
    if not job_id:
        log_dirs = [rag_flow_log_dir]
    elif log_type == 'audit':
        log_dirs = [job_log_dir, rag_flow_log_dir]
    else:
        log_dirs = [job_log_dir]

    if LoggerFactory.log_share:
        # Clear the umask only while the directories are being created.
        oldmask = os.umask(0)
        try:
            os.makedirs(job_log_dir, exist_ok=True)
            os.makedirs(rag_flow_log_dir, exist_ok=True)
        finally:
            os.umask(oldmask)
    else:
        os.makedirs(job_log_dir, exist_ok=True)
        os.makedirs(rag_flow_log_dir, exist_ok=True)

    logger = LoggerFactory.new_logger(f"{job_id}_{log_type}")
    for target_dir in log_dirs:
        handler = LoggerFactory.get_handler(
            class_name=None,
            level=LoggerFactory.LEVEL,
            log_dir=target_dir,
            log_type=log_type,
            job_id=job_id)
        error_handler = LoggerFactory.get_handler(
            class_name=None,
            level=logging.ERROR,
            log_dir=target_dir,
            log_type=log_type,
            job_id=job_id)
        logger.addHandler(handler)
        logger.addHandler(error_handler)

    with LoggerFactory.lock:
        LoggerFactory.schedule_logger_dict[job_id + log_type] = logger
    return logger
deepdoc/utils/rag_tokenizer.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import copy
4
+ import datrie
5
+ import math
6
+ import os
7
+ import re
8
+ import string
9
+ import sys
10
+ from hanziconv import HanziConv
11
+ from huggingface_hub import snapshot_download
12
+ from nltk import word_tokenize
13
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
14
+ from deepdoc.utils.file_utils import get_project_base_directory
15
+
16
+
17
class RagTokenizer:
    """Trie-backed tokenizer for text that mixes Chinese and Latin script.

    The dictionary is held in a ``datrie.Trie``. Forward keys (``key_``) map
    to ``(log_frequency, tag)`` tuples; reversed keys (``rkey_``) map to ``1``
    and are only used for prefix checks during backward matching.
    """

    def key_(self, line):
        """Return the trie key for ``line``.

        The text is lower-cased, UTF-8 encoded and rendered through the
        ``bytes`` repr (minus the ``b'`` prefix and closing quote), which
        turns non-ASCII characters into printable escape sequences.
        """
        return str(line.lower().encode("utf-8"))[2:-1]

    def rkey_(self, line):
        """Return the reversed-text trie key for ``line``, prefixed ``DD``."""
        return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]

    def loadDict_(self, fnm):
        """Add the entries of dictionary file ``fnm`` to the trie and save it.

        Every line must hold at least three space/tab separated fields:
        term, frequency, tag. An entry replaces an existing one only when its
        log-frequency is higher. After reading, the trie is written to
        ``fnm + ".trie"``.

        Any error is reported on stderr and aborts the load (best effort, as
        before). The file is now read as UTF-8 inside a ``with`` block, so
        the handle is closed even when a line fails to parse.
        """
        print("[HUQIE]:Build trie", fnm, file=sys.stderr)
        try:
            with open(fnm, "r", encoding="utf-8") as of:
                for raw in of:
                    fields = re.split(r"[ \t]", re.sub(r"[\r\n]+", "", raw))
                    k = self.key_(fields[0])
                    F = int(math.log(float(fields[1]) / self.DENOMINATOR) + .5)
                    if k not in self.trie_ or self.trie_[k][0] < F:
                        self.trie_[k] = (F, fields[2])
                    self.trie_[self.rkey_(fields[0])] = 1
            self.trie_.save(fnm + ".trie")
        except Exception as e:
            print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)

    def __init__(self, debug=False):
        """Load the default dictionary.

        A prebuilt trie is loaded from ``<DIR_>.txt.trie`` when possible;
        otherwise the trie is built from ``<DIR_>.txt``.

        Args:
            debug: when true, intermediate results are printed.
        """
        self.DEBUG = debug
        self.DENOMINATOR = 1000000
        self.trie_ = datrie.Trie(string.printable)
        self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")

        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

        self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
        try:
            self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
            return
        except Exception:
            print("[HUQIE]:Build default trie", file=sys.stderr)
            self.trie_ = datrie.Trie(string.printable)

        self.loadDict_(self.DIR_ + ".txt")

    def loadUserDict(self, fnm):
        """Replace the trie with the dictionary ``fnm``.

        ``fnm + ".trie"`` is loaded when possible; otherwise a fresh trie is
        built from ``fnm`` itself.
        """
        try:
            self.trie_ = datrie.Trie.load(fnm + ".trie")
            return
        except Exception:
            self.trie_ = datrie.Trie(string.printable)
        self.loadDict_(fnm)

    def addUserDict(self, fnm):
        """Merge the entries of dictionary file ``fnm`` into the current trie."""
        self.loadDict_(fnm)

    def _strQ2B(self, ustring):
        """Convert full-width characters in ``ustring`` to half-width."""
        converted = []
        for uchar in ustring:
            inside_code = ord(uchar)
            if inside_code == 0x3000:
                inside_code = 0x0020
            else:
                inside_code -= 0xfee0
            # If the result is not a half-width character, keep the original.
            if inside_code < 0x0020 or inside_code > 0x7e:
                converted.append(uchar)
            else:
                converted.append(chr(inside_code))
        return "".join(converted)

    def _tradi2simp(self, line):
        """Convert traditional Chinese characters in ``line`` to simplified."""
        return HanziConv.toSimplified(line)

    def dfs_(self, chars, s, preTks, tkslist):
        """Enumerate dictionary segmentations of ``chars[s:]`` depth-first.

        Args:
            chars: the text being segmented.
            s: index at which the next token starts.
            preTks: ``(token, trie_value)`` pairs chosen so far; may be
                extended in place when no dictionary word starts at ``s``.
            tkslist: output list; each complete segmentation is appended.

        Returns:
            The largest index reached by any explored branch.
        """
        res = s
        if s >= len(chars):
            tkslist.append(preTks)
            return res

        # Pruning: S is the first candidate end index tried below. Raising it
        # to s + 2 skips the one-character candidate.
        S = s + 1
        if s + 2 <= len(chars):
            t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2])
            if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix(
                    self.key_(t2)):
                S = s + 2
        if len(preTks) > 2 and len(
                preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1:
            t1 = preTks[-1][0] + "".join(chars[s:s + 1])
            if self.trie_.has_keys_with_prefix(self.key_(t1)):
                S = s + 2

        for e in range(S, len(chars) + 1):
            t = "".join(chars[s:e])
            k = self.key_(t)

            if e > s + 1 and not self.trie_.has_keys_with_prefix(k):
                break

            if k in self.trie_:
                # The entries are immutable tuples, so a shallow copy of the
                # path is enough to keep the branches independent.
                pretks = list(preTks)
                pretks.append((t, self.trie_[k]))
                res = max(res, self.dfs_(chars, e, pretks, tkslist))

        if res > s:
            return res

        # No dictionary word starts here: emit the single character (with a
        # fixed low score when it is unknown) and continue after it.
        t = "".join(chars[s:s + 1])
        k = self.key_(t)
        if k in self.trie_:
            preTks.append((t, self.trie_[k]))
        else:
            preTks.append((t, (-12, '')))

        return self.dfs_(chars, s + 1, preTks, tkslist)

    def freq(self, tk):
        """Return the frequency recovered from the stored log value, or 0
        when ``tk`` is not in the dictionary."""
        k = self.key_(tk)
        if k not in self.trie_:
            return 0
        return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5)

    def tag(self, tk):
        """Return the tag stored for ``tk``, or ``""`` when it is unknown."""
        k = self.key_(tk)
        if k not in self.trie_:
            return ""
        return self.trie_[k][1]

    def score_(self, tfts):
        """Score one segmentation.

        Args:
            tfts: non-empty iterable of ``(token, (freq, tag))`` pairs.

        Returns:
            ``(tokens, score)`` where the score is
            ``30 / n + share_of_tokens_with_len>=2 + mean_freq``.
        """
        B = 30
        F, L, tks = 0, 0, []
        for tk, (freq, tag) in tfts:
            F += freq
            L += 0 if len(tk) < 2 else 1
            tks.append(tk)
        F /= len(tks)
        L /= len(tks)
        if self.DEBUG:
            print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
        return tks, B / len(tks) + L + F

    def sortTks_(self, tkslist):
        """Score every segmentation in ``tkslist``; return them best first."""
        res = [self.score_(tfts) for tfts in tkslist]
        return sorted(res, key=lambda x: x[1], reverse=True)

    def merge_(self, tks):
        """Re-join adjacent tokens that form a dictionary term.

        Up to five neighbouring tokens are concatenated; the longest
        concatenation that contains a ``SPLIT_CHAR`` match and has a non-zero
        dictionary frequency replaces its parts.
        """
        res = []
        tks = re.sub(r"[ ]+", " ", tks).split(" ")
        s = 0
        while s < len(tks):
            E = s + 1
            for e in range(s + 2, min(len(tks) + 2, s + 6)):
                tk = "".join(tks[s:e])
                if re.search(self.SPLIT_CHAR, tk) and self.freq(tk):
                    E = e
            res.append("".join(tks[s:E]))
            s = E

        return " ".join(res)

    def maxForward_(self, line):
        """Segment ``line`` left to right, taking the longest dictionary
        match at each position; return ``score_`` of the result."""
        res = []
        s = 0
        while s < len(line):
            e = s + 1
            t = line[s:e]
            # Grow while some dictionary key still starts with t ...
            while e < len(line) and self.trie_.has_keys_with_prefix(
                    self.key_(t)):
                e += 1
                t = line[s:e]

            # ... then shrink back to an actual key or a single character.
            while e - 1 > s and self.key_(t) not in self.trie_:
                e -= 1
                t = line[s:e]

            if self.key_(t) in self.trie_:
                res.append((t, self.trie_[self.key_(t)]))
            else:
                res.append((t, (0, '')))

            s = e

        return self.score_(res)

    def maxBackward_(self, line):
        """Segment ``line`` right to left, taking the longest dictionary
        match at each position; return ``score_`` of the result."""
        res = []
        s = len(line) - 1
        while s >= 0:
            e = s + 1
            t = line[s:e]
            # Grow leftwards while the reversed-key index has a prefix match.
            while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)):
                s -= 1
                t = line[s:e]

            while s + 1 < e and self.key_(t) not in self.trie_:
                s += 1
                t = line[s:e]

            if self.key_(t) in self.trie_:
                res.append((t, self.trie_[self.key_(t)]))
            else:
                res.append((t, (0, '')))

            s -= 1

        return self.score_(res[::-1])

    def tokenize(self, line):
        """Tokenize ``line`` and return the tokens joined by single spaces.

        The text is normalised (full-width to half-width, lower-case,
        traditional to simplified). If fewer than 20% of its characters pass
        ``is_chinese`` it is handled by NLTK (``word_tokenize``, lemmatize,
        stem). Otherwise it is split on ``SPLIT_CHAR``; each segment is
        matched forwards and backwards, and the stretches where the two
        results disagree are re-segmented with ``dfs_``.
        """
        line = self._strQ2B(line).lower()
        line = self._tradi2simp(line)
        zh_num = sum(1 for c in line if is_chinese(c))
        if zh_num < len(line) * 0.2:
            return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)])

        arr = re.split(self.SPLIT_CHAR, line)
        res = []
        for L in arr:
            if len(L) < 2 or re.match(
                    r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
                res.append(L)
                continue

            tks, s = self.maxForward_(L)
            tks1, s1 = self.maxBackward_(L)
            if self.DEBUG:
                print("[FW]", tks, s)
                print("[BW]", tks1, s1)

            # diff[i] == 1 marks positions where the two results differ.
            diff = [0 for _ in range(max(len(tks1), len(tks)))]
            for i in range(min(len(tks1), len(tks))):
                if tks[i] != tks1[i]:
                    diff[i] = 1

            # Continue with whichever segmentation scored higher.
            if s1 > s:
                tks = tks1

            i = 0
            while i < len(tks):
                s = i
                while s < len(tks) and diff[s] == 0:
                    s += 1
                if s == len(tks):
                    res.append(" ".join(tks[i:]))
                    break
                if s > i:
                    res.append(" ".join(tks[i:s]))

                # Collect up to five consecutive disputed tokens, plus the
                # token that follows them, and segment that text again.
                e = s
                while e < len(tks) and e - s < 5 and diff[e] == 1:
                    e += 1

                tkslist = []
                self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist)
                res.append(" ".join(self.sortTks_(tkslist)[0][0]))

                i = e + 1

        res = " ".join(res)
        if self.DEBUG:
            print("[TKS]", self.merge_(res))
        return self.merge_(res)

    def fine_grained_tokenize(self, tks):
        """Split the tokens of a space-joined token string more finely.

        If fewer than 20% of the tokens start with a character accepted by
        ``is_chinese``, tokens are only split on ``/``. Otherwise tokens of
        3 to 10 characters that are not purely numeric are re-segmented with
        ``dfs_`` and replaced by the second-best segmentation, when one
        exists.
        """
        tks = tks.split(" ")
        zh_num = sum(1 for c in tks if c and is_chinese(c[0]))
        if zh_num < len(tks) * 0.2:
            res = []
            for tk in tks:
                res.extend(tk.split("/"))
            return " ".join(res)

        res = []
        for tk in tks:
            if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
                res.append(tk)
                continue
            tkslist = []
            if len(tk) > 10:
                tkslist.append(tk)
            else:
                self.dfs_(tk, 0, [], tkslist)
            if len(tkslist) < 2:
                res.append(tk)
                continue
            stk = self.sortTks_(tkslist)[1][0]
            if len(stk) == len(tk):
                # Every piece is a single character: keep the token whole.
                stk = tk
            else:
                if re.match(r"[a-z\.-]+$", tk):
                    # Latin token: keep it whole if any piece is shorter
                    # than three characters.
                    for t in stk:
                        if len(t) < 3:
                            stk = tk
                            break
                    else:
                        stk = " ".join(stk)
                else:
                    stk = " ".join(stk)

            res.append(stk)

        return " ".join(res)
340
+
341
+
342
def is_chinese(s):
    """Return True when ``s`` compares within ``'\\u4e00'``..``'\\u9fa5'``."""
    return bool(u'\u4e00' <= s <= u'\u9fa5')
347
+
348
+
349
def is_number(s):
    """Return True when ``s`` compares within ``'0'``..``'9'``."""
    return bool(u'\u0030' <= s <= u'\u0039')
354
+
355
+
356
def is_alphabet(s):
    """Return True when ``s`` compares within ``'A'``..``'Z'`` or
    ``'a'``..``'z'``."""
    is_upper = u'\u0041' <= s <= u'\u005a'
    is_lower = u'\u0061' <= s <= u'\u007a'
    return bool(is_upper or is_lower)
362
+
363
+
364
def naiveQie(txt):
    """Split ``txt`` on single spaces into a list of pieces.

    A ``" "`` element is inserted between two consecutive pieces that both
    end with an ASCII letter; all other separators are dropped.
    """
    ends_with_letter = re.compile(r".*[a-zA-Z]$")
    pieces = []
    for word in txt.split(" "):
        if pieces and ends_with_letter.match(pieces[-1]) and ends_with_letter.match(word):
            pieces.append(" ")
        pieces.append(word)
    return pieces
372
+
373
+
374
# Module-level singleton, created at import time. Constructing it loads (or
# builds) the default dictionary trie.
tokenizer = RagTokenizer()
# Function-style aliases bound to the singleton's methods.
tokenize = tokenizer.tokenize
fine_grained_tokenize = tokenizer.fine_grained_tokenize
tag = tokenizer.tag
freq = tokenizer.freq
loadUserDict = tokenizer.loadUserDict
addUserDict = tokenizer.addUserDict
tradi2simp = tokenizer._tradi2simp
strQ2B = tokenizer._strQ2B
383
+
384
if __name__ == '__main__':
    # Demo / smoke test: tokenize a set of sample sentences with debug
    # output, then optionally tokenize a file line by line.
    #
    # Usage: <script> [USER_DICT INPUT_FILE]
    tknzr = RagTokenizer(debug=True)
    samples = (
        "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈",
        "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。",
        "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥",
        "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa",
        "虽然我不怎么玩",
        "蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的",
        "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了",
        "这周日你去吗?这周日你有空吗?",
        "Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ",
        "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-",
    )
    for text in samples:
        tks = tknzr.tokenize(text)
        print(tknzr.fine_grained_tokenize(tks))

    # Both the user dictionary (argv[1]) and the input file (argv[2]) are
    # needed below. The old check (`< 2`) let a single argument through and
    # then failed with IndexError on sys.argv[2].
    if len(sys.argv) < 3:
        sys.exit()
    tknzr.DEBUG = False
    tknzr.loadUserDict(sys.argv[1])
    with open(sys.argv[2], "r") as of:
        for line in of:
            print(tknzr.tokenize(line))
deepdoc/vision/__init__.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+
3
+ from .ocr import OCR
4
+ from .recognizer import Recognizer
5
+ from .layout_recognizer import LayoutRecognizer
6
+ from .table_structure_recognizer import TableStructureRecognizer
7
+
8
+
9
def init_in_out(args):
    """Load the input images and derive one output path per image.

    Args:
        args: object with ``inputs`` (a file or a directory) and
            ``output_dir`` attributes.

    Returns:
        ``(images, outputs)``: two lists of equal length. A PDF contributes
        one rendered image per page, named ``<file>_<page index>.jpg``; any
        other file is opened with PIL and keeps its own file name. Files PIL
        cannot open are skipped after printing the traceback. All output
        names are joined onto ``args.output_dir``.

    Changes from the previous version:
      * PDF pages are appended to ``images``. They used to replace the whole
        list, which dropped images loaded from earlier files and left
        ``images`` and ``outputs`` misaligned for directory inputs.
      * The PDF is closed once its pages have been rendered.
      * The output directory is created with ``os.makedirs`` so nested paths
        work and a concurrent creation is not an error.
    """
    from PIL import Image
    import os
    import traceback
    from deepdoc.utils.file_utils import traversal_files
    images = []
    outputs = []

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    def pdf_pages(fnm, zoomin=3):
        # Render every page at 72 * zoomin DPI.
        with pdfplumber.open(fnm) as pdf:
            pages = [p.to_image(resolution=72 * zoomin).annotated
                     for p in pdf.pages]
        base_name = os.path.split(fnm)[-1]
        images.extend(pages)
        outputs.extend(base_name + f"_{i}.jpg" for i in range(len(pages)))

    def images_and_outputs(fnm):
        if fnm.split(".")[-1].lower() == "pdf":
            pdf_pages(fnm)
            return
        try:
            images.append(Image.open(fnm))
            outputs.append(os.path.split(fnm)[-1])
        except Exception:
            # Best effort: report the unreadable file and carry on.
            traceback.print_exc()

    if os.path.isdir(args.inputs):
        for fnm in traversal_files(args.inputs):
            images_and_outputs(fnm)
    else:
        images_and_outputs(args.inputs)

    outputs = [os.path.join(args.output_dir, name) for name in outputs]

    return images, outputs
deepdoc/vision/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.81 kB). View file
 
deepdoc/vision/__pycache__/layout_recognizer.cpython-310.pyc ADDED
Binary file (4.82 kB). View file
 
deepdoc/vision/__pycache__/ocr.cpython-310.pyc ADDED
Binary file (16.8 kB). View file
 
deepdoc/vision/__pycache__/operators.cpython-310.pyc ADDED
Binary file (20.2 kB). View file