BAAI
/

MonteXiaofeng commited on
Commit
35b54f7
·
verified ·
1 Parent(s): 98d5982

Upload files

Browse files
.DS_Store ADDED
Binary file (8.2 kB). View file
 
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": 0.1,
9
+ "eos_token_id": 2,
10
+ "finetuning_task": "text-classification",
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.0,
13
+ "hidden_size": 1024,
14
+ "id2label": {
15
+ "0": "\u4ea4\u901a\u8fd0\u8f93",
16
+ "1": "\u4eba\u5de5\u667a\u80fd_\u673a\u5668\u5b66\u4e60",
17
+ "2": "\u4f4f\u5bbf_\u9910\u996e_\u9152\u5e97",
18
+ "3": "\u4f53\u80b2",
19
+ "4": "\u5176\u4ed6",
20
+ "5": "\u5176\u4ed6\u4fe1\u606f\u670d\u52a1_\u4fe1\u606f\u5b89\u5168",
21
+ "6": "\u5176\u4ed6\u5236\u9020",
22
+ "7": "\u519c\u6797\u7267\u6e14",
23
+ "8": "\u533b\u5b66_\u5065\u5eb7_\u5fc3\u7406_\u4e2d\u533b",
24
+ "9": "\u5b66\u79d1\u6559\u80b2_\u6559\u80b2",
25
+ "10": "\u5f71\u89c6_\u5a31\u4e50",
26
+ "11": "\u623f\u5730\u4ea7_\u5efa\u7b51",
27
+ "12": "\u6570\u5b66_\u7edf\u8ba1\u5b66",
28
+ "13": "\u6587\u5b66_\u60c5\u611f",
29
+ "14": "\u65b0\u95fb\u4f20\u5a92",
30
+ "15": "\u65c5\u6e38_\u5730\u7406",
31
+ "16": "\u65f6\u653f_\u653f\u52a1_\u884c\u653f",
32
+ "17": "\u6c34\u5229_\u6d77\u6d0b",
33
+ "18": "\u6c7d\u8f66",
34
+ "19": "\u6cd5\u5f8b_\u53f8\u6cd5",
35
+ "20": "\u6d88\u9632\u5b89\u5168_\u98df\u54c1\u5b89\u5168",
36
+ "21": "\u6e38\u620f",
37
+ "22": "\u751f\u7269\u533b\u836f",
38
+ "23": "\u7535\u529b\u80fd\u6e90",
39
+ "24": "\u77f3\u6cb9\u5316\u5de5",
40
+ "25": "\u79d1\u6280_\u79d1\u5b66\u7814\u7a76",
41
+ "26": "\u822a\u7a7a\u822a\u5929",
42
+ "27": "\u8ba1\u7b97\u673a_\u901a\u4fe1",
43
+ "28": "\u8ba1\u7b97\u673a\u7f16\u7a0b_\u4ee3\u7801",
44
+ "29": "\u91c7\u77ff",
45
+ "30": "\u91d1\u878d_\u7ecf\u6d4e"
46
+ },
47
+ "initializer_range": 0.02,
48
+ "intermediate_size": 4096,
49
+ "label2id": {
50
+ "\u4ea4\u901a\u8fd0\u8f93": 0,
51
+ "\u4eba\u5de5\u667a\u80fd_\u673a\u5668\u5b66\u4e60": 1,
52
+ "\u4f4f\u5bbf_\u9910\u996e_\u9152\u5e97": 2,
53
+ "\u4f53\u80b2": 3,
54
+ "\u5176\u4ed6": 4,
55
+ "\u5176\u4ed6\u4fe1\u606f\u670d\u52a1_\u4fe1\u606f\u5b89\u5168": 5,
56
+ "\u5176\u4ed6\u5236\u9020": 6,
57
+ "\u519c\u6797\u7267\u6e14": 7,
58
+ "\u533b\u5b66_\u5065\u5eb7_\u5fc3\u7406_\u4e2d\u533b": 8,
59
+ "\u5b66\u79d1\u6559\u80b2_\u6559\u80b2": 9,
60
+ "\u5f71\u89c6_\u5a31\u4e50": 10,
61
+ "\u623f\u5730\u4ea7_\u5efa\u7b51": 11,
62
+ "\u6570\u5b66_\u7edf\u8ba1\u5b66": 12,
63
+ "\u6587\u5b66_\u60c5\u611f": 13,
64
+ "\u65b0\u95fb\u4f20\u5a92": 14,
65
+ "\u65c5\u6e38_\u5730\u7406": 15,
66
+ "\u65f6\u653f_\u653f\u52a1_\u884c\u653f": 16,
67
+ "\u6c34\u5229_\u6d77\u6d0b": 17,
68
+ "\u6c7d\u8f66": 18,
69
+ "\u6cd5\u5f8b_\u53f8\u6cd5": 19,
70
+ "\u6d88\u9632\u5b89\u5168_\u98df\u54c1\u5b89\u5168": 20,
71
+ "\u6e38\u620f": 21,
72
+ "\u751f\u7269\u533b\u836f": 22,
73
+ "\u7535\u529b\u80fd\u6e90": 23,
74
+ "\u77f3\u6cb9\u5316\u5de5": 24,
75
+ "\u79d1\u6280_\u79d1\u5b66\u7814\u7a76": 25,
76
+ "\u822a\u7a7a\u822a\u5929": 26,
77
+ "\u8ba1\u7b97\u673a_\u901a\u4fe1": 27,
78
+ "\u8ba1\u7b97\u673a\u7f16\u7a0b_\u4ee3\u7801": 28,
79
+ "\u91c7\u77ff": 29,
80
+ "\u91d1\u878d_\u7ecf\u6d4e": 30
81
+ },
82
+ "layer_norm_eps": 1e-05,
83
+ "max_position_embeddings": 8194,
84
+ "model_type": "xlm-roberta",
85
+ "num_attention_heads": 16,
86
+ "num_hidden_layers": 24,
87
+ "output_past": true,
88
+ "pad_token_id": 1,
89
+ "position_embedding_type": "absolute",
90
+ "problem_type": "single_label_classification",
91
+ "torch_dtype": "float32",
92
+ "transformers_version": "4.39.0",
93
+ "type_vocab_size": 1,
94
+ "use_cache": true,
95
+ "vocab_size": 250002
96
+ }
img/.DS_Store ADDED
Binary file (6.15 kB). View file
 
img/classify.png ADDED
img/classify_exp.png ADDED
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0597708ae15fdcce27850e947653857d31f2e161a0ff46c53185ad58b644908c
3
+ size 2271194860
readme.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 模型是数据集[BAAI/IndustryCorpus2](https://huggingface.co/datasets/BAAI/IndustryCorpus2)中用来进行行业分类分类
2
+
3
+ 模型细节:
4
+
5
+ 为了提升数据集中行业划分对实际行业的覆盖,并对齐国家标准中定义的行业目录,我们参考国家统计局制定的国民经济行业分类体系和世界知识体系,进行类目的合并和整合,设计了覆盖中英文的最终的31个行业类目。类目表名称如下所示
6
+
7
+ ```
8
+ {
9
+ "数学_统计": {"zh": "数学与统计", "en": "Math & Statistics"},
10
+ "体育": {"zh": "体育", "en": "Sports"},
11
+ "农林牧渔": {"zh": "农业与渔业", "en": "Agriculture & Fisheries"},
12
+ "房地产_建筑": {"zh": "房地产与建筑", "en": "Real Estate & Construction"},
13
+ "时政_政务_行政": {"zh": "政治与行政", "en": "Politics & Administration"},
14
+ "消防安全_食品安全": {"zh": "安全管理", "en": "Safety Management"},
15
+ "石油化工": {"zh": "石油化工", "en": "Petrochemicals"},
16
+ "计算机_通信": {"zh": "计算机与通信", "en": "Computing & Telecommunications"},
17
+ "交通运输": {"zh": "交通运输", "en": "Transportation"},
18
+ "其他": {"zh": "其他", "en": "Others"},
19
+ "医学_健康_心理_中医": {"zh": "健康与医学", "en": "Health & Medicine"},
20
+ "文学_情感": {"zh": "文学与情感", "en": "Literature & Emotions"},
21
+ "水利_海洋": {"zh": "水利与海洋", "en": "Water Resources & Marine"},
22
+ "游戏": {"zh": "游戏", "en": "Gaming"},
23
+ "科技_科学研究": {"zh": "科技与研究", "en": "Technology & Research"},
24
+ "采矿": {"zh": "采矿", "en": "Mining"},
25
+ "人工智能_机器学习": {"zh": "人工智能", "en": "Artificial Intelligence"},
26
+ "其他信息服务_信息安全": {"zh": "信息服务", "en": "Information Services"},
27
+ "学科教育_教育": {"zh": "学科教育", "en": "Subject Education"},
28
+ "新闻传媒": {"zh": "新闻传媒", "en": "Media & Journalism"},
29
+ "汽车": {"zh": "汽车", "en": "Automobiles"},
30
+ "生物医药": {"zh": "生物医药", "en": "Biopharmaceuticals"},
31
+ "航空航天": {"zh": "航空航天", "en": "Aerospace"},
32
+ "金融_经济": {"zh": "金融与经济", "en": "Finance & Economics"},
33
+ "住宿_餐饮_酒店": {"zh": "住宿与餐饮", "en": "Hospitality & Catering"},
34
+ "其他制造": {"zh": "制造业", "en": "Manufacturing"},
35
+ "影视_娱乐": {"zh": "影视与娱乐", "en": "Film & Entertainment"},
36
+ "旅游_地理": {"zh": "旅游与地理", "en": "Travel & Geography"},
37
+ "法律_司法": {"zh": "法律与司法", "en": "Law & Justice"},
38
+ "电力能源": {"zh": "电力与能源", "en": "Power & Energy"},
39
+ "计算机编程_代码": {"zh": "编程", "en": "Programming"},
40
+ }
41
+ ```
42
+
43
+ - 行业分类模型的数据构造
44
+
45
+ - 数据构建
46
+
47
+ 数据来源:预训练预训练语料抽样和开源文本分类数据,其中预训练语料占比90%,通过数据采样,保证中英文数据占比为1:1
48
+
49
+ 标签构造:使用LLM模型对数据进行多次分类判定,筛选多次判定一致的数据作为训练数据
50
+
51
+ 数据规模:36K
52
+
53
+ 数据构造的整体流程如下:
54
+
55
+ ![image-20240919140307205](./img/classify.png)
56
+
57
+ - 模型训练:
58
+
59
+ 参数更新:在预训练的bert模型上添加分类头进行文本分类模型训练
60
+
61
+ 模型选型:考虑的模型性能和推理效率,我们选用了0.5b规模的模型,通过对比实验最终最终选择了bge-m3并全参数训练的方式,作为我们的基座模型
62
+
63
+ 训练超参:全参数训练,max_length = 2048,lr=1e-5,batch_size=64,,验证集评估acc:86%
64
+
65
+ ![image-20240919141408659](./img/classify_exp.png)
66
+
67
+ ###
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6710678b12670bc442b99edc952c4d996ae309a7020c1fa0096dd245c2faf790
3
+ size 17082821
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "mask_token": "<mask>",
49
+ "model_max_length": 8192,
50
+ "pad_token": "<pad>",
51
+ "sep_token": "</s>",
52
+ "sp_model_kwargs": {},
53
+ "tokenizer_class": "XLMRobertaTokenizer",
54
+ "unk_token": "<unk>"
55
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16a60324779f4561552c1e524383f6591f97b80e0adc5098445a110a72360d23
3
+ size 5240