bond005 committed
Commit 191d1d2
1 parent: 76d3271

Upload 13 files

config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "_name_or_path": "/home/bond005/competitions/SHROOM/models/xlm-roberta-xl-hallucination-detector",
+   "architectures": [
+     "XLMRobertaXLForHierarchicalSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "auto_map": {
+     "AutoConfig": "modeling_hierarchical_classifier.HierarchicalXLMRobertaXLConfig",
+     "AutoModelForSequenceClassification": "modeling_hierarchical_classifier.XLMRobertaXLForHierarchicalSequenceClassification",
+     "AutoModelForTextEncoding": "modeling_hierarchical_classifier.XLMRobertaXLForHierarchicalEmbedding"
+   },
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 2560,
+   "id2label": {
+     "0": "Not Hallucination",
+     "1": "Hallucination"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 10240,
+   "label2id": {
+     "Hallucination": 1,
+     "Not Hallucination": 0
+   },
+   "label_smoothing": null,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "hierarchical-xlm-roberta-xl",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 36,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "temperature": 0.1,
+   "tokenizer_class": "XLMRobertaTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 250880
+ }
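
Note: the auto_map above routes the Auto* classes to the custom code in modeling_hierarchical_classifier.py, so this model must be loaded with trust_remote_code=True. A minimal loading sketch; the hub repo id is an assumption inferred from the committer and the model directory name, not something stated in this commit:

from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo_id = "bond005/xlm-roberta-xl-hallucination-detector"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(
    repo_id, trust_remote_code=True  # required because of auto_map above
)
print(model.config.id2label)  # {0: 'Not Hallucination', 1: 'Hallucination'}
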
handler.py ADDED
@@ -0,0 +1,31 @@
+ import torch
+
+ from typing import Any, Dict, Union
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # load model and tokenizer from path
+         self.tokenizer = AutoTokenizer.from_pretrained(path)
+         self.model = AutoModelForSequenceClassification.from_pretrained(
+             path, device_map="auto", trust_remote_code=True
+         )
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[str, float]]:
+         # process input
+         inputs = data.pop("inputs", data)
+
+         # preprocess
+         inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
+
+         # pass inputs with all kwargs in data
+         logits = self.model(**inputs)[0]
+
+         # postprocess the prediction
+         predicted_class_id = int(torch.argmax(logits, dim=-1))
+         predicted_score = float(logits[0, predicted_class_id])
+         predicted_label = str(self.model.config.id2label[predicted_class_id])
+
+         return {'label': predicted_label, 'score': predicted_score}
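
Note: a usage sketch for the handler above, with a hypothetical local path; the payload shape follows the {"inputs": ...} convention the handler expects, and the returned 'score' is the raw logit of the predicted class, not a softmax probability:

handler = EndpointHandler(path="./xlm-roberta-xl-hallucination-detector")  # hypothetical local checkout
result = handler({"inputs": "Paris is the capital of France."})
print(result)  # e.g. {'label': 'Not Hallucination', 'score': 3.2} (illustrative values)
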
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e154c6173ccc9cdc2d208fe3f4fe7177342302feaee2dce36434eca703da8a35
+ size 4958767392
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e88a4c1d5be403e7a7421b564383c9b9afada7240716110d8711f3a1deb90ea
+ size 2006286972
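
Note: both .safetensors entries are Git LFS pointer files; the repository stores only the spec version, the SHA-256 of the real shard, and its byte size. A small sketch for checking a downloaded shard against the oid above (assumes the real shard, not the pointer, is in the working directory):

import hashlib

def sha256_of(path: str) -> str:
    # stream the file in 1 MiB chunks to avoid loading a multi-GB shard into memory
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

print(sha256_of("model-00002-of-00002.safetensors")
      == "9e88a4c1d5be403e7a7421b564383c9b9afada7240716110d8711f3a1deb90ea")
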
model.safetensors.index.json ADDED
@@ -0,0 +1,593 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 6964981836
4
+ },
5
+ "weight_map": {
6
+ "classifier.dense.bias": "model-00002-of-00002.safetensors",
7
+ "classifier.dense.weight": "model-00002-of-00002.safetensors",
8
+ "classifier.out_proj.bias": "model-00002-of-00002.safetensors",
9
+ "classifier.out_proj.weight": "model-00002-of-00002.safetensors",
10
+ "layer_weights.weight": "model-00002-of-00002.safetensors",
11
+ "roberta.embeddings.position_embeddings.weight": "model-00001-of-00002.safetensors",
12
+ "roberta.embeddings.token_type_embeddings.weight": "model-00001-of-00002.safetensors",
13
+ "roberta.embeddings.word_embeddings.weight": "model-00001-of-00002.safetensors",
14
+ "roberta.encoder.LayerNorm.bias": "model-00002-of-00002.safetensors",
15
+ "roberta.encoder.LayerNorm.weight": "model-00002-of-00002.safetensors",
16
+ "roberta.encoder.layer.0.LayerNorm.bias": "model-00001-of-00002.safetensors",
17
+ "roberta.encoder.layer.0.LayerNorm.weight": "model-00001-of-00002.safetensors",
18
+ "roberta.encoder.layer.0.attention.output.dense.bias": "model-00001-of-00002.safetensors",
19
+ "roberta.encoder.layer.0.attention.output.dense.weight": "model-00001-of-00002.safetensors",
20
+ "roberta.encoder.layer.0.attention.self.key.bias": "model-00001-of-00002.safetensors",
21
+ "roberta.encoder.layer.0.attention.self.key.weight": "model-00001-of-00002.safetensors",
22
+ "roberta.encoder.layer.0.attention.self.query.bias": "model-00001-of-00002.safetensors",
23
+ "roberta.encoder.layer.0.attention.self.query.weight": "model-00001-of-00002.safetensors",
24
+ "roberta.encoder.layer.0.attention.self.value.bias": "model-00001-of-00002.safetensors",
25
+ "roberta.encoder.layer.0.attention.self.value.weight": "model-00001-of-00002.safetensors",
26
+ "roberta.encoder.layer.0.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
27
+ "roberta.encoder.layer.0.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
28
+ "roberta.encoder.layer.0.intermediate.dense.bias": "model-00001-of-00002.safetensors",
29
+ "roberta.encoder.layer.0.intermediate.dense.weight": "model-00001-of-00002.safetensors",
30
+ "roberta.encoder.layer.0.output.dense.bias": "model-00001-of-00002.safetensors",
31
+ "roberta.encoder.layer.0.output.dense.weight": "model-00001-of-00002.safetensors",
32
+ "roberta.encoder.layer.1.LayerNorm.bias": "model-00001-of-00002.safetensors",
33
+ "roberta.encoder.layer.1.LayerNorm.weight": "model-00001-of-00002.safetensors",
34
+ "roberta.encoder.layer.1.attention.output.dense.bias": "model-00001-of-00002.safetensors",
35
+ "roberta.encoder.layer.1.attention.output.dense.weight": "model-00001-of-00002.safetensors",
36
+ "roberta.encoder.layer.1.attention.self.key.bias": "model-00001-of-00002.safetensors",
37
+ "roberta.encoder.layer.1.attention.self.key.weight": "model-00001-of-00002.safetensors",
38
+ "roberta.encoder.layer.1.attention.self.query.bias": "model-00001-of-00002.safetensors",
39
+ "roberta.encoder.layer.1.attention.self.query.weight": "model-00001-of-00002.safetensors",
40
+ "roberta.encoder.layer.1.attention.self.value.bias": "model-00001-of-00002.safetensors",
41
+ "roberta.encoder.layer.1.attention.self.value.weight": "model-00001-of-00002.safetensors",
42
+ "roberta.encoder.layer.1.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
43
+ "roberta.encoder.layer.1.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
44
+ "roberta.encoder.layer.1.intermediate.dense.bias": "model-00001-of-00002.safetensors",
45
+ "roberta.encoder.layer.1.intermediate.dense.weight": "model-00001-of-00002.safetensors",
46
+ "roberta.encoder.layer.1.output.dense.bias": "model-00001-of-00002.safetensors",
47
+ "roberta.encoder.layer.1.output.dense.weight": "model-00001-of-00002.safetensors",
48
+ "roberta.encoder.layer.10.LayerNorm.bias": "model-00001-of-00002.safetensors",
49
+ "roberta.encoder.layer.10.LayerNorm.weight": "model-00001-of-00002.safetensors",
50
+ "roberta.encoder.layer.10.attention.output.dense.bias": "model-00001-of-00002.safetensors",
51
+ "roberta.encoder.layer.10.attention.output.dense.weight": "model-00001-of-00002.safetensors",
52
+ "roberta.encoder.layer.10.attention.self.key.bias": "model-00001-of-00002.safetensors",
53
+ "roberta.encoder.layer.10.attention.self.key.weight": "model-00001-of-00002.safetensors",
54
+ "roberta.encoder.layer.10.attention.self.query.bias": "model-00001-of-00002.safetensors",
55
+ "roberta.encoder.layer.10.attention.self.query.weight": "model-00001-of-00002.safetensors",
56
+ "roberta.encoder.layer.10.attention.self.value.bias": "model-00001-of-00002.safetensors",
57
+ "roberta.encoder.layer.10.attention.self.value.weight": "model-00001-of-00002.safetensors",
58
+ "roberta.encoder.layer.10.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
59
+ "roberta.encoder.layer.10.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
60
+ "roberta.encoder.layer.10.intermediate.dense.bias": "model-00001-of-00002.safetensors",
61
+ "roberta.encoder.layer.10.intermediate.dense.weight": "model-00001-of-00002.safetensors",
62
+ "roberta.encoder.layer.10.output.dense.bias": "model-00001-of-00002.safetensors",
63
+ "roberta.encoder.layer.10.output.dense.weight": "model-00001-of-00002.safetensors",
64
+ "roberta.encoder.layer.11.LayerNorm.bias": "model-00001-of-00002.safetensors",
65
+ "roberta.encoder.layer.11.LayerNorm.weight": "model-00001-of-00002.safetensors",
66
+ "roberta.encoder.layer.11.attention.output.dense.bias": "model-00001-of-00002.safetensors",
67
+ "roberta.encoder.layer.11.attention.output.dense.weight": "model-00001-of-00002.safetensors",
68
+ "roberta.encoder.layer.11.attention.self.key.bias": "model-00001-of-00002.safetensors",
69
+ "roberta.encoder.layer.11.attention.self.key.weight": "model-00001-of-00002.safetensors",
70
+ "roberta.encoder.layer.11.attention.self.query.bias": "model-00001-of-00002.safetensors",
71
+ "roberta.encoder.layer.11.attention.self.query.weight": "model-00001-of-00002.safetensors",
72
+ "roberta.encoder.layer.11.attention.self.value.bias": "model-00001-of-00002.safetensors",
73
+ "roberta.encoder.layer.11.attention.self.value.weight": "model-00001-of-00002.safetensors",
74
+ "roberta.encoder.layer.11.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
75
+ "roberta.encoder.layer.11.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
76
+ "roberta.encoder.layer.11.intermediate.dense.bias": "model-00001-of-00002.safetensors",
77
+ "roberta.encoder.layer.11.intermediate.dense.weight": "model-00001-of-00002.safetensors",
78
+ "roberta.encoder.layer.11.output.dense.bias": "model-00001-of-00002.safetensors",
79
+ "roberta.encoder.layer.11.output.dense.weight": "model-00001-of-00002.safetensors",
80
+ "roberta.encoder.layer.12.LayerNorm.bias": "model-00001-of-00002.safetensors",
81
+ "roberta.encoder.layer.12.LayerNorm.weight": "model-00001-of-00002.safetensors",
82
+ "roberta.encoder.layer.12.attention.output.dense.bias": "model-00001-of-00002.safetensors",
83
+ "roberta.encoder.layer.12.attention.output.dense.weight": "model-00001-of-00002.safetensors",
84
+ "roberta.encoder.layer.12.attention.self.key.bias": "model-00001-of-00002.safetensors",
85
+ "roberta.encoder.layer.12.attention.self.key.weight": "model-00001-of-00002.safetensors",
86
+ "roberta.encoder.layer.12.attention.self.query.bias": "model-00001-of-00002.safetensors",
87
+ "roberta.encoder.layer.12.attention.self.query.weight": "model-00001-of-00002.safetensors",
88
+ "roberta.encoder.layer.12.attention.self.value.bias": "model-00001-of-00002.safetensors",
89
+ "roberta.encoder.layer.12.attention.self.value.weight": "model-00001-of-00002.safetensors",
90
+ "roberta.encoder.layer.12.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
91
+ "roberta.encoder.layer.12.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
92
+ "roberta.encoder.layer.12.intermediate.dense.bias": "model-00001-of-00002.safetensors",
93
+ "roberta.encoder.layer.12.intermediate.dense.weight": "model-00001-of-00002.safetensors",
94
+ "roberta.encoder.layer.12.output.dense.bias": "model-00001-of-00002.safetensors",
95
+ "roberta.encoder.layer.12.output.dense.weight": "model-00001-of-00002.safetensors",
96
+ "roberta.encoder.layer.13.LayerNorm.bias": "model-00001-of-00002.safetensors",
97
+ "roberta.encoder.layer.13.LayerNorm.weight": "model-00001-of-00002.safetensors",
98
+ "roberta.encoder.layer.13.attention.output.dense.bias": "model-00001-of-00002.safetensors",
99
+ "roberta.encoder.layer.13.attention.output.dense.weight": "model-00001-of-00002.safetensors",
100
+ "roberta.encoder.layer.13.attention.self.key.bias": "model-00001-of-00002.safetensors",
101
+ "roberta.encoder.layer.13.attention.self.key.weight": "model-00001-of-00002.safetensors",
102
+ "roberta.encoder.layer.13.attention.self.query.bias": "model-00001-of-00002.safetensors",
103
+ "roberta.encoder.layer.13.attention.self.query.weight": "model-00001-of-00002.safetensors",
104
+ "roberta.encoder.layer.13.attention.self.value.bias": "model-00001-of-00002.safetensors",
105
+ "roberta.encoder.layer.13.attention.self.value.weight": "model-00001-of-00002.safetensors",
106
+ "roberta.encoder.layer.13.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
107
+ "roberta.encoder.layer.13.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
108
+ "roberta.encoder.layer.13.intermediate.dense.bias": "model-00001-of-00002.safetensors",
109
+ "roberta.encoder.layer.13.intermediate.dense.weight": "model-00001-of-00002.safetensors",
110
+ "roberta.encoder.layer.13.output.dense.bias": "model-00001-of-00002.safetensors",
111
+ "roberta.encoder.layer.13.output.dense.weight": "model-00001-of-00002.safetensors",
112
+ "roberta.encoder.layer.14.LayerNorm.bias": "model-00001-of-00002.safetensors",
113
+ "roberta.encoder.layer.14.LayerNorm.weight": "model-00001-of-00002.safetensors",
114
+ "roberta.encoder.layer.14.attention.output.dense.bias": "model-00001-of-00002.safetensors",
115
+ "roberta.encoder.layer.14.attention.output.dense.weight": "model-00001-of-00002.safetensors",
116
+ "roberta.encoder.layer.14.attention.self.key.bias": "model-00001-of-00002.safetensors",
117
+ "roberta.encoder.layer.14.attention.self.key.weight": "model-00001-of-00002.safetensors",
118
+ "roberta.encoder.layer.14.attention.self.query.bias": "model-00001-of-00002.safetensors",
119
+ "roberta.encoder.layer.14.attention.self.query.weight": "model-00001-of-00002.safetensors",
120
+ "roberta.encoder.layer.14.attention.self.value.bias": "model-00001-of-00002.safetensors",
121
+ "roberta.encoder.layer.14.attention.self.value.weight": "model-00001-of-00002.safetensors",
122
+ "roberta.encoder.layer.14.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
123
+ "roberta.encoder.layer.14.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
124
+ "roberta.encoder.layer.14.intermediate.dense.bias": "model-00001-of-00002.safetensors",
125
+ "roberta.encoder.layer.14.intermediate.dense.weight": "model-00001-of-00002.safetensors",
126
+ "roberta.encoder.layer.14.output.dense.bias": "model-00001-of-00002.safetensors",
127
+ "roberta.encoder.layer.14.output.dense.weight": "model-00001-of-00002.safetensors",
128
+ "roberta.encoder.layer.15.LayerNorm.bias": "model-00001-of-00002.safetensors",
129
+ "roberta.encoder.layer.15.LayerNorm.weight": "model-00001-of-00002.safetensors",
130
+ "roberta.encoder.layer.15.attention.output.dense.bias": "model-00001-of-00002.safetensors",
131
+ "roberta.encoder.layer.15.attention.output.dense.weight": "model-00001-of-00002.safetensors",
132
+ "roberta.encoder.layer.15.attention.self.key.bias": "model-00001-of-00002.safetensors",
133
+ "roberta.encoder.layer.15.attention.self.key.weight": "model-00001-of-00002.safetensors",
134
+ "roberta.encoder.layer.15.attention.self.query.bias": "model-00001-of-00002.safetensors",
135
+ "roberta.encoder.layer.15.attention.self.query.weight": "model-00001-of-00002.safetensors",
136
+ "roberta.encoder.layer.15.attention.self.value.bias": "model-00001-of-00002.safetensors",
137
+ "roberta.encoder.layer.15.attention.self.value.weight": "model-00001-of-00002.safetensors",
138
+ "roberta.encoder.layer.15.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
139
+ "roberta.encoder.layer.15.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
140
+ "roberta.encoder.layer.15.intermediate.dense.bias": "model-00001-of-00002.safetensors",
141
+ "roberta.encoder.layer.15.intermediate.dense.weight": "model-00001-of-00002.safetensors",
142
+ "roberta.encoder.layer.15.output.dense.bias": "model-00001-of-00002.safetensors",
143
+ "roberta.encoder.layer.15.output.dense.weight": "model-00001-of-00002.safetensors",
144
+ "roberta.encoder.layer.16.LayerNorm.bias": "model-00001-of-00002.safetensors",
145
+ "roberta.encoder.layer.16.LayerNorm.weight": "model-00001-of-00002.safetensors",
146
+ "roberta.encoder.layer.16.attention.output.dense.bias": "model-00001-of-00002.safetensors",
147
+ "roberta.encoder.layer.16.attention.output.dense.weight": "model-00001-of-00002.safetensors",
148
+ "roberta.encoder.layer.16.attention.self.key.bias": "model-00001-of-00002.safetensors",
149
+ "roberta.encoder.layer.16.attention.self.key.weight": "model-00001-of-00002.safetensors",
150
+ "roberta.encoder.layer.16.attention.self.query.bias": "model-00001-of-00002.safetensors",
151
+ "roberta.encoder.layer.16.attention.self.query.weight": "model-00001-of-00002.safetensors",
152
+ "roberta.encoder.layer.16.attention.self.value.bias": "model-00001-of-00002.safetensors",
153
+ "roberta.encoder.layer.16.attention.self.value.weight": "model-00001-of-00002.safetensors",
154
+ "roberta.encoder.layer.16.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
155
+ "roberta.encoder.layer.16.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
156
+ "roberta.encoder.layer.16.intermediate.dense.bias": "model-00001-of-00002.safetensors",
157
+ "roberta.encoder.layer.16.intermediate.dense.weight": "model-00001-of-00002.safetensors",
158
+ "roberta.encoder.layer.16.output.dense.bias": "model-00001-of-00002.safetensors",
159
+ "roberta.encoder.layer.16.output.dense.weight": "model-00001-of-00002.safetensors",
160
+ "roberta.encoder.layer.17.LayerNorm.bias": "model-00001-of-00002.safetensors",
161
+ "roberta.encoder.layer.17.LayerNorm.weight": "model-00001-of-00002.safetensors",
162
+ "roberta.encoder.layer.17.attention.output.dense.bias": "model-00001-of-00002.safetensors",
163
+ "roberta.encoder.layer.17.attention.output.dense.weight": "model-00001-of-00002.safetensors",
164
+ "roberta.encoder.layer.17.attention.self.key.bias": "model-00001-of-00002.safetensors",
165
+ "roberta.encoder.layer.17.attention.self.key.weight": "model-00001-of-00002.safetensors",
166
+ "roberta.encoder.layer.17.attention.self.query.bias": "model-00001-of-00002.safetensors",
167
+ "roberta.encoder.layer.17.attention.self.query.weight": "model-00001-of-00002.safetensors",
168
+ "roberta.encoder.layer.17.attention.self.value.bias": "model-00001-of-00002.safetensors",
169
+ "roberta.encoder.layer.17.attention.self.value.weight": "model-00001-of-00002.safetensors",
170
+ "roberta.encoder.layer.17.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
171
+ "roberta.encoder.layer.17.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
172
+ "roberta.encoder.layer.17.intermediate.dense.bias": "model-00001-of-00002.safetensors",
173
+ "roberta.encoder.layer.17.intermediate.dense.weight": "model-00001-of-00002.safetensors",
174
+ "roberta.encoder.layer.17.output.dense.bias": "model-00001-of-00002.safetensors",
175
+ "roberta.encoder.layer.17.output.dense.weight": "model-00001-of-00002.safetensors",
176
+ "roberta.encoder.layer.18.LayerNorm.bias": "model-00001-of-00002.safetensors",
177
+ "roberta.encoder.layer.18.LayerNorm.weight": "model-00001-of-00002.safetensors",
178
+ "roberta.encoder.layer.18.attention.output.dense.bias": "model-00001-of-00002.safetensors",
179
+ "roberta.encoder.layer.18.attention.output.dense.weight": "model-00001-of-00002.safetensors",
180
+ "roberta.encoder.layer.18.attention.self.key.bias": "model-00001-of-00002.safetensors",
181
+ "roberta.encoder.layer.18.attention.self.key.weight": "model-00001-of-00002.safetensors",
182
+ "roberta.encoder.layer.18.attention.self.query.bias": "model-00001-of-00002.safetensors",
183
+ "roberta.encoder.layer.18.attention.self.query.weight": "model-00001-of-00002.safetensors",
184
+ "roberta.encoder.layer.18.attention.self.value.bias": "model-00001-of-00002.safetensors",
185
+ "roberta.encoder.layer.18.attention.self.value.weight": "model-00001-of-00002.safetensors",
186
+ "roberta.encoder.layer.18.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
187
+ "roberta.encoder.layer.18.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
188
+ "roberta.encoder.layer.18.intermediate.dense.bias": "model-00001-of-00002.safetensors",
189
+ "roberta.encoder.layer.18.intermediate.dense.weight": "model-00001-of-00002.safetensors",
190
+ "roberta.encoder.layer.18.output.dense.bias": "model-00001-of-00002.safetensors",
191
+ "roberta.encoder.layer.18.output.dense.weight": "model-00001-of-00002.safetensors",
192
+ "roberta.encoder.layer.19.LayerNorm.bias": "model-00001-of-00002.safetensors",
193
+ "roberta.encoder.layer.19.LayerNorm.weight": "model-00001-of-00002.safetensors",
194
+ "roberta.encoder.layer.19.attention.output.dense.bias": "model-00001-of-00002.safetensors",
195
+ "roberta.encoder.layer.19.attention.output.dense.weight": "model-00001-of-00002.safetensors",
196
+ "roberta.encoder.layer.19.attention.self.key.bias": "model-00001-of-00002.safetensors",
197
+ "roberta.encoder.layer.19.attention.self.key.weight": "model-00001-of-00002.safetensors",
198
+ "roberta.encoder.layer.19.attention.self.query.bias": "model-00001-of-00002.safetensors",
199
+ "roberta.encoder.layer.19.attention.self.query.weight": "model-00001-of-00002.safetensors",
200
+ "roberta.encoder.layer.19.attention.self.value.bias": "model-00001-of-00002.safetensors",
201
+ "roberta.encoder.layer.19.attention.self.value.weight": "model-00001-of-00002.safetensors",
202
+ "roberta.encoder.layer.19.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
203
+ "roberta.encoder.layer.19.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
204
+ "roberta.encoder.layer.19.intermediate.dense.bias": "model-00001-of-00002.safetensors",
205
+ "roberta.encoder.layer.19.intermediate.dense.weight": "model-00001-of-00002.safetensors",
206
+ "roberta.encoder.layer.19.output.dense.bias": "model-00001-of-00002.safetensors",
207
+ "roberta.encoder.layer.19.output.dense.weight": "model-00001-of-00002.safetensors",
208
+ "roberta.encoder.layer.2.LayerNorm.bias": "model-00001-of-00002.safetensors",
209
+ "roberta.encoder.layer.2.LayerNorm.weight": "model-00001-of-00002.safetensors",
210
+ "roberta.encoder.layer.2.attention.output.dense.bias": "model-00001-of-00002.safetensors",
211
+ "roberta.encoder.layer.2.attention.output.dense.weight": "model-00001-of-00002.safetensors",
212
+ "roberta.encoder.layer.2.attention.self.key.bias": "model-00001-of-00002.safetensors",
213
+ "roberta.encoder.layer.2.attention.self.key.weight": "model-00001-of-00002.safetensors",
214
+ "roberta.encoder.layer.2.attention.self.query.bias": "model-00001-of-00002.safetensors",
215
+ "roberta.encoder.layer.2.attention.self.query.weight": "model-00001-of-00002.safetensors",
216
+ "roberta.encoder.layer.2.attention.self.value.bias": "model-00001-of-00002.safetensors",
217
+ "roberta.encoder.layer.2.attention.self.value.weight": "model-00001-of-00002.safetensors",
218
+ "roberta.encoder.layer.2.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
219
+ "roberta.encoder.layer.2.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
220
+ "roberta.encoder.layer.2.intermediate.dense.bias": "model-00001-of-00002.safetensors",
221
+ "roberta.encoder.layer.2.intermediate.dense.weight": "model-00001-of-00002.safetensors",
222
+ "roberta.encoder.layer.2.output.dense.bias": "model-00001-of-00002.safetensors",
223
+ "roberta.encoder.layer.2.output.dense.weight": "model-00001-of-00002.safetensors",
224
+ "roberta.encoder.layer.20.LayerNorm.bias": "model-00001-of-00002.safetensors",
225
+ "roberta.encoder.layer.20.LayerNorm.weight": "model-00001-of-00002.safetensors",
226
+ "roberta.encoder.layer.20.attention.output.dense.bias": "model-00001-of-00002.safetensors",
227
+ "roberta.encoder.layer.20.attention.output.dense.weight": "model-00001-of-00002.safetensors",
228
+ "roberta.encoder.layer.20.attention.self.key.bias": "model-00001-of-00002.safetensors",
229
+ "roberta.encoder.layer.20.attention.self.key.weight": "model-00001-of-00002.safetensors",
230
+ "roberta.encoder.layer.20.attention.self.query.bias": "model-00001-of-00002.safetensors",
231
+ "roberta.encoder.layer.20.attention.self.query.weight": "model-00001-of-00002.safetensors",
232
+ "roberta.encoder.layer.20.attention.self.value.bias": "model-00001-of-00002.safetensors",
233
+ "roberta.encoder.layer.20.attention.self.value.weight": "model-00001-of-00002.safetensors",
234
+ "roberta.encoder.layer.20.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
235
+ "roberta.encoder.layer.20.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
236
+ "roberta.encoder.layer.20.intermediate.dense.bias": "model-00001-of-00002.safetensors",
237
+ "roberta.encoder.layer.20.intermediate.dense.weight": "model-00001-of-00002.safetensors",
238
+ "roberta.encoder.layer.20.output.dense.bias": "model-00001-of-00002.safetensors",
239
+ "roberta.encoder.layer.20.output.dense.weight": "model-00001-of-00002.safetensors",
240
+ "roberta.encoder.layer.21.LayerNorm.bias": "model-00001-of-00002.safetensors",
241
+ "roberta.encoder.layer.21.LayerNorm.weight": "model-00001-of-00002.safetensors",
242
+ "roberta.encoder.layer.21.attention.output.dense.bias": "model-00001-of-00002.safetensors",
243
+ "roberta.encoder.layer.21.attention.output.dense.weight": "model-00001-of-00002.safetensors",
244
+ "roberta.encoder.layer.21.attention.self.key.bias": "model-00001-of-00002.safetensors",
245
+ "roberta.encoder.layer.21.attention.self.key.weight": "model-00001-of-00002.safetensors",
246
+ "roberta.encoder.layer.21.attention.self.query.bias": "model-00001-of-00002.safetensors",
247
+ "roberta.encoder.layer.21.attention.self.query.weight": "model-00001-of-00002.safetensors",
248
+ "roberta.encoder.layer.21.attention.self.value.bias": "model-00001-of-00002.safetensors",
249
+ "roberta.encoder.layer.21.attention.self.value.weight": "model-00001-of-00002.safetensors",
250
+ "roberta.encoder.layer.21.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
251
+ "roberta.encoder.layer.21.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
252
+ "roberta.encoder.layer.21.intermediate.dense.bias": "model-00001-of-00002.safetensors",
253
+ "roberta.encoder.layer.21.intermediate.dense.weight": "model-00001-of-00002.safetensors",
254
+ "roberta.encoder.layer.21.output.dense.bias": "model-00001-of-00002.safetensors",
255
+ "roberta.encoder.layer.21.output.dense.weight": "model-00001-of-00002.safetensors",
256
+ "roberta.encoder.layer.22.LayerNorm.bias": "model-00001-of-00002.safetensors",
257
+ "roberta.encoder.layer.22.LayerNorm.weight": "model-00001-of-00002.safetensors",
258
+ "roberta.encoder.layer.22.attention.output.dense.bias": "model-00001-of-00002.safetensors",
259
+ "roberta.encoder.layer.22.attention.output.dense.weight": "model-00001-of-00002.safetensors",
260
+ "roberta.encoder.layer.22.attention.self.key.bias": "model-00001-of-00002.safetensors",
261
+ "roberta.encoder.layer.22.attention.self.key.weight": "model-00001-of-00002.safetensors",
262
+ "roberta.encoder.layer.22.attention.self.query.bias": "model-00001-of-00002.safetensors",
263
+ "roberta.encoder.layer.22.attention.self.query.weight": "model-00001-of-00002.safetensors",
264
+ "roberta.encoder.layer.22.attention.self.value.bias": "model-00001-of-00002.safetensors",
265
+ "roberta.encoder.layer.22.attention.self.value.weight": "model-00001-of-00002.safetensors",
266
+ "roberta.encoder.layer.22.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
267
+ "roberta.encoder.layer.22.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
268
+ "roberta.encoder.layer.22.intermediate.dense.bias": "model-00001-of-00002.safetensors",
269
+ "roberta.encoder.layer.22.intermediate.dense.weight": "model-00001-of-00002.safetensors",
270
+ "roberta.encoder.layer.22.output.dense.bias": "model-00001-of-00002.safetensors",
271
+ "roberta.encoder.layer.22.output.dense.weight": "model-00001-of-00002.safetensors",
272
+ "roberta.encoder.layer.23.LayerNorm.bias": "model-00002-of-00002.safetensors",
273
+ "roberta.encoder.layer.23.LayerNorm.weight": "model-00002-of-00002.safetensors",
274
+ "roberta.encoder.layer.23.attention.output.dense.bias": "model-00001-of-00002.safetensors",
275
+ "roberta.encoder.layer.23.attention.output.dense.weight": "model-00001-of-00002.safetensors",
276
+ "roberta.encoder.layer.23.attention.self.key.bias": "model-00001-of-00002.safetensors",
277
+ "roberta.encoder.layer.23.attention.self.key.weight": "model-00001-of-00002.safetensors",
278
+ "roberta.encoder.layer.23.attention.self.query.bias": "model-00001-of-00002.safetensors",
279
+ "roberta.encoder.layer.23.attention.self.query.weight": "model-00001-of-00002.safetensors",
280
+ "roberta.encoder.layer.23.attention.self.value.bias": "model-00001-of-00002.safetensors",
281
+ "roberta.encoder.layer.23.attention.self.value.weight": "model-00001-of-00002.safetensors",
282
+ "roberta.encoder.layer.23.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
283
+ "roberta.encoder.layer.23.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
284
+ "roberta.encoder.layer.23.intermediate.dense.bias": "model-00002-of-00002.safetensors",
285
+ "roberta.encoder.layer.23.intermediate.dense.weight": "model-00002-of-00002.safetensors",
286
+ "roberta.encoder.layer.23.output.dense.bias": "model-00002-of-00002.safetensors",
287
+ "roberta.encoder.layer.23.output.dense.weight": "model-00002-of-00002.safetensors",
288
+ "roberta.encoder.layer.24.LayerNorm.bias": "model-00002-of-00002.safetensors",
289
+ "roberta.encoder.layer.24.LayerNorm.weight": "model-00002-of-00002.safetensors",
290
+ "roberta.encoder.layer.24.attention.output.dense.bias": "model-00002-of-00002.safetensors",
291
+ "roberta.encoder.layer.24.attention.output.dense.weight": "model-00002-of-00002.safetensors",
292
+ "roberta.encoder.layer.24.attention.self.key.bias": "model-00002-of-00002.safetensors",
293
+ "roberta.encoder.layer.24.attention.self.key.weight": "model-00002-of-00002.safetensors",
294
+ "roberta.encoder.layer.24.attention.self.query.bias": "model-00002-of-00002.safetensors",
295
+ "roberta.encoder.layer.24.attention.self.query.weight": "model-00002-of-00002.safetensors",
296
+ "roberta.encoder.layer.24.attention.self.value.bias": "model-00002-of-00002.safetensors",
297
+ "roberta.encoder.layer.24.attention.self.value.weight": "model-00002-of-00002.safetensors",
298
+ "roberta.encoder.layer.24.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
299
+ "roberta.encoder.layer.24.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
300
+ "roberta.encoder.layer.24.intermediate.dense.bias": "model-00002-of-00002.safetensors",
301
+ "roberta.encoder.layer.24.intermediate.dense.weight": "model-00002-of-00002.safetensors",
302
+ "roberta.encoder.layer.24.output.dense.bias": "model-00002-of-00002.safetensors",
303
+ "roberta.encoder.layer.24.output.dense.weight": "model-00002-of-00002.safetensors",
304
+ "roberta.encoder.layer.25.LayerNorm.bias": "model-00002-of-00002.safetensors",
305
+ "roberta.encoder.layer.25.LayerNorm.weight": "model-00002-of-00002.safetensors",
306
+ "roberta.encoder.layer.25.attention.output.dense.bias": "model-00002-of-00002.safetensors",
307
+ "roberta.encoder.layer.25.attention.output.dense.weight": "model-00002-of-00002.safetensors",
308
+ "roberta.encoder.layer.25.attention.self.key.bias": "model-00002-of-00002.safetensors",
309
+ "roberta.encoder.layer.25.attention.self.key.weight": "model-00002-of-00002.safetensors",
310
+ "roberta.encoder.layer.25.attention.self.query.bias": "model-00002-of-00002.safetensors",
311
+ "roberta.encoder.layer.25.attention.self.query.weight": "model-00002-of-00002.safetensors",
312
+ "roberta.encoder.layer.25.attention.self.value.bias": "model-00002-of-00002.safetensors",
313
+ "roberta.encoder.layer.25.attention.self.value.weight": "model-00002-of-00002.safetensors",
314
+ "roberta.encoder.layer.25.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
315
+ "roberta.encoder.layer.25.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
316
+ "roberta.encoder.layer.25.intermediate.dense.bias": "model-00002-of-00002.safetensors",
317
+ "roberta.encoder.layer.25.intermediate.dense.weight": "model-00002-of-00002.safetensors",
318
+ "roberta.encoder.layer.25.output.dense.bias": "model-00002-of-00002.safetensors",
319
+ "roberta.encoder.layer.25.output.dense.weight": "model-00002-of-00002.safetensors",
320
+ "roberta.encoder.layer.26.LayerNorm.bias": "model-00002-of-00002.safetensors",
321
+ "roberta.encoder.layer.26.LayerNorm.weight": "model-00002-of-00002.safetensors",
322
+ "roberta.encoder.layer.26.attention.output.dense.bias": "model-00002-of-00002.safetensors",
323
+ "roberta.encoder.layer.26.attention.output.dense.weight": "model-00002-of-00002.safetensors",
324
+ "roberta.encoder.layer.26.attention.self.key.bias": "model-00002-of-00002.safetensors",
325
+ "roberta.encoder.layer.26.attention.self.key.weight": "model-00002-of-00002.safetensors",
326
+ "roberta.encoder.layer.26.attention.self.query.bias": "model-00002-of-00002.safetensors",
327
+ "roberta.encoder.layer.26.attention.self.query.weight": "model-00002-of-00002.safetensors",
328
+ "roberta.encoder.layer.26.attention.self.value.bias": "model-00002-of-00002.safetensors",
329
+ "roberta.encoder.layer.26.attention.self.value.weight": "model-00002-of-00002.safetensors",
330
+ "roberta.encoder.layer.26.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
331
+ "roberta.encoder.layer.26.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
332
+ "roberta.encoder.layer.26.intermediate.dense.bias": "model-00002-of-00002.safetensors",
333
+ "roberta.encoder.layer.26.intermediate.dense.weight": "model-00002-of-00002.safetensors",
334
+ "roberta.encoder.layer.26.output.dense.bias": "model-00002-of-00002.safetensors",
335
+ "roberta.encoder.layer.26.output.dense.weight": "model-00002-of-00002.safetensors",
336
+ "roberta.encoder.layer.27.LayerNorm.bias": "model-00002-of-00002.safetensors",
337
+ "roberta.encoder.layer.27.LayerNorm.weight": "model-00002-of-00002.safetensors",
338
+ "roberta.encoder.layer.27.attention.output.dense.bias": "model-00002-of-00002.safetensors",
339
+ "roberta.encoder.layer.27.attention.output.dense.weight": "model-00002-of-00002.safetensors",
340
+ "roberta.encoder.layer.27.attention.self.key.bias": "model-00002-of-00002.safetensors",
341
+ "roberta.encoder.layer.27.attention.self.key.weight": "model-00002-of-00002.safetensors",
342
+ "roberta.encoder.layer.27.attention.self.query.bias": "model-00002-of-00002.safetensors",
343
+ "roberta.encoder.layer.27.attention.self.query.weight": "model-00002-of-00002.safetensors",
344
+ "roberta.encoder.layer.27.attention.self.value.bias": "model-00002-of-00002.safetensors",
345
+ "roberta.encoder.layer.27.attention.self.value.weight": "model-00002-of-00002.safetensors",
346
+ "roberta.encoder.layer.27.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
347
+ "roberta.encoder.layer.27.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
348
+ "roberta.encoder.layer.27.intermediate.dense.bias": "model-00002-of-00002.safetensors",
349
+ "roberta.encoder.layer.27.intermediate.dense.weight": "model-00002-of-00002.safetensors",
350
+ "roberta.encoder.layer.27.output.dense.bias": "model-00002-of-00002.safetensors",
351
+ "roberta.encoder.layer.27.output.dense.weight": "model-00002-of-00002.safetensors",
352
+ "roberta.encoder.layer.28.LayerNorm.bias": "model-00002-of-00002.safetensors",
353
+ "roberta.encoder.layer.28.LayerNorm.weight": "model-00002-of-00002.safetensors",
354
+ "roberta.encoder.layer.28.attention.output.dense.bias": "model-00002-of-00002.safetensors",
355
+ "roberta.encoder.layer.28.attention.output.dense.weight": "model-00002-of-00002.safetensors",
356
+ "roberta.encoder.layer.28.attention.self.key.bias": "model-00002-of-00002.safetensors",
357
+ "roberta.encoder.layer.28.attention.self.key.weight": "model-00002-of-00002.safetensors",
358
+ "roberta.encoder.layer.28.attention.self.query.bias": "model-00002-of-00002.safetensors",
359
+ "roberta.encoder.layer.28.attention.self.query.weight": "model-00002-of-00002.safetensors",
360
+ "roberta.encoder.layer.28.attention.self.value.bias": "model-00002-of-00002.safetensors",
361
+ "roberta.encoder.layer.28.attention.self.value.weight": "model-00002-of-00002.safetensors",
362
+ "roberta.encoder.layer.28.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
363
+ "roberta.encoder.layer.28.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
364
+ "roberta.encoder.layer.28.intermediate.dense.bias": "model-00002-of-00002.safetensors",
365
+ "roberta.encoder.layer.28.intermediate.dense.weight": "model-00002-of-00002.safetensors",
366
+ "roberta.encoder.layer.28.output.dense.bias": "model-00002-of-00002.safetensors",
367
+ "roberta.encoder.layer.28.output.dense.weight": "model-00002-of-00002.safetensors",
368
+ "roberta.encoder.layer.29.LayerNorm.bias": "model-00002-of-00002.safetensors",
369
+ "roberta.encoder.layer.29.LayerNorm.weight": "model-00002-of-00002.safetensors",
370
+ "roberta.encoder.layer.29.attention.output.dense.bias": "model-00002-of-00002.safetensors",
371
+ "roberta.encoder.layer.29.attention.output.dense.weight": "model-00002-of-00002.safetensors",
372
+ "roberta.encoder.layer.29.attention.self.key.bias": "model-00002-of-00002.safetensors",
373
+ "roberta.encoder.layer.29.attention.self.key.weight": "model-00002-of-00002.safetensors",
374
+ "roberta.encoder.layer.29.attention.self.query.bias": "model-00002-of-00002.safetensors",
375
+ "roberta.encoder.layer.29.attention.self.query.weight": "model-00002-of-00002.safetensors",
376
+ "roberta.encoder.layer.29.attention.self.value.bias": "model-00002-of-00002.safetensors",
377
+ "roberta.encoder.layer.29.attention.self.value.weight": "model-00002-of-00002.safetensors",
378
+ "roberta.encoder.layer.29.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
379
+ "roberta.encoder.layer.29.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
380
+ "roberta.encoder.layer.29.intermediate.dense.bias": "model-00002-of-00002.safetensors",
381
+ "roberta.encoder.layer.29.intermediate.dense.weight": "model-00002-of-00002.safetensors",
382
+ "roberta.encoder.layer.29.output.dense.bias": "model-00002-of-00002.safetensors",
383
+ "roberta.encoder.layer.29.output.dense.weight": "model-00002-of-00002.safetensors",
384
+ "roberta.encoder.layer.3.LayerNorm.bias": "model-00001-of-00002.safetensors",
385
+ "roberta.encoder.layer.3.LayerNorm.weight": "model-00001-of-00002.safetensors",
386
+ "roberta.encoder.layer.3.attention.output.dense.bias": "model-00001-of-00002.safetensors",
387
+ "roberta.encoder.layer.3.attention.output.dense.weight": "model-00001-of-00002.safetensors",
388
+ "roberta.encoder.layer.3.attention.self.key.bias": "model-00001-of-00002.safetensors",
389
+ "roberta.encoder.layer.3.attention.self.key.weight": "model-00001-of-00002.safetensors",
390
+ "roberta.encoder.layer.3.attention.self.query.bias": "model-00001-of-00002.safetensors",
391
+ "roberta.encoder.layer.3.attention.self.query.weight": "model-00001-of-00002.safetensors",
392
+ "roberta.encoder.layer.3.attention.self.value.bias": "model-00001-of-00002.safetensors",
393
+ "roberta.encoder.layer.3.attention.self.value.weight": "model-00001-of-00002.safetensors",
394
+ "roberta.encoder.layer.3.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
395
+ "roberta.encoder.layer.3.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
396
+ "roberta.encoder.layer.3.intermediate.dense.bias": "model-00001-of-00002.safetensors",
397
+ "roberta.encoder.layer.3.intermediate.dense.weight": "model-00001-of-00002.safetensors",
398
+ "roberta.encoder.layer.3.output.dense.bias": "model-00001-of-00002.safetensors",
399
+ "roberta.encoder.layer.3.output.dense.weight": "model-00001-of-00002.safetensors",
400
+ "roberta.encoder.layer.30.LayerNorm.bias": "model-00002-of-00002.safetensors",
401
+ "roberta.encoder.layer.30.LayerNorm.weight": "model-00002-of-00002.safetensors",
402
+ "roberta.encoder.layer.30.attention.output.dense.bias": "model-00002-of-00002.safetensors",
403
+ "roberta.encoder.layer.30.attention.output.dense.weight": "model-00002-of-00002.safetensors",
404
+ "roberta.encoder.layer.30.attention.self.key.bias": "model-00002-of-00002.safetensors",
405
+ "roberta.encoder.layer.30.attention.self.key.weight": "model-00002-of-00002.safetensors",
406
+ "roberta.encoder.layer.30.attention.self.query.bias": "model-00002-of-00002.safetensors",
407
+ "roberta.encoder.layer.30.attention.self.query.weight": "model-00002-of-00002.safetensors",
408
+ "roberta.encoder.layer.30.attention.self.value.bias": "model-00002-of-00002.safetensors",
409
+ "roberta.encoder.layer.30.attention.self.value.weight": "model-00002-of-00002.safetensors",
410
+ "roberta.encoder.layer.30.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
411
+ "roberta.encoder.layer.30.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
412
+ "roberta.encoder.layer.30.intermediate.dense.bias": "model-00002-of-00002.safetensors",
413
+ "roberta.encoder.layer.30.intermediate.dense.weight": "model-00002-of-00002.safetensors",
414
+ "roberta.encoder.layer.30.output.dense.bias": "model-00002-of-00002.safetensors",
415
+ "roberta.encoder.layer.30.output.dense.weight": "model-00002-of-00002.safetensors",
416
+ "roberta.encoder.layer.31.LayerNorm.bias": "model-00002-of-00002.safetensors",
417
+ "roberta.encoder.layer.31.LayerNorm.weight": "model-00002-of-00002.safetensors",
418
+ "roberta.encoder.layer.31.attention.output.dense.bias": "model-00002-of-00002.safetensors",
419
+ "roberta.encoder.layer.31.attention.output.dense.weight": "model-00002-of-00002.safetensors",
420
+ "roberta.encoder.layer.31.attention.self.key.bias": "model-00002-of-00002.safetensors",
421
+ "roberta.encoder.layer.31.attention.self.key.weight": "model-00002-of-00002.safetensors",
422
+ "roberta.encoder.layer.31.attention.self.query.bias": "model-00002-of-00002.safetensors",
423
+ "roberta.encoder.layer.31.attention.self.query.weight": "model-00002-of-00002.safetensors",
424
+ "roberta.encoder.layer.31.attention.self.value.bias": "model-00002-of-00002.safetensors",
425
+ "roberta.encoder.layer.31.attention.self.value.weight": "model-00002-of-00002.safetensors",
426
+ "roberta.encoder.layer.31.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
427
+ "roberta.encoder.layer.31.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
428
+ "roberta.encoder.layer.31.intermediate.dense.bias": "model-00002-of-00002.safetensors",
429
+ "roberta.encoder.layer.31.intermediate.dense.weight": "model-00002-of-00002.safetensors",
430
+ "roberta.encoder.layer.31.output.dense.bias": "model-00002-of-00002.safetensors",
431
+ "roberta.encoder.layer.31.output.dense.weight": "model-00002-of-00002.safetensors",
432
+ "roberta.encoder.layer.32.LayerNorm.bias": "model-00002-of-00002.safetensors",
433
+ "roberta.encoder.layer.32.LayerNorm.weight": "model-00002-of-00002.safetensors",
434
+ "roberta.encoder.layer.32.attention.output.dense.bias": "model-00002-of-00002.safetensors",
435
+ "roberta.encoder.layer.32.attention.output.dense.weight": "model-00002-of-00002.safetensors",
436
+ "roberta.encoder.layer.32.attention.self.key.bias": "model-00002-of-00002.safetensors",
437
+ "roberta.encoder.layer.32.attention.self.key.weight": "model-00002-of-00002.safetensors",
438
+ "roberta.encoder.layer.32.attention.self.query.bias": "model-00002-of-00002.safetensors",
439
+ "roberta.encoder.layer.32.attention.self.query.weight": "model-00002-of-00002.safetensors",
440
+ "roberta.encoder.layer.32.attention.self.value.bias": "model-00002-of-00002.safetensors",
441
+ "roberta.encoder.layer.32.attention.self.value.weight": "model-00002-of-00002.safetensors",
442
+ "roberta.encoder.layer.32.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
443
+ "roberta.encoder.layer.32.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
444
+ "roberta.encoder.layer.32.intermediate.dense.bias": "model-00002-of-00002.safetensors",
445
+ "roberta.encoder.layer.32.intermediate.dense.weight": "model-00002-of-00002.safetensors",
446
+ "roberta.encoder.layer.32.output.dense.bias": "model-00002-of-00002.safetensors",
447
+ "roberta.encoder.layer.32.output.dense.weight": "model-00002-of-00002.safetensors",
448
+ "roberta.encoder.layer.33.LayerNorm.bias": "model-00002-of-00002.safetensors",
449
+ "roberta.encoder.layer.33.LayerNorm.weight": "model-00002-of-00002.safetensors",
450
+ "roberta.encoder.layer.33.attention.output.dense.bias": "model-00002-of-00002.safetensors",
451
+ "roberta.encoder.layer.33.attention.output.dense.weight": "model-00002-of-00002.safetensors",
452
+ "roberta.encoder.layer.33.attention.self.key.bias": "model-00002-of-00002.safetensors",
453
+ "roberta.encoder.layer.33.attention.self.key.weight": "model-00002-of-00002.safetensors",
454
+ "roberta.encoder.layer.33.attention.self.query.bias": "model-00002-of-00002.safetensors",
455
+ "roberta.encoder.layer.33.attention.self.query.weight": "model-00002-of-00002.safetensors",
456
+ "roberta.encoder.layer.33.attention.self.value.bias": "model-00002-of-00002.safetensors",
457
+ "roberta.encoder.layer.33.attention.self.value.weight": "model-00002-of-00002.safetensors",
458
+ "roberta.encoder.layer.33.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
459
+ "roberta.encoder.layer.33.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
460
+ "roberta.encoder.layer.33.intermediate.dense.bias": "model-00002-of-00002.safetensors",
461
+ "roberta.encoder.layer.33.intermediate.dense.weight": "model-00002-of-00002.safetensors",
462
+ "roberta.encoder.layer.33.output.dense.bias": "model-00002-of-00002.safetensors",
463
+ "roberta.encoder.layer.33.output.dense.weight": "model-00002-of-00002.safetensors",
464
+ "roberta.encoder.layer.34.LayerNorm.bias": "model-00002-of-00002.safetensors",
465
+ "roberta.encoder.layer.34.LayerNorm.weight": "model-00002-of-00002.safetensors",
466
+ "roberta.encoder.layer.34.attention.output.dense.bias": "model-00002-of-00002.safetensors",
467
+ "roberta.encoder.layer.34.attention.output.dense.weight": "model-00002-of-00002.safetensors",
468
+ "roberta.encoder.layer.34.attention.self.key.bias": "model-00002-of-00002.safetensors",
469
+ "roberta.encoder.layer.34.attention.self.key.weight": "model-00002-of-00002.safetensors",
470
+ "roberta.encoder.layer.34.attention.self.query.bias": "model-00002-of-00002.safetensors",
471
+ "roberta.encoder.layer.34.attention.self.query.weight": "model-00002-of-00002.safetensors",
472
+ "roberta.encoder.layer.34.attention.self.value.bias": "model-00002-of-00002.safetensors",
473
+ "roberta.encoder.layer.34.attention.self.value.weight": "model-00002-of-00002.safetensors",
474
+ "roberta.encoder.layer.34.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
475
+ "roberta.encoder.layer.34.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
476
+ "roberta.encoder.layer.34.intermediate.dense.bias": "model-00002-of-00002.safetensors",
477
+ "roberta.encoder.layer.34.intermediate.dense.weight": "model-00002-of-00002.safetensors",
478
+ "roberta.encoder.layer.34.output.dense.bias": "model-00002-of-00002.safetensors",
479
+ "roberta.encoder.layer.34.output.dense.weight": "model-00002-of-00002.safetensors",
480
+ "roberta.encoder.layer.35.LayerNorm.bias": "model-00002-of-00002.safetensors",
481
+ "roberta.encoder.layer.35.LayerNorm.weight": "model-00002-of-00002.safetensors",
482
+ "roberta.encoder.layer.35.attention.output.dense.bias": "model-00002-of-00002.safetensors",
483
+ "roberta.encoder.layer.35.attention.output.dense.weight": "model-00002-of-00002.safetensors",
484
+ "roberta.encoder.layer.35.attention.self.key.bias": "model-00002-of-00002.safetensors",
485
+ "roberta.encoder.layer.35.attention.self.key.weight": "model-00002-of-00002.safetensors",
486
+ "roberta.encoder.layer.35.attention.self.query.bias": "model-00002-of-00002.safetensors",
487
+ "roberta.encoder.layer.35.attention.self.query.weight": "model-00002-of-00002.safetensors",
488
+ "roberta.encoder.layer.35.attention.self.value.bias": "model-00002-of-00002.safetensors",
489
+ "roberta.encoder.layer.35.attention.self.value.weight": "model-00002-of-00002.safetensors",
490
+ "roberta.encoder.layer.35.attention.self_attn_layer_norm.bias": "model-00002-of-00002.safetensors",
491
+ "roberta.encoder.layer.35.attention.self_attn_layer_norm.weight": "model-00002-of-00002.safetensors",
492
+ "roberta.encoder.layer.35.intermediate.dense.bias": "model-00002-of-00002.safetensors",
493
+ "roberta.encoder.layer.35.intermediate.dense.weight": "model-00002-of-00002.safetensors",
494
+ "roberta.encoder.layer.35.output.dense.bias": "model-00002-of-00002.safetensors",
495
+ "roberta.encoder.layer.35.output.dense.weight": "model-00002-of-00002.safetensors",
496
+ "roberta.encoder.layer.4.LayerNorm.bias": "model-00001-of-00002.safetensors",
497
+ "roberta.encoder.layer.4.LayerNorm.weight": "model-00001-of-00002.safetensors",
498
+ "roberta.encoder.layer.4.attention.output.dense.bias": "model-00001-of-00002.safetensors",
499
+ "roberta.encoder.layer.4.attention.output.dense.weight": "model-00001-of-00002.safetensors",
500
+ "roberta.encoder.layer.4.attention.self.key.bias": "model-00001-of-00002.safetensors",
501
+ "roberta.encoder.layer.4.attention.self.key.weight": "model-00001-of-00002.safetensors",
502
+ "roberta.encoder.layer.4.attention.self.query.bias": "model-00001-of-00002.safetensors",
503
+ "roberta.encoder.layer.4.attention.self.query.weight": "model-00001-of-00002.safetensors",
504
+ "roberta.encoder.layer.4.attention.self.value.bias": "model-00001-of-00002.safetensors",
505
+ "roberta.encoder.layer.4.attention.self.value.weight": "model-00001-of-00002.safetensors",
506
+ "roberta.encoder.layer.4.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
507
+ "roberta.encoder.layer.4.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
508
+ "roberta.encoder.layer.4.intermediate.dense.bias": "model-00001-of-00002.safetensors",
509
+ "roberta.encoder.layer.4.intermediate.dense.weight": "model-00001-of-00002.safetensors",
510
+ "roberta.encoder.layer.4.output.dense.bias": "model-00001-of-00002.safetensors",
511
+ "roberta.encoder.layer.4.output.dense.weight": "model-00001-of-00002.safetensors",
512
+ "roberta.encoder.layer.5.LayerNorm.bias": "model-00001-of-00002.safetensors",
513
+ "roberta.encoder.layer.5.LayerNorm.weight": "model-00001-of-00002.safetensors",
514
+ "roberta.encoder.layer.5.attention.output.dense.bias": "model-00001-of-00002.safetensors",
515
+ "roberta.encoder.layer.5.attention.output.dense.weight": "model-00001-of-00002.safetensors",
516
+ "roberta.encoder.layer.5.attention.self.key.bias": "model-00001-of-00002.safetensors",
517
+ "roberta.encoder.layer.5.attention.self.key.weight": "model-00001-of-00002.safetensors",
518
+ "roberta.encoder.layer.5.attention.self.query.bias": "model-00001-of-00002.safetensors",
519
+ "roberta.encoder.layer.5.attention.self.query.weight": "model-00001-of-00002.safetensors",
520
+ "roberta.encoder.layer.5.attention.self.value.bias": "model-00001-of-00002.safetensors",
521
+ "roberta.encoder.layer.5.attention.self.value.weight": "model-00001-of-00002.safetensors",
522
+ "roberta.encoder.layer.5.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
523
+ "roberta.encoder.layer.5.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
524
+ "roberta.encoder.layer.5.intermediate.dense.bias": "model-00001-of-00002.safetensors",
525
+ "roberta.encoder.layer.5.intermediate.dense.weight": "model-00001-of-00002.safetensors",
526
+ "roberta.encoder.layer.5.output.dense.bias": "model-00001-of-00002.safetensors",
527
+ "roberta.encoder.layer.5.output.dense.weight": "model-00001-of-00002.safetensors",
528
+ "roberta.encoder.layer.6.LayerNorm.bias": "model-00001-of-00002.safetensors",
529
+ "roberta.encoder.layer.6.LayerNorm.weight": "model-00001-of-00002.safetensors",
530
+ "roberta.encoder.layer.6.attention.output.dense.bias": "model-00001-of-00002.safetensors",
531
+ "roberta.encoder.layer.6.attention.output.dense.weight": "model-00001-of-00002.safetensors",
532
+ "roberta.encoder.layer.6.attention.self.key.bias": "model-00001-of-00002.safetensors",
533
+ "roberta.encoder.layer.6.attention.self.key.weight": "model-00001-of-00002.safetensors",
534
+ "roberta.encoder.layer.6.attention.self.query.bias": "model-00001-of-00002.safetensors",
535
+ "roberta.encoder.layer.6.attention.self.query.weight": "model-00001-of-00002.safetensors",
536
+ "roberta.encoder.layer.6.attention.self.value.bias": "model-00001-of-00002.safetensors",
537
+ "roberta.encoder.layer.6.attention.self.value.weight": "model-00001-of-00002.safetensors",
538
+ "roberta.encoder.layer.6.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
539
+ "roberta.encoder.layer.6.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
540
+ "roberta.encoder.layer.6.intermediate.dense.bias": "model-00001-of-00002.safetensors",
541
+ "roberta.encoder.layer.6.intermediate.dense.weight": "model-00001-of-00002.safetensors",
542
+ "roberta.encoder.layer.6.output.dense.bias": "model-00001-of-00002.safetensors",
543
+ "roberta.encoder.layer.6.output.dense.weight": "model-00001-of-00002.safetensors",
544
+ "roberta.encoder.layer.7.LayerNorm.bias": "model-00001-of-00002.safetensors",
545
+ "roberta.encoder.layer.7.LayerNorm.weight": "model-00001-of-00002.safetensors",
546
+ "roberta.encoder.layer.7.attention.output.dense.bias": "model-00001-of-00002.safetensors",
547
+ "roberta.encoder.layer.7.attention.output.dense.weight": "model-00001-of-00002.safetensors",
548
+ "roberta.encoder.layer.7.attention.self.key.bias": "model-00001-of-00002.safetensors",
549
+ "roberta.encoder.layer.7.attention.self.key.weight": "model-00001-of-00002.safetensors",
550
+ "roberta.encoder.layer.7.attention.self.query.bias": "model-00001-of-00002.safetensors",
551
+ "roberta.encoder.layer.7.attention.self.query.weight": "model-00001-of-00002.safetensors",
552
+ "roberta.encoder.layer.7.attention.self.value.bias": "model-00001-of-00002.safetensors",
553
+ "roberta.encoder.layer.7.attention.self.value.weight": "model-00001-of-00002.safetensors",
554
+ "roberta.encoder.layer.7.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
555
+ "roberta.encoder.layer.7.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
556
+ "roberta.encoder.layer.7.intermediate.dense.bias": "model-00001-of-00002.safetensors",
557
+ "roberta.encoder.layer.7.intermediate.dense.weight": "model-00001-of-00002.safetensors",
558
+ "roberta.encoder.layer.7.output.dense.bias": "model-00001-of-00002.safetensors",
559
+ "roberta.encoder.layer.7.output.dense.weight": "model-00001-of-00002.safetensors",
560
+ "roberta.encoder.layer.8.LayerNorm.bias": "model-00001-of-00002.safetensors",
561
+ "roberta.encoder.layer.8.LayerNorm.weight": "model-00001-of-00002.safetensors",
562
+ "roberta.encoder.layer.8.attention.output.dense.bias": "model-00001-of-00002.safetensors",
563
+ "roberta.encoder.layer.8.attention.output.dense.weight": "model-00001-of-00002.safetensors",
564
+ "roberta.encoder.layer.8.attention.self.key.bias": "model-00001-of-00002.safetensors",
565
+ "roberta.encoder.layer.8.attention.self.key.weight": "model-00001-of-00002.safetensors",
566
+ "roberta.encoder.layer.8.attention.self.query.bias": "model-00001-of-00002.safetensors",
567
+ "roberta.encoder.layer.8.attention.self.query.weight": "model-00001-of-00002.safetensors",
568
+ "roberta.encoder.layer.8.attention.self.value.bias": "model-00001-of-00002.safetensors",
569
+ "roberta.encoder.layer.8.attention.self.value.weight": "model-00001-of-00002.safetensors",
570
+ "roberta.encoder.layer.8.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
571
+ "roberta.encoder.layer.8.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
572
+ "roberta.encoder.layer.8.intermediate.dense.bias": "model-00001-of-00002.safetensors",
573
+ "roberta.encoder.layer.8.intermediate.dense.weight": "model-00001-of-00002.safetensors",
574
+ "roberta.encoder.layer.8.output.dense.bias": "model-00001-of-00002.safetensors",
575
+ "roberta.encoder.layer.8.output.dense.weight": "model-00001-of-00002.safetensors",
576
+ "roberta.encoder.layer.9.LayerNorm.bias": "model-00001-of-00002.safetensors",
577
+ "roberta.encoder.layer.9.LayerNorm.weight": "model-00001-of-00002.safetensors",
578
+ "roberta.encoder.layer.9.attention.output.dense.bias": "model-00001-of-00002.safetensors",
579
+ "roberta.encoder.layer.9.attention.output.dense.weight": "model-00001-of-00002.safetensors",
580
+ "roberta.encoder.layer.9.attention.self.key.bias": "model-00001-of-00002.safetensors",
581
+ "roberta.encoder.layer.9.attention.self.key.weight": "model-00001-of-00002.safetensors",
582
+ "roberta.encoder.layer.9.attention.self.query.bias": "model-00001-of-00002.safetensors",
583
+ "roberta.encoder.layer.9.attention.self.query.weight": "model-00001-of-00002.safetensors",
584
+ "roberta.encoder.layer.9.attention.self.value.bias": "model-00001-of-00002.safetensors",
585
+ "roberta.encoder.layer.9.attention.self.value.weight": "model-00001-of-00002.safetensors",
586
+ "roberta.encoder.layer.9.attention.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
587
+ "roberta.encoder.layer.9.attention.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
588
+ "roberta.encoder.layer.9.intermediate.dense.bias": "model-00001-of-00002.safetensors",
589
+ "roberta.encoder.layer.9.intermediate.dense.weight": "model-00001-of-00002.safetensors",
590
+ "roberta.encoder.layer.9.output.dense.bias": "model-00001-of-00002.safetensors",
591
+ "roberta.encoder.layer.9.output.dense.weight": "model-00001-of-00002.safetensors"
592
+ }
593
+ }
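For reference, this shard index follows the standard Hugging Face sharded-checkpoint layout: "metadata.total_size" records the checkpoint size in bytes, and "weight_map" maps every parameter name to the shard file that stores it; from_pretrained reads this same file to decide which shards to fetch and load. A minimal inspection sketch (assuming the index file has been downloaded to the working directory):

    import json

    # Load the shard index and look up where a given parameter lives.
    with open("model.safetensors.index.json", "r", encoding="utf-8") as fp:
        index = json.load(fp)

    print(index["metadata"]["total_size"])  # total checkpoint size in bytes
    print(index["weight_map"]["roberta.encoder.layer.9.output.dense.weight"])
    # -> "model-00001-of-00002.safetensors"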
modeling_hierarchical_classifier.py ADDED
@@ -0,0 +1,305 @@
+ from abc import ABC
+ from dataclasses import dataclass
+ from typing import List, Optional, Tuple, Union
+
+ import numpy as np
+ import torch
+ from torch.nn.modules.loss import _Loss
+ from transformers import XLMRobertaXLPreTrainedModel, XLMRobertaXLModel, XLMRobertaXLConfig
+ from transformers import AutoModelForSequenceClassification, AutoConfig
+ from transformers.modeling_outputs import ModelOutput
+ from pytorch_metric_learning.losses import NTXentLoss
+
+
+ @dataclass
+ class HierarchicalSequenceEmbedderOutput(ModelOutput):
+     loss: Optional[torch.FloatTensor] = None
+     embeddings: torch.FloatTensor = None
+     layer_embeddings: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+ @dataclass
+ class HierarchicalSequenceClassifierOutput(ModelOutput):
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     embeddings: torch.FloatTensor = None
+     layer_embeddings: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+ class HierarchicalXLMRobertaXLConfig(XLMRobertaXLConfig):
+     model_type = "hierarchical-xlm-roberta-xl"
+
+     def __init__(self, label_smoothing: Optional[float] = None, **kwargs):
+         super().__init__(**kwargs)
+         self.label_smoothing = label_smoothing
+
+
+ class XLMRobertaXLHierarchicalClassificationHead(torch.nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
+         classifier_dropout = (
+             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+         )
+         self.dropout = torch.nn.Dropout(classifier_dropout)
+         self.out_proj = torch.nn.Linear(config.hidden_size, config.num_labels)
+
+     def forward(self, features, **kwargs):
+         x = self.dropout(features)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ def distance_to_probability(distance: torch.Tensor, margin: float) -> torch.Tensor:
+     margin = torch.full(size=distance.size(), fill_value=margin,
+                         dtype=distance.dtype, device=distance.device, requires_grad=False)
+     p = (1.0 + torch.exp(-margin)) / (1.0 + torch.exp(distance - margin))
+     del margin
+     return p
+
+
+ class DistanceBasedLogisticLoss(_Loss):
+     __constants__ = ['margin', 'reduction']
+     margin: float
+
+     def __init__(self, margin: float = 1.0, size_average=None, reduce=None, reduction: str = 'mean'):
+         super(DistanceBasedLogisticLoss, self).__init__(size_average, reduce, reduction)
+         self.margin = margin
+
+     def forward(self, inputs, targets):
+         inputs = inputs.view(-1)
+         targets = targets.to(inputs.dtype).view(-1)
+         p = distance_to_probability(inputs, self.margin)
+         return 1.0 - torch.nn.functional.binary_cross_entropy(input=p, target=targets, reduction=self.reduction)
+
+
+ class LayerGatingNetwork(torch.nn.Module):
+     __constants__ = ['in_features']
+     in_features: int
+     weight: torch.Tensor
+
+     def __init__(self, in_features: int, device=None, dtype=None) -> None:
+         factory_kwargs = {'device': device, 'dtype': dtype}
+         super().__init__()
+         self.in_features = in_features
+         self.weight = torch.nn.Parameter(torch.empty((1, in_features), **factory_kwargs))
+         self.reset_parameters()
+
+     def reset_parameters(self) -> None:
+         initial_layer_weights = np.array(
+             [1.0 / (self.in_features - layer_idx) for layer_idx in range(self.in_features)],
+             dtype=np.float32
+         )
+         initial_layer_weights /= np.sum(initial_layer_weights)
+         initial_layer_weights_pt = torch.tensor(
+             initial_layer_weights.reshape((1, self.in_features)),
+             dtype=self.weight.dtype,
+             device=self.weight.device
+         )
+         del initial_layer_weights
+         self.weight = torch.nn.Parameter(initial_layer_weights_pt)
+         del initial_layer_weights_pt
+
+     def forward(self, input: torch.Tensor) -> torch.Tensor:
+         return torch.nn.functional.linear(input, torch.softmax(self.weight, dim=-1))
+
+     def extra_repr(self) -> str:
+         return 'in_features={}'.format(self.in_features)
+
+
+ class XLMRobertaXLForHierarchicalEmbedding(XLMRobertaXLPreTrainedModel, ABC):
+     config_class = HierarchicalXLMRobertaXLConfig
+
+     def __init__(self, config: HierarchicalXLMRobertaXLConfig):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.temperature = config.temperature
+         self.config = config
+
+         self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False)
+         self.layer_weights = LayerGatingNetwork(in_features=config.num_hidden_layers)
+
+         self.init_weights()
+
+     def init_weights(self):
+         super().init_weights()
+         with torch.no_grad():
+             self.layer_weights.reset_parameters()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         right_input_ids: Optional[torch.LongTensor] = None,
+         right_attention_mask: Optional[torch.LongTensor] = None,
+         token_type_ids: Optional[torch.LongTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         output_attentions: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, HierarchicalSequenceEmbedderOutput]:
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.roberta(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=True,
+             return_dict=False
+         )
+         cls_hidden_states = torch.stack(
+             tensors=outputs[2][-self.config.num_hidden_layers:],
+             dim=1
+         )[:, :, 0, :]
+         cls_emb = self.layer_weights(cls_hidden_states.permute(0, 2, 1))[:, :, 0]
+
+         loss = None
+         if labels is not None:
+             cls_emb_ = cls_emb.view(-1, self.config.hidden_size)
+             emb_norm = torch.linalg.norm(cls_emb_, dim=-1, keepdim=True) + 1e-9
+             if (right_input_ids is not None) or (right_attention_mask is not None):
+                 if right_input_ids is None:
+                     raise ValueError('right_input_ids is not specified!')
+                 if right_attention_mask is None:
+                     raise ValueError('right_attention_mask is not specified!')
+                 right_outputs = self.roberta(
+                     right_input_ids,
+                     attention_mask=right_attention_mask,
+                     output_hidden_states=True,
+                     return_dict=False
+                 )
+                 right_cls_hidden_states = torch.stack(
+                     tensors=right_outputs[2][-self.config.num_hidden_layers:],
+                     dim=1
+                 )[:, :, 0, :]
+                 right_cls_emb = self.layer_weights(right_cls_hidden_states.permute(0, 2, 1))[:, :, 0]
+                 right_cls_emb_ = right_cls_emb.view(-1, self.config.hidden_size)
+                 right_emb_norm = torch.linalg.norm(right_cls_emb_, dim=-1, keepdim=True) + 1e-9
+                 distances = torch.norm(cls_emb_ / emb_norm - right_cls_emb_ / right_emb_norm, 2, dim=-1)
+                 loss_fct = DistanceBasedLogisticLoss(margin=1.0)
+                 loss = loss_fct(distances, labels.view(-1))
+             else:
+                 loss_fct = NTXentLoss(temperature=self.temperature)
+                 loss = loss_fct(cls_emb_ / emb_norm, labels.view(-1))
+
+         if not return_dict:
+             output = (cls_emb, cls_hidden_states) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return HierarchicalSequenceEmbedderOutput(
+             loss=loss,
+             embeddings=cls_emb,
+             layer_embeddings=cls_hidden_states,
+             hidden_states=outputs[2],
+             attentions=outputs[3] if output_attentions else None,
+         )
+
+     @property
+     def layer_importances(self) -> List[Tuple[int, float]]:
+         with torch.no_grad():
+             importances = torch.softmax(self.layer_weights.weight, dim=-1).detach().cpu().numpy().flatten()
+         indices_and_importances = []
+         for layer_idx in range(importances.shape[0]):
+             indices_and_importances.append((layer_idx + 1, float(importances[layer_idx])))
+         indices_and_importances.sort(key=lambda it: (-it[1], it[0]))
+         return indices_and_importances
+
+
+ class XLMRobertaXLForHierarchicalSequenceClassification(XLMRobertaXLForHierarchicalEmbedding, ABC):
+     def __init__(self, config: HierarchicalXLMRobertaXLConfig):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.label_smoothing = config.label_smoothing
+         self.config = config
+
+         self.classifier = XLMRobertaXLHierarchicalClassificationHead(config)
+
+         self.init_weights()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         right_input_ids: Optional[torch.LongTensor] = None,
+         right_attention_mask: Optional[torch.LongTensor] = None,
+         token_type_ids: Optional[torch.LongTensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         output_attentions: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, HierarchicalSequenceClassifierOutput]:
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = super().forward(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             return_dict=return_dict,
+         )
+         sequence_output = outputs[0]
+         logits = self.classifier(sequence_output)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = torch.nn.MSELoss()
+                 if self.num_labels == 1:
+                     loss = loss_fct(logits.squeeze(), labels.squeeze())
+                 else:
+                     loss = loss_fct(logits, labels)
+             elif self.config.problem_type == "single_label_classification":
+                 if self.label_smoothing is None:
+                     loss_fct = torch.nn.CrossEntropyLoss()
+                 else:
+                     loss_fct = torch.nn.CrossEntropyLoss(label_smoothing=self.label_smoothing)
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = torch.nn.BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             output = (logits,) + outputs
+             return ((loss,) + output) if loss is not None else output
+
+         return HierarchicalSequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             embeddings=outputs.embeddings,
+             layer_embeddings=outputs.layer_embeddings,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions
+         )
+
+
+ AutoConfig.register("hierarchical-xlm-roberta-xl", HierarchicalXLMRobertaXLConfig)
+ AutoModelForSequenceClassification.register(
+     HierarchicalXLMRobertaXLConfig,
+     XLMRobertaXLForHierarchicalSequenceClassification
+ )
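Taken together, the module above defines the custom config (adding label_smoothing), a LayerGatingNetwork that forms the sentence embedding as a softmax-weighted combination of the [CLS] hidden state from every encoder layer, an embedding model trained with either a distance-based logistic loss on sentence pairs or NT-Xent on single sentences, and a classification head on top; the final register() calls wire the classes into the transformers Auto* machinery. A minimal usage sketch (the repo id below is a placeholder for the actual model path; trust_remote_code=True is required so that this module is fetched and the registrations run):

    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    model_path = "bond005/xlm-roberta-xl-hallucination-detector"  # placeholder repo id
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, trust_remote_code=True)
    model.eval()

    batch = tokenizer("Some generated sentence to check.", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**batch)
    predicted_class_id = int(outputs.logits.argmax(dim=-1))
    print(model.config.id2label[predicted_class_id])

    # The learned softmax gate over the encoder layers can be inspected directly
    # (1-based layer indices, sorted by descending weight):
    print(model.layer_importances[:5])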
pytorch_model-00001-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:468d5860d463d60e770c77dc4c93593da6dd815a6a5a4e547935ebc367d7b526
+ size 4987088064
pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d195a985039c4940ec45052ba1320a200af3179a964668ae14301a225312b1f
+ size 4930484493
pytorch_model-00003-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6cda0ce53bfc8397e4d3872733b30450fc62734860f2b668606f40a49f2ad9e2
+ size 4012594338
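Each of these .bin entries is a Git LFS pointer stub rather than the tensor data itself: the oid line gives the sha256 digest of the real shard and size gives its length in bytes. A small parsing sketch (assuming the pointer stub, not the resolved binary, is what sits on disk):

    # Parse a Git LFS pointer stub into its key/value fields.
    def read_lfs_pointer(path: str) -> dict:
        fields = {}
        with open(path, "r", encoding="utf-8") as fp:
            for line in fp:
                key, _, value = line.strip().partition(" ")
                fields[key] = value
        return fields

    pointer = read_lfs_pointer("pytorch_model-00001-of-00003.bin")
    print(pointer["oid"])   # sha256:<digest> of the actual shard
    print(pointer["size"])  # payload size in bytes (as a string)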
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,593 @@
+ {
+ "metadata": {
+ "total_size": 13929963672
+ },
+ "weight_map": {
+ "classifier.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "classifier.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "classifier.out_proj.bias": "pytorch_model-00003-of-00003.bin",
+ "classifier.out_proj.weight": "pytorch_model-00003-of-00003.bin",
+ "layer_weights.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.embeddings.position_embeddings.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.embeddings.token_type_embeddings.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.embeddings.word_embeddings.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.0.LayerNorm.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.LayerNorm.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.output.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.output.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.self.key.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.self.key.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.self.query.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.self.query.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.self.value.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.self.value.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.self_attn_layer_norm.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.attention.self_attn_layer_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.intermediate.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.intermediate.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.output.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.0.output.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.LayerNorm.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.LayerNorm.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.output.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.output.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.self.key.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.self.key.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.self.query.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.self.query.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.self.value.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.self.value.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.self_attn_layer_norm.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.attention.self_attn_layer_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.intermediate.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.intermediate.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.output.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.1.output.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.10.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.10.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.11.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.12.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.13.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.14.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.15.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.16.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.17.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.18.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.19.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.2.LayerNorm.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.LayerNorm.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.output.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.output.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.self.key.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.self.key.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.self.query.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.self.query.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.self.value.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.self.value.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.self_attn_layer_norm.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.attention.self_attn_layer_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.intermediate.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.intermediate.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.output.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.2.output.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.20.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.20.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.21.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.22.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.23.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.23.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
+ "roberta.encoder.layer.23.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.23.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.23.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.23.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.24.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.25.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.26.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.27.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.28.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.29.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.3.LayerNorm.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.LayerNorm.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.output.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.output.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.self.key.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.self.key.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.self.query.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.self.query.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.self.value.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.self.value.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.self_attn_layer_norm.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.attention.self_attn_layer_norm.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.intermediate.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.intermediate.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.output.dense.bias": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.3.output.dense.weight": "pytorch_model-00001-of-00003.bin",
+ "roberta.encoder.layer.30.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.30.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.31.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.32.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.33.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.33.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.33.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.33.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.33.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.33.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
+ "roberta.encoder.layer.33.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
455
+ "roberta.encoder.layer.33.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
456
+ "roberta.encoder.layer.33.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
457
+ "roberta.encoder.layer.33.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
458
+ "roberta.encoder.layer.33.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
459
+ "roberta.encoder.layer.33.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
460
+ "roberta.encoder.layer.33.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
461
+ "roberta.encoder.layer.33.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
462
+ "roberta.encoder.layer.33.output.dense.bias": "pytorch_model-00003-of-00003.bin",
463
+ "roberta.encoder.layer.33.output.dense.weight": "pytorch_model-00003-of-00003.bin",
464
+ "roberta.encoder.layer.34.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
465
+ "roberta.encoder.layer.34.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
466
+ "roberta.encoder.layer.34.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
467
+ "roberta.encoder.layer.34.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
468
+ "roberta.encoder.layer.34.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
469
+ "roberta.encoder.layer.34.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
470
+ "roberta.encoder.layer.34.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
471
+ "roberta.encoder.layer.34.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
472
+ "roberta.encoder.layer.34.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
473
+ "roberta.encoder.layer.34.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
474
+ "roberta.encoder.layer.34.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
475
+ "roberta.encoder.layer.34.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
476
+ "roberta.encoder.layer.34.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
477
+ "roberta.encoder.layer.34.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
478
+ "roberta.encoder.layer.34.output.dense.bias": "pytorch_model-00003-of-00003.bin",
479
+ "roberta.encoder.layer.34.output.dense.weight": "pytorch_model-00003-of-00003.bin",
480
+ "roberta.encoder.layer.35.LayerNorm.bias": "pytorch_model-00003-of-00003.bin",
481
+ "roberta.encoder.layer.35.LayerNorm.weight": "pytorch_model-00003-of-00003.bin",
482
+ "roberta.encoder.layer.35.attention.output.dense.bias": "pytorch_model-00003-of-00003.bin",
483
+ "roberta.encoder.layer.35.attention.output.dense.weight": "pytorch_model-00003-of-00003.bin",
484
+ "roberta.encoder.layer.35.attention.self.key.bias": "pytorch_model-00003-of-00003.bin",
485
+ "roberta.encoder.layer.35.attention.self.key.weight": "pytorch_model-00003-of-00003.bin",
486
+ "roberta.encoder.layer.35.attention.self.query.bias": "pytorch_model-00003-of-00003.bin",
487
+ "roberta.encoder.layer.35.attention.self.query.weight": "pytorch_model-00003-of-00003.bin",
488
+ "roberta.encoder.layer.35.attention.self.value.bias": "pytorch_model-00003-of-00003.bin",
489
+ "roberta.encoder.layer.35.attention.self.value.weight": "pytorch_model-00003-of-00003.bin",
490
+ "roberta.encoder.layer.35.attention.self_attn_layer_norm.bias": "pytorch_model-00003-of-00003.bin",
491
+ "roberta.encoder.layer.35.attention.self_attn_layer_norm.weight": "pytorch_model-00003-of-00003.bin",
492
+ "roberta.encoder.layer.35.intermediate.dense.bias": "pytorch_model-00003-of-00003.bin",
493
+ "roberta.encoder.layer.35.intermediate.dense.weight": "pytorch_model-00003-of-00003.bin",
494
+ "roberta.encoder.layer.35.output.dense.bias": "pytorch_model-00003-of-00003.bin",
495
+ "roberta.encoder.layer.35.output.dense.weight": "pytorch_model-00003-of-00003.bin",
496
+ "roberta.encoder.layer.4.LayerNorm.bias": "pytorch_model-00001-of-00003.bin",
497
+ "roberta.encoder.layer.4.LayerNorm.weight": "pytorch_model-00001-of-00003.bin",
498
+ "roberta.encoder.layer.4.attention.output.dense.bias": "pytorch_model-00001-of-00003.bin",
499
+ "roberta.encoder.layer.4.attention.output.dense.weight": "pytorch_model-00001-of-00003.bin",
500
+ "roberta.encoder.layer.4.attention.self.key.bias": "pytorch_model-00001-of-00003.bin",
501
+ "roberta.encoder.layer.4.attention.self.key.weight": "pytorch_model-00001-of-00003.bin",
502
+ "roberta.encoder.layer.4.attention.self.query.bias": "pytorch_model-00001-of-00003.bin",
503
+ "roberta.encoder.layer.4.attention.self.query.weight": "pytorch_model-00001-of-00003.bin",
504
+ "roberta.encoder.layer.4.attention.self.value.bias": "pytorch_model-00001-of-00003.bin",
505
+ "roberta.encoder.layer.4.attention.self.value.weight": "pytorch_model-00001-of-00003.bin",
506
+ "roberta.encoder.layer.4.attention.self_attn_layer_norm.bias": "pytorch_model-00001-of-00003.bin",
507
+ "roberta.encoder.layer.4.attention.self_attn_layer_norm.weight": "pytorch_model-00001-of-00003.bin",
508
+ "roberta.encoder.layer.4.intermediate.dense.bias": "pytorch_model-00001-of-00003.bin",
509
+ "roberta.encoder.layer.4.intermediate.dense.weight": "pytorch_model-00001-of-00003.bin",
510
+ "roberta.encoder.layer.4.output.dense.bias": "pytorch_model-00001-of-00003.bin",
511
+ "roberta.encoder.layer.4.output.dense.weight": "pytorch_model-00001-of-00003.bin",
512
+ "roberta.encoder.layer.5.LayerNorm.bias": "pytorch_model-00001-of-00003.bin",
513
+ "roberta.encoder.layer.5.LayerNorm.weight": "pytorch_model-00001-of-00003.bin",
514
+ "roberta.encoder.layer.5.attention.output.dense.bias": "pytorch_model-00001-of-00003.bin",
515
+ "roberta.encoder.layer.5.attention.output.dense.weight": "pytorch_model-00001-of-00003.bin",
516
+ "roberta.encoder.layer.5.attention.self.key.bias": "pytorch_model-00001-of-00003.bin",
517
+ "roberta.encoder.layer.5.attention.self.key.weight": "pytorch_model-00001-of-00003.bin",
518
+ "roberta.encoder.layer.5.attention.self.query.bias": "pytorch_model-00001-of-00003.bin",
519
+ "roberta.encoder.layer.5.attention.self.query.weight": "pytorch_model-00001-of-00003.bin",
520
+ "roberta.encoder.layer.5.attention.self.value.bias": "pytorch_model-00001-of-00003.bin",
521
+ "roberta.encoder.layer.5.attention.self.value.weight": "pytorch_model-00001-of-00003.bin",
522
+ "roberta.encoder.layer.5.attention.self_attn_layer_norm.bias": "pytorch_model-00001-of-00003.bin",
523
+ "roberta.encoder.layer.5.attention.self_attn_layer_norm.weight": "pytorch_model-00001-of-00003.bin",
524
+ "roberta.encoder.layer.5.intermediate.dense.bias": "pytorch_model-00001-of-00003.bin",
525
+ "roberta.encoder.layer.5.intermediate.dense.weight": "pytorch_model-00001-of-00003.bin",
526
+ "roberta.encoder.layer.5.output.dense.bias": "pytorch_model-00001-of-00003.bin",
527
+ "roberta.encoder.layer.5.output.dense.weight": "pytorch_model-00001-of-00003.bin",
528
+ "roberta.encoder.layer.6.LayerNorm.bias": "pytorch_model-00001-of-00003.bin",
529
+ "roberta.encoder.layer.6.LayerNorm.weight": "pytorch_model-00001-of-00003.bin",
530
+ "roberta.encoder.layer.6.attention.output.dense.bias": "pytorch_model-00001-of-00003.bin",
531
+ "roberta.encoder.layer.6.attention.output.dense.weight": "pytorch_model-00001-of-00003.bin",
532
+ "roberta.encoder.layer.6.attention.self.key.bias": "pytorch_model-00001-of-00003.bin",
533
+ "roberta.encoder.layer.6.attention.self.key.weight": "pytorch_model-00001-of-00003.bin",
534
+ "roberta.encoder.layer.6.attention.self.query.bias": "pytorch_model-00001-of-00003.bin",
535
+ "roberta.encoder.layer.6.attention.self.query.weight": "pytorch_model-00001-of-00003.bin",
536
+ "roberta.encoder.layer.6.attention.self.value.bias": "pytorch_model-00001-of-00003.bin",
537
+ "roberta.encoder.layer.6.attention.self.value.weight": "pytorch_model-00001-of-00003.bin",
538
+ "roberta.encoder.layer.6.attention.self_attn_layer_norm.bias": "pytorch_model-00001-of-00003.bin",
539
+ "roberta.encoder.layer.6.attention.self_attn_layer_norm.weight": "pytorch_model-00001-of-00003.bin",
540
+ "roberta.encoder.layer.6.intermediate.dense.bias": "pytorch_model-00001-of-00003.bin",
541
+ "roberta.encoder.layer.6.intermediate.dense.weight": "pytorch_model-00001-of-00003.bin",
542
+ "roberta.encoder.layer.6.output.dense.bias": "pytorch_model-00001-of-00003.bin",
543
+ "roberta.encoder.layer.6.output.dense.weight": "pytorch_model-00001-of-00003.bin",
544
+ "roberta.encoder.layer.7.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
545
+ "roberta.encoder.layer.7.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
546
+ "roberta.encoder.layer.7.attention.output.dense.bias": "pytorch_model-00001-of-00003.bin",
547
+ "roberta.encoder.layer.7.attention.output.dense.weight": "pytorch_model-00001-of-00003.bin",
548
+ "roberta.encoder.layer.7.attention.self.key.bias": "pytorch_model-00001-of-00003.bin",
549
+ "roberta.encoder.layer.7.attention.self.key.weight": "pytorch_model-00001-of-00003.bin",
550
+ "roberta.encoder.layer.7.attention.self.query.bias": "pytorch_model-00001-of-00003.bin",
551
+ "roberta.encoder.layer.7.attention.self.query.weight": "pytorch_model-00001-of-00003.bin",
552
+ "roberta.encoder.layer.7.attention.self.value.bias": "pytorch_model-00001-of-00003.bin",
553
+ "roberta.encoder.layer.7.attention.self.value.weight": "pytorch_model-00001-of-00003.bin",
554
+ "roberta.encoder.layer.7.attention.self_attn_layer_norm.bias": "pytorch_model-00001-of-00003.bin",
555
+ "roberta.encoder.layer.7.attention.self_attn_layer_norm.weight": "pytorch_model-00001-of-00003.bin",
556
+ "roberta.encoder.layer.7.intermediate.dense.bias": "pytorch_model-00001-of-00003.bin",
557
+ "roberta.encoder.layer.7.intermediate.dense.weight": "pytorch_model-00001-of-00003.bin",
558
+ "roberta.encoder.layer.7.output.dense.bias": "pytorch_model-00002-of-00003.bin",
559
+ "roberta.encoder.layer.7.output.dense.weight": "pytorch_model-00002-of-00003.bin",
560
+ "roberta.encoder.layer.8.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
561
+ "roberta.encoder.layer.8.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
562
+ "roberta.encoder.layer.8.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
563
+ "roberta.encoder.layer.8.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
564
+ "roberta.encoder.layer.8.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
565
+ "roberta.encoder.layer.8.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
566
+ "roberta.encoder.layer.8.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
567
+ "roberta.encoder.layer.8.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
568
+ "roberta.encoder.layer.8.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
569
+ "roberta.encoder.layer.8.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
570
+ "roberta.encoder.layer.8.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
571
+ "roberta.encoder.layer.8.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
572
+ "roberta.encoder.layer.8.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
573
+ "roberta.encoder.layer.8.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
574
+ "roberta.encoder.layer.8.output.dense.bias": "pytorch_model-00002-of-00003.bin",
575
+ "roberta.encoder.layer.8.output.dense.weight": "pytorch_model-00002-of-00003.bin",
576
+ "roberta.encoder.layer.9.LayerNorm.bias": "pytorch_model-00002-of-00003.bin",
577
+ "roberta.encoder.layer.9.LayerNorm.weight": "pytorch_model-00002-of-00003.bin",
578
+ "roberta.encoder.layer.9.attention.output.dense.bias": "pytorch_model-00002-of-00003.bin",
579
+ "roberta.encoder.layer.9.attention.output.dense.weight": "pytorch_model-00002-of-00003.bin",
580
+ "roberta.encoder.layer.9.attention.self.key.bias": "pytorch_model-00002-of-00003.bin",
581
+ "roberta.encoder.layer.9.attention.self.key.weight": "pytorch_model-00002-of-00003.bin",
582
+ "roberta.encoder.layer.9.attention.self.query.bias": "pytorch_model-00002-of-00003.bin",
583
+ "roberta.encoder.layer.9.attention.self.query.weight": "pytorch_model-00002-of-00003.bin",
584
+ "roberta.encoder.layer.9.attention.self.value.bias": "pytorch_model-00002-of-00003.bin",
585
+ "roberta.encoder.layer.9.attention.self.value.weight": "pytorch_model-00002-of-00003.bin",
586
+ "roberta.encoder.layer.9.attention.self_attn_layer_norm.bias": "pytorch_model-00002-of-00003.bin",
587
+ "roberta.encoder.layer.9.attention.self_attn_layer_norm.weight": "pytorch_model-00002-of-00003.bin",
588
+ "roberta.encoder.layer.9.intermediate.dense.bias": "pytorch_model-00002-of-00003.bin",
589
+ "roberta.encoder.layer.9.intermediate.dense.weight": "pytorch_model-00002-of-00003.bin",
590
+ "roberta.encoder.layer.9.output.dense.bias": "pytorch_model-00002-of-00003.bin",
591
+ "roberta.encoder.layer.9.output.dense.weight": "pytorch_model-00002-of-00003.bin"
592
+ }
593
+ }
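The two closing braces above end the "weight_map" object and the index itself: every remaining encoder layer (3-9 and 30-35) resolves to one of the three checkpoint shards, with layer 7 split across shards 1 and 2. A minimal sketch of how such an index can be queried (the index file name and working directory are assumptions):

import json

# Minimal sketch: look up which shard stores a given parameter,
# using a weight-map index like the one above. The file name is an
# assumption; use whichever *.index.json this repository actually ships.
with open("pytorch_model.bin.index.json", "r", encoding="utf-8") as f:
    index = json.load(f)

weight_map = index["weight_map"]  # parameter name -> shard file name
shard = weight_map["roberta.encoder.layer.30.output.dense.weight"]
print(shard)  # per the map above: "pytorch_model-00003-of-00003.bin"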
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
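The three lines above are a Git LFS pointer, not the SentencePiece model itself: "oid" records the SHA-256 of the real payload and "size" its length in bytes. A minimal sketch for verifying a downloaded copy against the pointer (local path assumed):

import hashlib

# Minimal sketch: check a downloaded file against its LFS pointer.
# Expected values are taken verbatim from the pointer above.
expected_oid = "cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865"
expected_size = 5069051

with open("sentencepiece.bpe.model", "rb") as f:
    data = f.read()

assert len(data) == expected_size, "size mismatch"
assert hashlib.sha256(data).hexdigest() == expected_oid, "hash mismatch"
print("sentencepiece.bpe.model matches its LFS pointer")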
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "cls_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
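This map follows the usual RoBERTa convention: <s> doubles as BOS and CLS, </s> as EOS and SEP, and only <mask> sets "lstrip": true so that the space before a masked word is absorbed into the mask token. A minimal sketch of how these settings surface once the tokenizer is loaded (the repository id is an assumption):

from transformers import AutoTokenizer

# Minimal sketch; the repository id is an assumption.
tokenizer = AutoTokenizer.from_pretrained(
    "bond005/xlm-roberta-xl-hallucination-detector"
)

print(tokenizer.bos_token, tokenizer.cls_token)  # <s> <s>
print(tokenizer.eos_token, tokenizer.sep_token)  # </s> </s>

# Single sequences are wrapped as <s> ... </s>:
ids = tokenizer("a test sentence")["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(ids)
print(tokens[0], tokens[-1])  # <s> </s>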
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "250001": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": "<mask>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "unk_token": "<unk>"
+ }
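tokenizer_config.json pins the special tokens to fixed ids via "added_tokens_decoder": 0, 1, 2, 3, and 250001 map to <s>, <pad>, </s>, <unk>, and <mask> respectively. The huge "model_max_length" (int(1e30)) is the transformers sentinel meaning no length limit was recorded in the tokenizer config; the effective input limit comes from the model's position embeddings instead. A minimal sketch that checks the id-to-token mapping (the repository id is an assumption):

from transformers import AutoTokenizer

# Minimal sketch; the repository id is an assumption.
tokenizer = AutoTokenizer.from_pretrained(
    "bond005/xlm-roberta-xl-hallucination-detector"
)

# Ids fixed by added_tokens_decoder above:
for token_id in (0, 1, 2, 3, 250001):
    print(token_id, tokenizer.convert_ids_to_tokens(token_id))
# Expected: <s>, <pad>, </s>, <unk>, <mask>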