Elron committed
Commit 27e031e
1 Parent(s): 5ae00f1

Pushing deberta-v3-large-irony to hub

README.md ADDED
@@ -0,0 +1,70 @@
+ ---
+ license: mit
+ tags:
+ - generated_from_trainer
+ metrics:
+ - accuracy
+ model-index:
+ - name: deberta-v3-large-irony-lr8e-6-gas2-ls0.1
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # deberta-v3-large-irony-lr8e-6-gas2-ls0.1
+
+ This model is a fine-tuned version of [microsoft/deberta-v3-large](https://huggingface.co/microsoft/deberta-v3-large) on the tweet_eval irony dataset (see `run_train.sh` below).
+ It achieves the following results on the evaluation set:
+ - Loss: 0.7673
+ - Accuracy: 0.7675
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 8e-06
+ - train_batch_size: 16
+ - eval_batch_size: 16
+ - seed: 42
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 32
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_steps: 50
+ - num_epochs: 10.0
+ - label_smoothing_factor: 0.1
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
+ | 0.6478        | 1.12  | 100  | 0.5890          | 0.7529   |
+ | 0.5013        | 2.25  | 200  | 0.5873          | 0.7707   |
+ | 0.388         | 3.37  | 300  | 0.6993          | 0.7602   |
+ | 0.3169        | 4.49  | 400  | 0.6773          | 0.7874   |
+ | 0.2693        | 5.61  | 500  | 0.7172          | 0.7707   |
+ | 0.2396        | 6.74  | 600  | 0.7397          | 0.7801   |
+ | 0.2284        | 7.86  | 700  | 0.8096          | 0.7550   |
+ | 0.2207        | 8.98  | 800  | 0.7827          | 0.7654   |
+
+
+ ### Framework versions
+
+ - Transformers 4.20.0.dev0
+ - Pytorch 1.9.0
+ - Datasets 2.2.2
+ - Tokenizers 0.11.6
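
Editor's note, not part of the generated card: a minimal inference sketch for the checkpoint this commit pushes. The repo id `Elron/deberta-v3-large-irony` is assumed from the commit message, and the label meanings (0 = non_irony, 1 = irony) are assumed from the tweet_eval irony convention used in `run_train.sh` below; the checkpoint's own `id2label` only maps to the bare integers.

```python
# Minimal inference sketch. Assumptions: the hub repo id (taken from the
# commit message) and the label meanings (tweet_eval irony convention,
# 0 = non_irony, 1 = irony); neither is stated in the generated card.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo_id = "Elron/deberta-v3-large-irony"  # assumed repo id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(repo_id)
model.eval()

text = "What a great day to get a parking ticket."
# max_length=256 matches --max_seq_length in the run scripts below.
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
with torch.no_grad():
    logits = model(**inputs).logits
probs = torch.softmax(logits, dim=-1)[0]
print(f"P(irony) = {probs[1]:.3f}")  # index 1 = irony under the assumed convention
```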
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "[MASK]": 128000
+ }
all_results.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "epoch": 9.99,
+   "eval_accuracy": 0.7675392627716064,
+   "eval_loss": 0.7672834396362305,
+   "eval_runtime": 6.3297,
+   "eval_samples": 955,
+   "eval_samples_per_second": 150.876,
+   "eval_steps_per_second": 9.479,
+   "train_loss": 0.33834649632486063,
+   "train_runtime": 677.9414,
+   "train_samples": 2862,
+   "train_samples_per_second": 42.216,
+   "train_steps_per_second": 1.313
+ }
config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "_name_or_path": "microsoft/deberta-v3-large",
+   "architectures": [
+     "DebertaV2ForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "id2label": {
+     "0": 0,
+     "1": 1
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "label2id": {
+     "0": 0,
+     "1": 1
+   },
+   "layer_norm_eps": 1e-07,
+   "max_position_embeddings": 512,
+   "max_relative_positions": -1,
+   "model_type": "deberta-v2",
+   "norm_rel_ebd": "layer_norm",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 0,
+   "pooler_dropout": 0,
+   "pooler_hidden_act": "gelu",
+   "pooler_hidden_size": 1024,
+   "pos_att_type": [
+     "p2c",
+     "c2p"
+   ],
+   "position_biased_input": false,
+   "position_buckets": 256,
+   "relative_attention": true,
+   "share_att_key": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.20.0.dev0",
+   "type_vocab_size": 0,
+   "vocab_size": 128100
+ }
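
One thing worth flagging in this config: `id2label` and `label2id` map to the bare integers 0 and 1, so downstream tooling will display predictions as opaque "0"/"1" labels. Below is a sketch of supplying readable names at load time; the names themselves are an assumption based on the tweet_eval irony convention, not something this config records.

```python
# Override the numeric id2label/label2id from config.json at load time.
# The readable names are assumed from tweet_eval's irony task; config.json
# itself only maps "0" -> 0 and "1" -> 1.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "Elron/deberta-v3-large-irony",  # assumed repo id, as above
    id2label={0: "non_irony", 1: "irony"},
    label2id={"non_irony": 0, "irony": 1},
)
print(model.config.id2label)  # {0: 'non_irony', 1: 'irony'}
```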
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "eval_accuracy": 0.7874345779418945,
+   "eval_loss": 0.5925345420837402,
+   "eval_runtime": 7.2146,
+   "eval_samples": 955,
+   "eval_samples_per_second": 132.371,
+   "eval_steps_per_second": 8.316
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c4b4cd8e258c97f6ddd4ec624d9ece030473071f026ea6d805c823274f2b7d1
+ size 1740393387
run_test.sh ADDED
@@ -0,0 +1 @@
+ jbsub -queue x86_1h -cores 4+1 -mem 30g -require a100 -o outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1/test.log /dccstor/tslm/envs/anaconda3/envs/tslm-gen/bin/python train_clf.py --model_name_or_path outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1/best_checkpoint --train_file data/tweet_eval/irony/train.csv --validation_file data/tweet_eval/irony/validation.csv --test_file data/tweet_eval/irony/test.csv --do_eval --do_predict --report_to none --per_device_eval_batch_size 16 --max_seq_length 256 --output_dir outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1/best_checkpoint
run_train.sh ADDED
@@ -0,0 +1 @@
+ jbsub -queue x86_6h -cores 4+1 -mem 30g -require a100 -o outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1/train.log /dccstor/tslm/envs/anaconda3/envs/tslm-gen/bin/python train_clf.py --model_name_or_path microsoft/deberta-v3-large --train_file data/tweet_eval/irony/train.csv --validation_file data/tweet_eval/irony/validation.csv --do_train --do_eval --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --max_seq_length 256 --learning_rate 8e-6 --output_dir outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1 --evaluation_strategy steps --save_strategy no --warmup_steps 50 --num_train_epochs 10 --overwrite_output_dir --logging_steps 100 --gradient_accumulation_steps 2 --label_smoothing_factor 0.1 --report_to clearml --metric_for_best_model accuracy --logging_dir outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1/tb \; rm -rf outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1/tb \; rm -rf outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1/checkpoint-* \; . outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1/run_test.sh
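
`jbsub` is an LSF job-submission wrapper, so most of this line is cluster plumbing; the training recipe is carried by the `train_clf.py` flags. Since that script is not part of this commit, the following is only a hedged sketch of how its flags map onto standard `TrainingArguments`, matching the hyperparameters listed in the README above.

```python
# The train_clf.py flags from run_train.sh, restated as TrainingArguments.
# This is a mapping sketch, not the authors' script (train_clf.py is not
# included in this commit). --max_seq_length 256 is a script-level flag
# handled during tokenization, so it has no TrainingArguments counterpart.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="outputs/train/tweet_eval2/irony/deberta-v3-large-irony-lr8e-6-gas2-ls0.1",
    learning_rate=8e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,    # effective train batch size 32
    label_smoothing_factor=0.1,
    warmup_steps=50,
    num_train_epochs=10,
    evaluation_strategy="steps",      # eval_steps defaults to logging_steps
    logging_steps=100,
    save_strategy="no",
    metric_for_best_model="accuracy",
    overwrite_output_dir=True,
    report_to="none",                 # the original run reports to clearml
)
```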
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+ size 2464616
test_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "eval_accuracy": 0.8711734414100647,
+   "eval_loss": 0.37051811814308167,
+   "eval_runtime": 4.8769,
+   "eval_samples_per_second": 160.759,
+   "eval_steps_per_second": 10.047,
+   "test_samples": 784
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "name_or_path": "microsoft/deberta-v3-large",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "sp_model_kwargs": {},
+   "special_tokens_map_file": null,
+   "split_by_punct": false,
+   "tokenizer_class": "DebertaV2Tokenizer",
+   "unk_token": "[UNK]",
+   "vocab_type": "spm"
+ }
trainer_state.json ADDED
@@ -0,0 +1,145 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 9.994413407821229,
+   "global_step": 890,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 1.12,
+       "learning_rate": 7.5238095238095236e-06,
+       "loss": 0.6478,
+       "step": 100
+     },
+     {
+       "epoch": 1.12,
+       "eval_accuracy": 0.7528795599937439,
+       "eval_loss": 0.5890260338783264,
+       "eval_runtime": 6.3154,
+       "eval_samples_per_second": 151.217,
+       "eval_steps_per_second": 9.501,
+       "step": 100
+     },
+     {
+       "epoch": 2.25,
+       "learning_rate": 6.571428571428571e-06,
+       "loss": 0.5013,
+       "step": 200
+     },
+     {
+       "epoch": 2.25,
+       "eval_accuracy": 0.7706806063652039,
+       "eval_loss": 0.5872902870178223,
+       "eval_runtime": 6.2684,
+       "eval_samples_per_second": 152.353,
+       "eval_steps_per_second": 9.572,
+       "step": 200
+     },
+     {
+       "epoch": 3.37,
+       "learning_rate": 5.6190476190476185e-06,
+       "loss": 0.388,
+       "step": 300
+     },
+     {
+       "epoch": 3.37,
+       "eval_accuracy": 0.7602094411849976,
+       "eval_loss": 0.6993213891983032,
+       "eval_runtime": 6.3121,
+       "eval_samples_per_second": 151.298,
+       "eval_steps_per_second": 9.506,
+       "step": 300
+     },
+     {
+       "epoch": 4.49,
+       "learning_rate": 4.666666666666667e-06,
+       "loss": 0.3169,
+       "step": 400
+     },
+     {
+       "epoch": 4.49,
+       "eval_accuracy": 0.7874345779418945,
+       "eval_loss": 0.6773470640182495,
+       "eval_runtime": 6.2982,
+       "eval_samples_per_second": 151.63,
+       "eval_steps_per_second": 9.527,
+       "step": 400
+     },
+     {
+       "epoch": 5.61,
+       "learning_rate": 3.714285714285714e-06,
+       "loss": 0.2693,
+       "step": 500
+     },
+     {
+       "epoch": 5.61,
+       "eval_accuracy": 0.7706806063652039,
+       "eval_loss": 0.717186450958252,
+       "eval_runtime": 6.3196,
+       "eval_samples_per_second": 151.116,
+       "eval_steps_per_second": 9.494,
+       "step": 500
+     },
+     {
+       "epoch": 6.74,
+       "learning_rate": 2.7619047619047616e-06,
+       "loss": 0.2396,
+       "step": 600
+     },
+     {
+       "epoch": 6.74,
+       "eval_accuracy": 0.7801046967506409,
+       "eval_loss": 0.7397065758705139,
+       "eval_runtime": 6.2974,
+       "eval_samples_per_second": 151.649,
+       "eval_steps_per_second": 9.528,
+       "step": 600
+     },
+     {
+       "epoch": 7.86,
+       "learning_rate": 1.8095238095238095e-06,
+       "loss": 0.2284,
+       "step": 700
+     },
+     {
+       "epoch": 7.86,
+       "eval_accuracy": 0.754973828792572,
+       "eval_loss": 0.8096243143081665,
+       "eval_runtime": 6.3109,
+       "eval_samples_per_second": 151.325,
+       "eval_steps_per_second": 9.507,
+       "step": 700
+     },
+     {
+       "epoch": 8.98,
+       "learning_rate": 8.57142857142857e-07,
+       "loss": 0.2207,
+       "step": 800
+     },
+     {
+       "epoch": 8.98,
+       "eval_accuracy": 0.7654450535774231,
+       "eval_loss": 0.7827323079109192,
+       "eval_runtime": 6.3145,
+       "eval_samples_per_second": 151.239,
+       "eval_steps_per_second": 9.502,
+       "step": 800
+     },
+     {
+       "epoch": 9.99,
+       "step": 890,
+       "total_flos": 1.3329504276375552e+16,
+       "train_loss": 0.33834649632486063,
+       "train_runtime": 677.9414,
+       "train_samples_per_second": 42.216,
+       "train_steps_per_second": 1.313
+     }
+   ],
+   "max_steps": 890,
+   "num_train_epochs": 10,
+   "total_flos": 1.3329504276375552e+16,
+   "trial_name": null,
+   "trial_params": null
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a579f34da76ac1f097f1e3de8367214d02e4123e699bb75515a4919f6c59ae39
+ size 3311