yoshitomo-matsubara committed
Commit 4b6e99c
1 Parent(s): 8aa1940

added files

config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "bert-large-uncased",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "finetuning_task": "sst2",
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "transformers_version": "4.6.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
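
This config describes a 24-layer, 1024-hidden, 16-head BERT-large with a sequence-classification head fine-tuned on SST-2. A minimal loading sketch with the transformers library; the Hub repo id below is an assumption, so substitute the actual path of this checkpoint:

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Repo id is an assumption; substitute the actual Hub path of this checkpoint.
repo_id = "yoshitomo-matsubara/bert-large-uncased-sst2"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(repo_id)

# config.json defines no id2label, so labels default to 0/1 (SST-2: negative/positive).
inputs = tokenizer("a gripping, well-acted film", return_tensors="pt")
logits = model(**inputs).logits  # shape (1, 2)
print(logits.argmax(dim=-1).item())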
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7fd03c28c913459e511f50178d7e23698fbb27f8d65e51630fb3ed76ef55972
+ size 1340746825
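
pytorch_model.bin is tracked with Git LFS, so the repo stores only this pointer (spec version, SHA-256 oid, byte size) rather than the ~1.3 GB weights themselves. A small sketch, assuming a locally downloaded copy, for checking the file against the pointer; the LFS oid is the SHA-256 of the full file contents:

import hashlib

# Local path is an assumption; point it at the downloaded weights file.
path = "pytorch_model.bin"
expected_oid = "f7fd03c28c913459e511f50178d7e23698fbb27f8d65e51630fb3ed76ef55972"
expected_size = 1340746825

sha = hashlib.sha256()
size = 0
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        sha.update(chunk)
        size += len(chunk)

assert size == expected_size, f"size mismatch: {size} != {expected_size}"
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("pytorch_model.bin matches the LFS pointer")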
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_lower": true, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-large-uncased"}
training.log ADDED
@@ -0,0 +1,43 @@
+ 2021-05-21 19:58:37,393 INFO __main__ Namespace(adjust_lr=False, config='torchdistill/configs/sample/glue/sst2/ce/bert_large_uncased.yaml', log='log/glue/sst2/ce/bert_large_uncased.txt', private_output='leaderboard/glue/standard/bert_large_uncased/', seed=None, student_only=False, task_name='sst2', test_only=False, world_size=1)
+ 2021-05-21 19:58:37,424 INFO __main__ Distributed environment: NO
+ Num processes: 1
+ Process index: 0
+ Local process index: 0
+ Device: cuda
+ Use FP16 precision: True
+
+ 2021-05-21 19:59:09,385 INFO __main__ Start training
+ 2021-05-21 19:59:09,385 INFO torchdistill.models.util [student model]
+ 2021-05-21 19:59:09,385 INFO torchdistill.models.util Using the original student model
+ 2021-05-21 19:59:09,385 INFO torchdistill.core.training Loss = 1.0 * OrgLoss
+ 2021-05-21 19:59:12,381 INFO torchdistill.misc.log Epoch: [0] [ 0/2105] eta: 0:18:28 lr: 1.9996832937450518e-05 sample/s: 7.699286986458846 loss: 0.6751 (0.6751) time: 0.5265 data: 0.0069 max mem: 5398
+ 2021-05-21 20:03:38,267 INFO torchdistill.misc.log Epoch: [0] [ 500/2105] eta: 0:14:13 lr: 1.8413301662707842e-05 sample/s: 9.462027736575847 loss: 0.1741 (0.3158) time: 0.5113 data: 0.0033 max mem: 9069
+ 2021-05-21 20:08:00,838 INFO torchdistill.misc.log Epoch: [0] [1000/2105] eta: 0:09:43 lr: 1.6829770387965163e-05 sample/s: 8.101788484086374 loss: 0.1057 (0.2503) time: 0.5107 data: 0.0031 max mem: 9077
+ 2021-05-21 20:12:27,083 INFO torchdistill.misc.log Epoch: [0] [1500/2105] eta: 0:05:20 lr: 1.5246239113222487e-05 sample/s: 9.412420509914696 loss: 0.1231 (0.2204) time: 0.5003 data: 0.0031 max mem: 9077
+ 2021-05-21 20:16:50,177 INFO torchdistill.misc.log Epoch: [0] [2000/2105] eta: 0:00:55 lr: 1.3662707838479811e-05 sample/s: 6.35372809484153 loss: 0.1248 (0.2028) time: 0.5179 data: 0.0031 max mem: 9077
+ 2021-05-21 20:17:45,625 INFO torchdistill.misc.log Epoch: [0] Total time: 0:18:33
+ 2021-05-21 20:17:50,688 INFO /usr/local/lib/python3.7/dist-packages/datasets/metric.py Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
+ 2021-05-21 20:17:50,688 INFO __main__ Validation: accuracy = 0.926605504587156
+ 2021-05-21 20:17:50,688 INFO __main__ Updating ckpt
+ 2021-05-21 20:17:55,658 INFO torchdistill.misc.log Epoch: [1] [ 0/2105] eta: 0:15:42 lr: 1.3330166270783848e-05 sample/s: 9.074749835026342 loss: 0.3282 (0.3282) time: 0.4477 data: 0.0069 max mem: 9077
+ 2021-05-21 20:22:19,598 INFO torchdistill.misc.log Epoch: [1] [ 500/2105] eta: 0:14:06 lr: 1.1746634996041172e-05 sample/s: 9.4612220084917 loss: 0.1395 (0.1429) time: 0.5248 data: 0.0031 max mem: 9077
+ 2021-05-21 20:26:44,113 INFO torchdistill.misc.log Epoch: [1] [1000/2105] eta: 0:09:43 lr: 1.0163103721298497e-05 sample/s: 7.072136805747001 loss: 0.0690 (0.1390) time: 0.5324 data: 0.0032 max mem: 9077
+ 2021-05-21 20:31:10,736 INFO torchdistill.misc.log Epoch: [1] [1500/2105] eta: 0:05:20 lr: 8.57957244655582e-06 sample/s: 8.10929156145717 loss: 0.1115 (0.1340) time: 0.5524 data: 0.0031 max mem: 9077
+ 2021-05-21 20:35:36,509 INFO torchdistill.misc.log Epoch: [1] [2000/2105] eta: 0:00:55 lr: 6.996041171813144e-06 sample/s: 7.076671033728169 loss: 0.1589 (0.1332) time: 0.5150 data: 0.0032 max mem: 9077
+ 2021-05-21 20:36:30,123 INFO torchdistill.misc.log Epoch: [1] Total time: 0:18:34
+ 2021-05-21 20:36:35,181 INFO /usr/local/lib/python3.7/dist-packages/datasets/metric.py Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
+ 2021-05-21 20:36:35,182 INFO __main__ Validation: accuracy = 0.9346330275229358
+ 2021-05-21 20:36:35,182 INFO __main__ Updating ckpt
+ 2021-05-21 20:36:40,238 INFO torchdistill.misc.log Epoch: [2] [ 0/2105] eta: 0:15:35 lr: 6.6634996041171816e-06 sample/s: 9.158328911713907 loss: 0.0027 (0.0027) time: 0.4445 data: 0.0077 max mem: 9077
+ 2021-05-21 20:41:05,862 INFO torchdistill.misc.log Epoch: [2] [ 500/2105] eta: 0:14:12 lr: 5.079968329374505e-06 sample/s: 7.149938375884941 loss: 0.0000 (0.1141) time: 0.5369 data: 0.0031 max mem: 9077
+ 2021-05-21 20:45:26,291 INFO torchdistill.misc.log Epoch: [2] [1000/2105] eta: 0:09:41 lr: 3.4964370546318295e-06 sample/s: 6.4006445961659955 loss: 0.0001 (0.1283) time: 0.5432 data: 0.0031 max mem: 9077
+ 2021-05-21 20:49:51,518 INFO torchdistill.misc.log Epoch: [2] [1500/2105] eta: 0:05:19 lr: 1.9129057798891528e-06 sample/s: 8.156752315433794 loss: 0.0000 (0.1320) time: 0.5229 data: 0.0032 max mem: 9077
+ 2021-05-21 20:54:14,175 INFO torchdistill.misc.log Epoch: [2] [2000/2105] eta: 0:00:55 lr: 3.293745051464767e-07 sample/s: 7.075721944619703 loss: 0.0001 (0.1348) time: 0.5369 data: 0.0032 max mem: 9077
+ 2021-05-21 20:55:08,906 INFO torchdistill.misc.log Epoch: [2] Total time: 0:18:29
+ 2021-05-21 20:55:13,964 INFO /usr/local/lib/python3.7/dist-packages/datasets/metric.py Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
+ 2021-05-21 20:55:13,964 INFO __main__ Validation: accuracy = 0.9311926605504587
+ 2021-05-21 20:55:19,432 INFO __main__ [Student: bert-large-uncased]
+ 2021-05-21 20:55:24,507 INFO /usr/local/lib/python3.7/dist-packages/datasets/metric.py Removing /root/.cache/huggingface/metrics/glue/sst2/default_experiment-1-0.arrow
+ 2021-05-21 20:55:24,508 INFO __main__ Test: accuracy = 0.9346330275229358
+ 2021-05-21 20:55:24,508 INFO __main__ Start prediction for private dataset(s)
+ 2021-05-21 20:55:24,509 INFO __main__ sst2/test: 1821 samples
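
The log records three FP16 epochs (2105 steps each) with the learning rate decaying linearly from ~2e-5, and the checkpoint is updated only when validation accuracy improves, so the saved weights come from epoch 1 (0.9346). A quick sketch for pulling those accuracy lines out of the log; the path is an assumption, and the walrus operator needs Python 3.8+:

import re

# Log path is an assumption; point it at the training.log above.
pattern = re.compile(r"Validation: accuracy = ([0-9.]+)")
with open("training.log") as f:
    accs = [float(m.group(1)) for line in f if (m := pattern.search(line))]

print(accs)       # [0.9266..., 0.9346..., 0.9312...] for epochs 0-2
print(max(accs))  # 0.9346...: the epoch-1 ckpt that was kept ("Updating ckpt")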
vocab.txt ADDED
The diff for this file is too large to render. See raw diff