yujiepan committed on
Commit 28b453e
1 Parent(s): 8ec289e

upload model

.gitignore ADDED
@@ -0,0 +1,2 @@
+ task_description.json
+ *.log
README.md ADDED
@@ -0,0 +1,134 @@
+ ---
+ tags:
+ - image-classification
+ - vision
+ - generated_from_trainer
+ datasets:
+ - food101
+ metrics:
+ - accuracy
+ model-index:
+ - name: lr6e-05
+   results:
+   - task:
+       name: Image Classification
+       type: image-classification
+     dataset:
+       name: food101
+       type: food101
+       config: default
+       split: validation
+       args: default
+     metrics:
+     - name: Accuracy
+       type: accuracy
+       value: 0.8971089108910891
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # vit-base-patch16-224-food101
+
+ This model is a fine-tuned version of [eslamxm/vit-base-food101](https://huggingface.co/eslamxm/vit-base-food101) on the food101 dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.3856
+ - Accuracy: 0.8971
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
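+ A minimal usage sketch (editorial addition, not from the auto-generated card). The checkpoint path below is a placeholder for wherever this fine-tuned model is stored locally or on the Hub:
+
+ ```python
+ from PIL import Image
+ from transformers import pipeline
+
+ checkpoint = "./vit-base-patch16-224-food101"  # placeholder path or Hub id
+ classifier = pipeline("image-classification", model=checkpoint)
+
+ # Top-5 food101 labels with scores for a sample image
+ for pred in classifier(Image.open("example.jpg"), top_k=5):
+     print(f"{pred['label']}: {pred['score']:.3f}")
+ ```
+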
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Script
+ ```json
+ "cmd_list": [
+     "python",
+     "run_image_classification.py",
+     "--model_name_or_path",
+     "eslamxm/vit-base-food101",
+     "--dataset_name",
+     "food101",
+     "--output_dir",
+     "<output_dir>",
+     "--overwrite_output_dir",
+     "--remove_unused_columns",
+     "False",
+     "--do_train",
+     "--do_eval",
+     "--optim",
+     "adamw_torch",
+     "--learning_rate",
+     "6e-05",
+     "--num_train_epochs",
+     "3",
+     "--dataloader_num_workers",
+     "10",
+     "--per_device_train_batch_size",
+     "64",
+     "--gradient_accumulation_steps",
+     "2",
+     "--per_device_eval_batch_size",
+     "128",
+     "--logging_strategy",
+     "steps",
+     "--logging_steps",
+     "10",
+     "--evaluation_strategy",
+     "steps",
+     "--eval_steps",
+     "500",
+     "--save_steps",
+     "500",
+     "--evaluation_strategy",
+     "epoch",
+     "--save_strategy",
+     "epoch",
+     "--load_best_model_at_end",
+     "False",
+     "--save_total_limit",
+     "1",
+     "--seed",
+     "42",
+     "--fp16"
+ ]
+ ```
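+
+ As an editorial aside, such a command record can be replayed directly. The sketch below assumes the record is stored as `{"cmd_list": [...]}` in a JSON file named `task_description.json` (the file excluded by the `.gitignore` above); that filename is a guess, not something this commit documents:
+
+ ```python
+ import json
+ import subprocess
+
+ # Load the recorded command list and re-run the exact training command.
+ with open("task_description.json") as f:
+     cmd_list = json.load(f)["cmd_list"]
+
+ subprocess.run(cmd_list, check=True)
+ ```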
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training (see the sketch after this list):
+ - learning_rate: 6e-05
+ - train_batch_size: 64
+ - eval_batch_size: 128
+ - seed: 42
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 128
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 3.0
+ - mixed_precision_training: Native AMP
+
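+ A rough editorial sketch of equivalent `TrainingArguments` (argument names as in Transformers 4.27; values taken from the list above, output path left as a placeholder):
+
+ ```python
+ from transformers import TrainingArguments
+
+ training_args = TrainingArguments(
+     output_dir="<output_dir>",         # placeholder kept from the script record
+     learning_rate=6e-05,
+     per_device_train_batch_size=64,
+     gradient_accumulation_steps=2,     # effective train batch size: 64 * 2 = 128
+     per_device_eval_batch_size=128,
+     num_train_epochs=3.0,
+     lr_scheduler_type="linear",
+     optim="adamw_torch",               # AdamW, betas=(0.9, 0.999), eps=1e-08
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     save_total_limit=1,
+     seed=42,
+     fp16=True,                         # Native AMP mixed precision
+ )
+ ```
+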
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
+ | 0.3687 | 1.0 | 592 | 0.4044 | 0.8889 |
+ | 0.3422 | 2.0 | 1184 | 0.3911 | 0.8953 |
+ | 0.3808 | 3.0 | 1776 | 0.3856 | 0.8971 |
+
+
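+ As a quick consistency check (editorial note; assumes the standard food101 split sizes of 75,750 training and 25,250 validation images): with the effective batch size of 64 × 2 = 128, one epoch is ceil(75,750 / 128) = 592 optimizer steps, matching the 592 / 1184 / 1776 steps reported above.
+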
+ ### Framework versions
+
+ - Transformers 4.27.4
+ - Pytorch 1.13.1
+ - Datasets 2.11.0
+ - Tokenizers 0.13.3
all_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+     "epoch": 3.0,
+     "eval_accuracy": 0.8971089108910891,
+     "eval_loss": 0.38559553027153015,
+     "eval_runtime": 33.4525,
+     "eval_samples_per_second": 754.801,
+     "eval_steps_per_second": 5.919,
+     "train_loss": 0.3829323179549999,
+     "train_runtime": 916.6333,
+     "train_samples_per_second": 247.918,
+     "train_steps_per_second": 1.938
+ }
config.json ADDED
@@ -0,0 +1,231 @@
1
+ {
2
+ "_name_or_path": "/mnt/sh_flex_storage/home/yujiepan/vit-base-food101",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "finetuning_task": "image-classification",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.0,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "apple_pie",
14
+ "1": "baby_back_ribs",
15
+ "10": "bruschetta",
16
+ "100": "waffles",
17
+ "11": "caesar_salad",
18
+ "12": "cannoli",
19
+ "13": "caprese_salad",
20
+ "14": "carrot_cake",
21
+ "15": "ceviche",
22
+ "16": "cheesecake",
23
+ "17": "cheese_plate",
24
+ "18": "chicken_curry",
25
+ "19": "chicken_quesadilla",
26
+ "2": "baklava",
27
+ "20": "chicken_wings",
28
+ "21": "chocolate_cake",
29
+ "22": "chocolate_mousse",
30
+ "23": "churros",
31
+ "24": "clam_chowder",
32
+ "25": "club_sandwich",
33
+ "26": "crab_cakes",
34
+ "27": "creme_brulee",
35
+ "28": "croque_madame",
36
+ "29": "cup_cakes",
37
+ "3": "beef_carpaccio",
38
+ "30": "deviled_eggs",
39
+ "31": "donuts",
40
+ "32": "dumplings",
41
+ "33": "edamame",
42
+ "34": "eggs_benedict",
43
+ "35": "escargots",
44
+ "36": "falafel",
45
+ "37": "filet_mignon",
46
+ "38": "fish_and_chips",
47
+ "39": "foie_gras",
48
+ "4": "beef_tartare",
49
+ "40": "french_fries",
50
+ "41": "french_onion_soup",
51
+ "42": "french_toast",
52
+ "43": "fried_calamari",
53
+ "44": "fried_rice",
54
+ "45": "frozen_yogurt",
55
+ "46": "garlic_bread",
56
+ "47": "gnocchi",
57
+ "48": "greek_salad",
58
+ "49": "grilled_cheese_sandwich",
59
+ "5": "beet_salad",
60
+ "50": "grilled_salmon",
61
+ "51": "guacamole",
62
+ "52": "gyoza",
63
+ "53": "hamburger",
64
+ "54": "hot_and_sour_soup",
65
+ "55": "hot_dog",
66
+ "56": "huevos_rancheros",
67
+ "57": "hummus",
68
+ "58": "ice_cream",
69
+ "59": "lasagna",
70
+ "6": "beignets",
71
+ "60": "lobster_bisque",
72
+ "61": "lobster_roll_sandwich",
73
+ "62": "macaroni_and_cheese",
74
+ "63": "macarons",
75
+ "64": "miso_soup",
76
+ "65": "mussels",
77
+ "66": "nachos",
78
+ "67": "omelette",
79
+ "68": "onion_rings",
80
+ "69": "oysters",
81
+ "7": "bibimbap",
82
+ "70": "pad_thai",
83
+ "71": "paella",
84
+ "72": "pancakes",
85
+ "73": "panna_cotta",
86
+ "74": "peking_duck",
87
+ "75": "pho",
88
+ "76": "pizza",
89
+ "77": "pork_chop",
90
+ "78": "poutine",
91
+ "79": "prime_rib",
92
+ "8": "bread_pudding",
93
+ "80": "pulled_pork_sandwich",
94
+ "81": "ramen",
95
+ "82": "ravioli",
96
+ "83": "red_velvet_cake",
97
+ "84": "risotto",
98
+ "85": "samosa",
99
+ "86": "sashimi",
100
+ "87": "scallops",
101
+ "88": "seaweed_salad",
102
+ "89": "shrimp_and_grits",
103
+ "9": "breakfast_burrito",
104
+ "90": "spaghetti_bolognese",
105
+ "91": "spaghetti_carbonara",
106
+ "92": "spring_rolls",
107
+ "93": "steak",
108
+ "94": "strawberry_shortcake",
109
+ "95": "sushi",
110
+ "96": "tacos",
111
+ "97": "takoyaki",
112
+ "98": "tiramisu",
113
+ "99": "tuna_tartare"
114
+ },
115
+ "image_size": 224,
116
+ "initializer_range": 0.02,
117
+ "intermediate_size": 3072,
118
+ "label2id": {
119
+ "apple_pie": "0",
120
+ "baby_back_ribs": "1",
121
+ "baklava": "2",
122
+ "beef_carpaccio": "3",
123
+ "beef_tartare": "4",
124
+ "beet_salad": "5",
125
+ "beignets": "6",
126
+ "bibimbap": "7",
127
+ "bread_pudding": "8",
128
+ "breakfast_burrito": "9",
129
+ "bruschetta": "10",
130
+ "caesar_salad": "11",
131
+ "cannoli": "12",
132
+ "caprese_salad": "13",
133
+ "carrot_cake": "14",
134
+ "ceviche": "15",
135
+ "cheese_plate": "17",
136
+ "cheesecake": "16",
137
+ "chicken_curry": "18",
138
+ "chicken_quesadilla": "19",
139
+ "chicken_wings": "20",
140
+ "chocolate_cake": "21",
141
+ "chocolate_mousse": "22",
142
+ "churros": "23",
143
+ "clam_chowder": "24",
144
+ "club_sandwich": "25",
145
+ "crab_cakes": "26",
146
+ "creme_brulee": "27",
147
+ "croque_madame": "28",
148
+ "cup_cakes": "29",
149
+ "deviled_eggs": "30",
150
+ "donuts": "31",
151
+ "dumplings": "32",
152
+ "edamame": "33",
153
+ "eggs_benedict": "34",
154
+ "escargots": "35",
155
+ "falafel": "36",
156
+ "filet_mignon": "37",
157
+ "fish_and_chips": "38",
158
+ "foie_gras": "39",
159
+ "french_fries": "40",
160
+ "french_onion_soup": "41",
161
+ "french_toast": "42",
162
+ "fried_calamari": "43",
163
+ "fried_rice": "44",
164
+ "frozen_yogurt": "45",
165
+ "garlic_bread": "46",
166
+ "gnocchi": "47",
167
+ "greek_salad": "48",
168
+ "grilled_cheese_sandwich": "49",
169
+ "grilled_salmon": "50",
170
+ "guacamole": "51",
171
+ "gyoza": "52",
172
+ "hamburger": "53",
173
+ "hot_and_sour_soup": "54",
174
+ "hot_dog": "55",
175
+ "huevos_rancheros": "56",
176
+ "hummus": "57",
177
+ "ice_cream": "58",
178
+ "lasagna": "59",
179
+ "lobster_bisque": "60",
180
+ "lobster_roll_sandwich": "61",
181
+ "macaroni_and_cheese": "62",
182
+ "macarons": "63",
183
+ "miso_soup": "64",
184
+ "mussels": "65",
185
+ "nachos": "66",
186
+ "omelette": "67",
187
+ "onion_rings": "68",
188
+ "oysters": "69",
189
+ "pad_thai": "70",
190
+ "paella": "71",
191
+ "pancakes": "72",
192
+ "panna_cotta": "73",
193
+ "peking_duck": "74",
194
+ "pho": "75",
195
+ "pizza": "76",
196
+ "pork_chop": "77",
197
+ "poutine": "78",
198
+ "prime_rib": "79",
199
+ "pulled_pork_sandwich": "80",
200
+ "ramen": "81",
201
+ "ravioli": "82",
202
+ "red_velvet_cake": "83",
203
+ "risotto": "84",
204
+ "samosa": "85",
205
+ "sashimi": "86",
206
+ "scallops": "87",
207
+ "seaweed_salad": "88",
208
+ "shrimp_and_grits": "89",
209
+ "spaghetti_bolognese": "90",
210
+ "spaghetti_carbonara": "91",
211
+ "spring_rolls": "92",
212
+ "steak": "93",
213
+ "strawberry_shortcake": "94",
214
+ "sushi": "95",
215
+ "tacos": "96",
216
+ "takoyaki": "97",
217
+ "tiramisu": "98",
218
+ "tuna_tartare": "99",
219
+ "waffles": "100"
220
+ },
221
+ "layer_norm_eps": 1e-12,
222
+ "model_type": "vit",
223
+ "num_attention_heads": 12,
224
+ "num_channels": 3,
225
+ "num_hidden_layers": 12,
226
+ "patch_size": 16,
227
+ "problem_type": "single_label_classification",
228
+ "qkv_bias": true,
229
+ "torch_dtype": "float32",
230
+ "transformers_version": "4.27.4"
231
+ }
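
The config above maps the 101 food101 class indices to label names via `id2label` (and back via `label2id`), which is what turns the classifier head's argmax into a food name. A short editorial sketch of that lookup with a manual forward pass; the checkpoint path is a placeholder:

```python
import torch
from PIL import Image
from transformers import ViTForImageClassification, ViTImageProcessor

checkpoint = "./vit-base-patch16-224-food101"  # placeholder path or Hub id
processor = ViTImageProcessor.from_pretrained(checkpoint)
model = ViTForImageClassification.from_pretrained(checkpoint)

inputs = processor(images=Image.open("example.jpg"), return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, 101)
pred_id = logits.argmax(-1).item()
print(model.config.id2label[pred_id])  # e.g. "pizza"
```
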
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 3.0,
+     "eval_accuracy": 0.8971089108910891,
+     "eval_loss": 0.38559553027153015,
+     "eval_runtime": 33.4525,
+     "eval_samples_per_second": 754.801,
+     "eval_steps_per_second": 5.919
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_processor_type": "ViTImageProcessor",
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "resample": 2,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "height": 224,
+     "width": 224
+   }
+ }
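
These settings mean images are resized to 224x224 with bilinear resampling (resample=2), rescaled by 1/255 (0.00392156862745098), and normalized with mean and std 0.5 per channel, so pixel values end up roughly in [-1, 1]. A small editorial sketch of applying the saved processor; the directory path is a placeholder for wherever this file lives:

```python
from PIL import Image
from transformers import ViTImageProcessor

# Load the preprocessing config saved alongside the model (placeholder directory).
processor = ViTImageProcessor.from_pretrained("./vit-base-patch16-224-food101")
pixel_values = processor(images=Image.open("example.jpg"), return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```
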
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86a9f8e9e168929fa1bfb9ccaf6b0a3c543ccfc4345b28b7b895fb90b5e8c2a5
+ size 343573229
train_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "epoch": 3.0,
+     "train_loss": 0.3829323179549999,
+     "train_runtime": 916.6333,
+     "train_samples_per_second": 247.918,
+     "train_steps_per_second": 1.938
+ }
trainer_state.json ADDED
@@ -0,0 +1,1114 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "global_step": 1776,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 5.966216216216216e-05,
13
+ "loss": 0.426,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.03,
18
+ "learning_rate": 5.9324324324324324e-05,
19
+ "loss": 0.484,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.05,
24
+ "learning_rate": 5.8986486486486485e-05,
25
+ "loss": 0.4457,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.07,
30
+ "learning_rate": 5.8648648648648653e-05,
31
+ "loss": 0.4083,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.08,
36
+ "learning_rate": 5.83445945945946e-05,
37
+ "loss": 0.4695,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "learning_rate": 5.800675675675676e-05,
43
+ "loss": 0.5002,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.12,
48
+ "learning_rate": 5.766891891891892e-05,
49
+ "loss": 0.4436,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.14,
54
+ "learning_rate": 5.733108108108108e-05,
55
+ "loss": 0.3807,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.15,
60
+ "learning_rate": 5.699324324324325e-05,
61
+ "loss": 0.4198,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.17,
66
+ "learning_rate": 5.665540540540541e-05,
67
+ "loss": 0.4475,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.19,
72
+ "learning_rate": 5.631756756756757e-05,
73
+ "loss": 0.434,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 0.2,
78
+ "learning_rate": 5.5979729729729734e-05,
79
+ "loss": 0.4036,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 0.22,
84
+ "learning_rate": 5.5641891891891896e-05,
85
+ "loss": 0.4386,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 0.24,
90
+ "learning_rate": 5.530405405405406e-05,
91
+ "loss": 0.399,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 0.25,
96
+ "learning_rate": 5.496621621621622e-05,
97
+ "loss": 0.5032,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 0.27,
102
+ "learning_rate": 5.4628378378378386e-05,
103
+ "loss": 0.4435,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 0.29,
108
+ "learning_rate": 5.429054054054054e-05,
109
+ "loss": 0.4947,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 0.3,
114
+ "learning_rate": 5.39527027027027e-05,
115
+ "loss": 0.4708,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 0.32,
120
+ "learning_rate": 5.361486486486486e-05,
121
+ "loss": 0.4644,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 0.34,
126
+ "learning_rate": 5.3277027027027024e-05,
127
+ "loss": 0.4833,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 0.35,
132
+ "learning_rate": 5.2939189189189186e-05,
133
+ "loss": 0.4115,
134
+ "step": 210
135
+ },
136
+ {
137
+ "epoch": 0.37,
138
+ "learning_rate": 5.260135135135135e-05,
139
+ "loss": 0.5,
140
+ "step": 220
141
+ },
142
+ {
143
+ "epoch": 0.39,
144
+ "learning_rate": 5.2263513513513515e-05,
145
+ "loss": 0.4415,
146
+ "step": 230
147
+ },
148
+ {
149
+ "epoch": 0.41,
150
+ "learning_rate": 5.1925675675675676e-05,
151
+ "loss": 0.4143,
152
+ "step": 240
153
+ },
154
+ {
155
+ "epoch": 0.42,
156
+ "learning_rate": 5.158783783783784e-05,
157
+ "loss": 0.4295,
158
+ "step": 250
159
+ },
160
+ {
161
+ "epoch": 0.44,
162
+ "learning_rate": 5.125e-05,
163
+ "loss": 0.4441,
164
+ "step": 260
165
+ },
166
+ {
167
+ "epoch": 0.46,
168
+ "learning_rate": 5.091216216216216e-05,
169
+ "loss": 0.4273,
170
+ "step": 270
171
+ },
172
+ {
173
+ "epoch": 0.47,
174
+ "learning_rate": 5.057432432432432e-05,
175
+ "loss": 0.4175,
176
+ "step": 280
177
+ },
178
+ {
179
+ "epoch": 0.49,
180
+ "learning_rate": 5.023648648648648e-05,
181
+ "loss": 0.4219,
182
+ "step": 290
183
+ },
184
+ {
185
+ "epoch": 0.51,
186
+ "learning_rate": 4.989864864864865e-05,
187
+ "loss": 0.4526,
188
+ "step": 300
189
+ },
190
+ {
191
+ "epoch": 0.52,
192
+ "learning_rate": 4.956081081081081e-05,
193
+ "loss": 0.4396,
194
+ "step": 310
195
+ },
196
+ {
197
+ "epoch": 0.54,
198
+ "learning_rate": 4.922297297297297e-05,
199
+ "loss": 0.444,
200
+ "step": 320
201
+ },
202
+ {
203
+ "epoch": 0.56,
204
+ "learning_rate": 4.8885135135135135e-05,
205
+ "loss": 0.4236,
206
+ "step": 330
207
+ },
208
+ {
209
+ "epoch": 0.57,
210
+ "learning_rate": 4.8547297297297296e-05,
211
+ "loss": 0.4228,
212
+ "step": 340
213
+ },
214
+ {
215
+ "epoch": 0.59,
216
+ "learning_rate": 4.820945945945946e-05,
217
+ "loss": 0.4055,
218
+ "step": 350
219
+ },
220
+ {
221
+ "epoch": 0.61,
222
+ "learning_rate": 4.787162162162162e-05,
223
+ "loss": 0.3954,
224
+ "step": 360
225
+ },
226
+ {
227
+ "epoch": 0.62,
228
+ "learning_rate": 4.7533783783783786e-05,
229
+ "loss": 0.3883,
230
+ "step": 370
231
+ },
232
+ {
233
+ "epoch": 0.64,
234
+ "learning_rate": 4.719594594594595e-05,
235
+ "loss": 0.4181,
236
+ "step": 380
237
+ },
238
+ {
239
+ "epoch": 0.66,
240
+ "learning_rate": 4.685810810810811e-05,
241
+ "loss": 0.4249,
242
+ "step": 390
243
+ },
244
+ {
245
+ "epoch": 0.68,
246
+ "learning_rate": 4.652027027027027e-05,
247
+ "loss": 0.4619,
248
+ "step": 400
249
+ },
250
+ {
251
+ "epoch": 0.69,
252
+ "learning_rate": 4.618243243243243e-05,
253
+ "loss": 0.4298,
254
+ "step": 410
255
+ },
256
+ {
257
+ "epoch": 0.71,
258
+ "learning_rate": 4.584459459459459e-05,
259
+ "loss": 0.4184,
260
+ "step": 420
261
+ },
262
+ {
263
+ "epoch": 0.73,
264
+ "learning_rate": 4.550675675675676e-05,
265
+ "loss": 0.4062,
266
+ "step": 430
267
+ },
268
+ {
269
+ "epoch": 0.74,
270
+ "learning_rate": 4.516891891891892e-05,
271
+ "loss": 0.3768,
272
+ "step": 440
273
+ },
274
+ {
275
+ "epoch": 0.76,
276
+ "learning_rate": 4.4831081081081083e-05,
277
+ "loss": 0.4605,
278
+ "step": 450
279
+ },
280
+ {
281
+ "epoch": 0.78,
282
+ "learning_rate": 4.4493243243243245e-05,
283
+ "loss": 0.485,
284
+ "step": 460
285
+ },
286
+ {
287
+ "epoch": 0.79,
288
+ "learning_rate": 4.4155405405405406e-05,
289
+ "loss": 0.4169,
290
+ "step": 470
291
+ },
292
+ {
293
+ "epoch": 0.81,
294
+ "learning_rate": 4.381756756756757e-05,
295
+ "loss": 0.4136,
296
+ "step": 480
297
+ },
298
+ {
299
+ "epoch": 0.83,
300
+ "learning_rate": 4.347972972972973e-05,
301
+ "loss": 0.4337,
302
+ "step": 490
303
+ },
304
+ {
305
+ "epoch": 0.84,
306
+ "learning_rate": 4.31418918918919e-05,
307
+ "loss": 0.444,
308
+ "step": 500
309
+ },
310
+ {
311
+ "epoch": 0.86,
312
+ "learning_rate": 4.280405405405406e-05,
313
+ "loss": 0.4044,
314
+ "step": 510
315
+ },
316
+ {
317
+ "epoch": 0.88,
318
+ "learning_rate": 4.246621621621622e-05,
319
+ "loss": 0.3761,
320
+ "step": 520
321
+ },
322
+ {
323
+ "epoch": 0.9,
324
+ "learning_rate": 4.212837837837838e-05,
325
+ "loss": 0.3932,
326
+ "step": 530
327
+ },
328
+ {
329
+ "epoch": 0.91,
330
+ "learning_rate": 4.179054054054054e-05,
331
+ "loss": 0.382,
332
+ "step": 540
333
+ },
334
+ {
335
+ "epoch": 0.93,
336
+ "learning_rate": 4.14527027027027e-05,
337
+ "loss": 0.3858,
338
+ "step": 550
339
+ },
340
+ {
341
+ "epoch": 0.95,
342
+ "learning_rate": 4.1114864864864864e-05,
343
+ "loss": 0.4047,
344
+ "step": 560
345
+ },
346
+ {
347
+ "epoch": 0.96,
348
+ "learning_rate": 4.077702702702703e-05,
349
+ "loss": 0.4456,
350
+ "step": 570
351
+ },
352
+ {
353
+ "epoch": 0.98,
354
+ "learning_rate": 4.0439189189189194e-05,
355
+ "loss": 0.351,
356
+ "step": 580
357
+ },
358
+ {
359
+ "epoch": 1.0,
360
+ "learning_rate": 4.0101351351351355e-05,
361
+ "loss": 0.3687,
362
+ "step": 590
363
+ },
364
+ {
365
+ "epoch": 1.0,
366
+ "eval_accuracy": 0.8888712871287129,
367
+ "eval_loss": 0.4043530225753784,
368
+ "eval_runtime": 35.1515,
369
+ "eval_samples_per_second": 718.32,
370
+ "eval_steps_per_second": 5.633,
371
+ "step": 592
372
+ },
373
+ {
374
+ "epoch": 1.01,
375
+ "learning_rate": 3.9763513513513516e-05,
376
+ "loss": 0.3674,
377
+ "step": 600
378
+ },
379
+ {
380
+ "epoch": 1.03,
381
+ "learning_rate": 3.942567567567568e-05,
382
+ "loss": 0.3655,
383
+ "step": 610
384
+ },
385
+ {
386
+ "epoch": 1.05,
387
+ "learning_rate": 3.908783783783784e-05,
388
+ "loss": 0.4221,
389
+ "step": 620
390
+ },
391
+ {
392
+ "epoch": 1.06,
393
+ "learning_rate": 3.875e-05,
394
+ "loss": 0.3846,
395
+ "step": 630
396
+ },
397
+ {
398
+ "epoch": 1.08,
399
+ "learning_rate": 3.841216216216217e-05,
400
+ "loss": 0.4122,
401
+ "step": 640
402
+ },
403
+ {
404
+ "epoch": 1.1,
405
+ "learning_rate": 3.807432432432433e-05,
406
+ "loss": 0.4057,
407
+ "step": 650
408
+ },
409
+ {
410
+ "epoch": 1.11,
411
+ "learning_rate": 3.773648648648649e-05,
412
+ "loss": 0.4538,
413
+ "step": 660
414
+ },
415
+ {
416
+ "epoch": 1.13,
417
+ "learning_rate": 3.739864864864865e-05,
418
+ "loss": 0.3738,
419
+ "step": 670
420
+ },
421
+ {
422
+ "epoch": 1.15,
423
+ "learning_rate": 3.706081081081081e-05,
424
+ "loss": 0.4286,
425
+ "step": 680
426
+ },
427
+ {
428
+ "epoch": 1.17,
429
+ "learning_rate": 3.6722972972972974e-05,
430
+ "loss": 0.3568,
431
+ "step": 690
432
+ },
433
+ {
434
+ "epoch": 1.18,
435
+ "learning_rate": 3.638513513513514e-05,
436
+ "loss": 0.3664,
437
+ "step": 700
438
+ },
439
+ {
440
+ "epoch": 1.2,
441
+ "learning_rate": 3.6047297297297304e-05,
442
+ "loss": 0.3662,
443
+ "step": 710
444
+ },
445
+ {
446
+ "epoch": 1.22,
447
+ "learning_rate": 3.5709459459459465e-05,
448
+ "loss": 0.4049,
449
+ "step": 720
450
+ },
451
+ {
452
+ "epoch": 1.23,
453
+ "learning_rate": 3.537162162162162e-05,
454
+ "loss": 0.3396,
455
+ "step": 730
456
+ },
457
+ {
458
+ "epoch": 1.25,
459
+ "learning_rate": 3.503378378378378e-05,
460
+ "loss": 0.3986,
461
+ "step": 740
462
+ },
463
+ {
464
+ "epoch": 1.27,
465
+ "learning_rate": 3.469594594594594e-05,
466
+ "loss": 0.4362,
467
+ "step": 750
468
+ },
469
+ {
470
+ "epoch": 1.28,
471
+ "learning_rate": 3.43581081081081e-05,
472
+ "loss": 0.3804,
473
+ "step": 760
474
+ },
475
+ {
476
+ "epoch": 1.3,
477
+ "learning_rate": 3.402027027027027e-05,
478
+ "loss": 0.3586,
479
+ "step": 770
480
+ },
481
+ {
482
+ "epoch": 1.32,
483
+ "learning_rate": 3.368243243243243e-05,
484
+ "loss": 0.3433,
485
+ "step": 780
486
+ },
487
+ {
488
+ "epoch": 1.33,
489
+ "learning_rate": 3.3344594594594594e-05,
490
+ "loss": 0.4387,
491
+ "step": 790
492
+ },
493
+ {
494
+ "epoch": 1.35,
495
+ "learning_rate": 3.3006756756756755e-05,
496
+ "loss": 0.3773,
497
+ "step": 800
498
+ },
499
+ {
500
+ "epoch": 1.37,
501
+ "learning_rate": 3.2668918918918917e-05,
502
+ "loss": 0.3687,
503
+ "step": 810
504
+ },
505
+ {
506
+ "epoch": 1.39,
507
+ "learning_rate": 3.233108108108108e-05,
508
+ "loss": 0.3975,
509
+ "step": 820
510
+ },
511
+ {
512
+ "epoch": 1.4,
513
+ "learning_rate": 3.199324324324324e-05,
514
+ "loss": 0.3489,
515
+ "step": 830
516
+ },
517
+ {
518
+ "epoch": 1.42,
519
+ "learning_rate": 3.165540540540541e-05,
520
+ "loss": 0.3892,
521
+ "step": 840
522
+ },
523
+ {
524
+ "epoch": 1.44,
525
+ "learning_rate": 3.131756756756757e-05,
526
+ "loss": 0.3245,
527
+ "step": 850
528
+ },
529
+ {
530
+ "epoch": 1.45,
531
+ "learning_rate": 3.097972972972973e-05,
532
+ "loss": 0.3773,
533
+ "step": 860
534
+ },
535
+ {
536
+ "epoch": 1.47,
537
+ "learning_rate": 3.064189189189189e-05,
538
+ "loss": 0.377,
539
+ "step": 870
540
+ },
541
+ {
542
+ "epoch": 1.49,
543
+ "learning_rate": 3.0304054054054052e-05,
544
+ "loss": 0.3985,
545
+ "step": 880
546
+ },
547
+ {
548
+ "epoch": 1.5,
549
+ "learning_rate": 2.9966216216216217e-05,
550
+ "loss": 0.3336,
551
+ "step": 890
552
+ },
553
+ {
554
+ "epoch": 1.52,
555
+ "learning_rate": 2.9628378378378378e-05,
556
+ "loss": 0.3807,
557
+ "step": 900
558
+ },
559
+ {
560
+ "epoch": 1.54,
561
+ "learning_rate": 2.929054054054054e-05,
562
+ "loss": 0.3827,
563
+ "step": 910
564
+ },
565
+ {
566
+ "epoch": 1.55,
567
+ "learning_rate": 2.8952702702702704e-05,
568
+ "loss": 0.3531,
569
+ "step": 920
570
+ },
571
+ {
572
+ "epoch": 1.57,
573
+ "learning_rate": 2.8614864864864865e-05,
574
+ "loss": 0.3814,
575
+ "step": 930
576
+ },
577
+ {
578
+ "epoch": 1.59,
579
+ "learning_rate": 2.8277027027027027e-05,
580
+ "loss": 0.3634,
581
+ "step": 940
582
+ },
583
+ {
584
+ "epoch": 1.6,
585
+ "learning_rate": 2.7939189189189188e-05,
586
+ "loss": 0.3309,
587
+ "step": 950
588
+ },
589
+ {
590
+ "epoch": 1.62,
591
+ "learning_rate": 2.7601351351351353e-05,
592
+ "loss": 0.3763,
593
+ "step": 960
594
+ },
595
+ {
596
+ "epoch": 1.64,
597
+ "learning_rate": 2.7263513513513514e-05,
598
+ "loss": 0.3521,
599
+ "step": 970
600
+ },
601
+ {
602
+ "epoch": 1.66,
603
+ "learning_rate": 2.6925675675675675e-05,
604
+ "loss": 0.364,
605
+ "step": 980
606
+ },
607
+ {
608
+ "epoch": 1.67,
609
+ "learning_rate": 2.658783783783784e-05,
610
+ "loss": 0.4004,
611
+ "step": 990
612
+ },
613
+ {
614
+ "epoch": 1.69,
615
+ "learning_rate": 2.625e-05,
616
+ "loss": 0.3656,
617
+ "step": 1000
618
+ },
619
+ {
620
+ "epoch": 1.71,
621
+ "learning_rate": 2.5912162162162162e-05,
622
+ "loss": 0.3789,
623
+ "step": 1010
624
+ },
625
+ {
626
+ "epoch": 1.72,
627
+ "learning_rate": 2.5574324324324324e-05,
628
+ "loss": 0.3441,
629
+ "step": 1020
630
+ },
631
+ {
632
+ "epoch": 1.74,
633
+ "learning_rate": 2.5236486486486488e-05,
634
+ "loss": 0.3743,
635
+ "step": 1030
636
+ },
637
+ {
638
+ "epoch": 1.76,
639
+ "learning_rate": 2.489864864864865e-05,
640
+ "loss": 0.3844,
641
+ "step": 1040
642
+ },
643
+ {
644
+ "epoch": 1.77,
645
+ "learning_rate": 2.456081081081081e-05,
646
+ "loss": 0.4033,
647
+ "step": 1050
648
+ },
649
+ {
650
+ "epoch": 1.79,
651
+ "learning_rate": 2.4222972972972976e-05,
652
+ "loss": 0.3749,
653
+ "step": 1060
654
+ },
655
+ {
656
+ "epoch": 1.81,
657
+ "learning_rate": 2.3885135135135137e-05,
658
+ "loss": 0.3956,
659
+ "step": 1070
660
+ },
661
+ {
662
+ "epoch": 1.82,
663
+ "learning_rate": 2.3547297297297298e-05,
664
+ "loss": 0.3968,
665
+ "step": 1080
666
+ },
667
+ {
668
+ "epoch": 1.84,
669
+ "learning_rate": 2.3209459459459463e-05,
670
+ "loss": 0.3801,
671
+ "step": 1090
672
+ },
673
+ {
674
+ "epoch": 1.86,
675
+ "learning_rate": 2.2871621621621624e-05,
676
+ "loss": 0.321,
677
+ "step": 1100
678
+ },
679
+ {
680
+ "epoch": 1.88,
681
+ "learning_rate": 2.2533783783783785e-05,
682
+ "loss": 0.3457,
683
+ "step": 1110
684
+ },
685
+ {
686
+ "epoch": 1.89,
687
+ "learning_rate": 2.2195945945945947e-05,
688
+ "loss": 0.3789,
689
+ "step": 1120
690
+ },
691
+ {
692
+ "epoch": 1.91,
693
+ "learning_rate": 2.185810810810811e-05,
694
+ "loss": 0.3738,
695
+ "step": 1130
696
+ },
697
+ {
698
+ "epoch": 1.93,
699
+ "learning_rate": 2.152027027027027e-05,
700
+ "loss": 0.3785,
701
+ "step": 1140
702
+ },
703
+ {
704
+ "epoch": 1.94,
705
+ "learning_rate": 2.118243243243243e-05,
706
+ "loss": 0.329,
707
+ "step": 1150
708
+ },
709
+ {
710
+ "epoch": 1.96,
711
+ "learning_rate": 2.0844594594594595e-05,
712
+ "loss": 0.3649,
713
+ "step": 1160
714
+ },
715
+ {
716
+ "epoch": 1.98,
717
+ "learning_rate": 2.0506756756756756e-05,
718
+ "loss": 0.4189,
719
+ "step": 1170
720
+ },
721
+ {
722
+ "epoch": 1.99,
723
+ "learning_rate": 2.0168918918918918e-05,
724
+ "loss": 0.3422,
725
+ "step": 1180
726
+ },
727
+ {
728
+ "epoch": 2.0,
729
+ "eval_accuracy": 0.8953267326732673,
730
+ "eval_loss": 0.3911304175853729,
731
+ "eval_runtime": 34.9199,
732
+ "eval_samples_per_second": 723.084,
733
+ "eval_steps_per_second": 5.67,
734
+ "step": 1184
735
+ },
736
+ {
737
+ "epoch": 2.01,
738
+ "learning_rate": 1.983108108108108e-05,
739
+ "loss": 0.3691,
740
+ "step": 1190
741
+ },
742
+ {
743
+ "epoch": 2.03,
744
+ "learning_rate": 1.9493243243243244e-05,
745
+ "loss": 0.3039,
746
+ "step": 1200
747
+ },
748
+ {
749
+ "epoch": 2.04,
750
+ "learning_rate": 1.9155405405405405e-05,
751
+ "loss": 0.307,
752
+ "step": 1210
753
+ },
754
+ {
755
+ "epoch": 2.06,
756
+ "learning_rate": 1.8817567567567566e-05,
757
+ "loss": 0.3207,
758
+ "step": 1220
759
+ },
760
+ {
761
+ "epoch": 2.08,
762
+ "learning_rate": 1.847972972972973e-05,
763
+ "loss": 0.3473,
764
+ "step": 1230
765
+ },
766
+ {
767
+ "epoch": 2.09,
768
+ "learning_rate": 1.8141891891891892e-05,
769
+ "loss": 0.3155,
770
+ "step": 1240
771
+ },
772
+ {
773
+ "epoch": 2.11,
774
+ "learning_rate": 1.7804054054054053e-05,
775
+ "loss": 0.3217,
776
+ "step": 1250
777
+ },
778
+ {
779
+ "epoch": 2.13,
780
+ "learning_rate": 1.7466216216216218e-05,
781
+ "loss": 0.3473,
782
+ "step": 1260
783
+ },
784
+ {
785
+ "epoch": 2.15,
786
+ "learning_rate": 1.712837837837838e-05,
787
+ "loss": 0.2966,
788
+ "step": 1270
789
+ },
790
+ {
791
+ "epoch": 2.16,
792
+ "learning_rate": 1.679054054054054e-05,
793
+ "loss": 0.3411,
794
+ "step": 1280
795
+ },
796
+ {
797
+ "epoch": 2.18,
798
+ "learning_rate": 1.6452702702702702e-05,
799
+ "loss": 0.3876,
800
+ "step": 1290
801
+ },
802
+ {
803
+ "epoch": 2.2,
804
+ "learning_rate": 1.6114864864864866e-05,
805
+ "loss": 0.2818,
806
+ "step": 1300
807
+ },
808
+ {
809
+ "epoch": 2.21,
810
+ "learning_rate": 1.5777027027027028e-05,
811
+ "loss": 0.351,
812
+ "step": 1310
813
+ },
814
+ {
815
+ "epoch": 2.23,
816
+ "learning_rate": 1.543918918918919e-05,
817
+ "loss": 0.3947,
818
+ "step": 1320
819
+ },
820
+ {
821
+ "epoch": 2.25,
822
+ "learning_rate": 1.5101351351351352e-05,
823
+ "loss": 0.3157,
824
+ "step": 1330
825
+ },
826
+ {
827
+ "epoch": 2.26,
828
+ "learning_rate": 1.4763513513513515e-05,
829
+ "loss": 0.4128,
830
+ "step": 1340
831
+ },
832
+ {
833
+ "epoch": 2.28,
834
+ "learning_rate": 1.4425675675675675e-05,
835
+ "loss": 0.3189,
836
+ "step": 1350
837
+ },
838
+ {
839
+ "epoch": 2.3,
840
+ "learning_rate": 1.4087837837837838e-05,
841
+ "loss": 0.3791,
842
+ "step": 1360
843
+ },
844
+ {
845
+ "epoch": 2.31,
846
+ "learning_rate": 1.375e-05,
847
+ "loss": 0.3299,
848
+ "step": 1370
849
+ },
850
+ {
851
+ "epoch": 2.33,
852
+ "learning_rate": 1.3412162162162162e-05,
853
+ "loss": 0.3351,
854
+ "step": 1380
855
+ },
856
+ {
857
+ "epoch": 2.35,
858
+ "learning_rate": 1.3074324324324325e-05,
859
+ "loss": 0.3197,
860
+ "step": 1390
861
+ },
862
+ {
863
+ "epoch": 2.36,
864
+ "learning_rate": 1.2736486486486486e-05,
865
+ "loss": 0.3191,
866
+ "step": 1400
867
+ },
868
+ {
869
+ "epoch": 2.38,
870
+ "learning_rate": 1.2398648648648649e-05,
871
+ "loss": 0.3765,
872
+ "step": 1410
873
+ },
874
+ {
875
+ "epoch": 2.4,
876
+ "learning_rate": 1.2060810810810812e-05,
877
+ "loss": 0.3019,
878
+ "step": 1420
879
+ },
880
+ {
881
+ "epoch": 2.42,
882
+ "learning_rate": 1.1722972972972973e-05,
883
+ "loss": 0.3662,
884
+ "step": 1430
885
+ },
886
+ {
887
+ "epoch": 2.43,
888
+ "learning_rate": 1.1385135135135136e-05,
889
+ "loss": 0.3232,
890
+ "step": 1440
891
+ },
892
+ {
893
+ "epoch": 2.45,
894
+ "learning_rate": 1.1047297297297297e-05,
895
+ "loss": 0.3149,
896
+ "step": 1450
897
+ },
898
+ {
899
+ "epoch": 2.47,
900
+ "learning_rate": 1.070945945945946e-05,
901
+ "loss": 0.3516,
902
+ "step": 1460
903
+ },
904
+ {
905
+ "epoch": 2.48,
906
+ "learning_rate": 1.0371621621621622e-05,
907
+ "loss": 0.3339,
908
+ "step": 1470
909
+ },
910
+ {
911
+ "epoch": 2.5,
912
+ "learning_rate": 1.0033783783783785e-05,
913
+ "loss": 0.3046,
914
+ "step": 1480
915
+ },
916
+ {
917
+ "epoch": 2.52,
918
+ "learning_rate": 9.695945945945946e-06,
919
+ "loss": 0.3794,
920
+ "step": 1490
921
+ },
922
+ {
923
+ "epoch": 2.53,
924
+ "learning_rate": 9.358108108108107e-06,
925
+ "loss": 0.4017,
926
+ "step": 1500
927
+ },
928
+ {
929
+ "epoch": 2.55,
930
+ "learning_rate": 9.02027027027027e-06,
931
+ "loss": 0.3851,
932
+ "step": 1510
933
+ },
934
+ {
935
+ "epoch": 2.57,
936
+ "learning_rate": 8.682432432432431e-06,
937
+ "loss": 0.3449,
938
+ "step": 1520
939
+ },
940
+ {
941
+ "epoch": 2.58,
942
+ "learning_rate": 8.344594594594594e-06,
943
+ "loss": 0.3558,
944
+ "step": 1530
945
+ },
946
+ {
947
+ "epoch": 2.6,
948
+ "learning_rate": 8.006756756756757e-06,
949
+ "loss": 0.3176,
950
+ "step": 1540
951
+ },
952
+ {
953
+ "epoch": 2.62,
954
+ "learning_rate": 7.668918918918919e-06,
955
+ "loss": 0.3305,
956
+ "step": 1550
957
+ },
958
+ {
959
+ "epoch": 2.64,
960
+ "learning_rate": 7.331081081081082e-06,
961
+ "loss": 0.3171,
962
+ "step": 1560
963
+ },
964
+ {
965
+ "epoch": 2.65,
966
+ "learning_rate": 6.993243243243244e-06,
967
+ "loss": 0.332,
968
+ "step": 1570
969
+ },
970
+ {
971
+ "epoch": 2.67,
972
+ "learning_rate": 6.655405405405406e-06,
973
+ "loss": 0.2867,
974
+ "step": 1580
975
+ },
976
+ {
977
+ "epoch": 2.69,
978
+ "learning_rate": 6.317567567567567e-06,
979
+ "loss": 0.3478,
980
+ "step": 1590
981
+ },
982
+ {
983
+ "epoch": 2.7,
984
+ "learning_rate": 5.979729729729729e-06,
985
+ "loss": 0.3609,
986
+ "step": 1600
987
+ },
988
+ {
989
+ "epoch": 2.72,
990
+ "learning_rate": 5.6418918918918914e-06,
991
+ "loss": 0.3591,
992
+ "step": 1610
993
+ },
994
+ {
995
+ "epoch": 2.74,
996
+ "learning_rate": 5.304054054054054e-06,
997
+ "loss": 0.3131,
998
+ "step": 1620
999
+ },
1000
+ {
1001
+ "epoch": 2.75,
1002
+ "learning_rate": 4.9662162162162165e-06,
1003
+ "loss": 0.3224,
1004
+ "step": 1630
1005
+ },
1006
+ {
1007
+ "epoch": 2.77,
1008
+ "learning_rate": 4.628378378378379e-06,
1009
+ "loss": 0.3849,
1010
+ "step": 1640
1011
+ },
1012
+ {
1013
+ "epoch": 2.79,
1014
+ "learning_rate": 4.290540540540541e-06,
1015
+ "loss": 0.4212,
1016
+ "step": 1650
1017
+ },
1018
+ {
1019
+ "epoch": 2.8,
1020
+ "learning_rate": 3.952702702702702e-06,
1021
+ "loss": 0.3889,
1022
+ "step": 1660
1023
+ },
1024
+ {
1025
+ "epoch": 2.82,
1026
+ "learning_rate": 3.614864864864865e-06,
1027
+ "loss": 0.3444,
1028
+ "step": 1670
1029
+ },
1030
+ {
1031
+ "epoch": 2.84,
1032
+ "learning_rate": 3.277027027027027e-06,
1033
+ "loss": 0.3445,
1034
+ "step": 1680
1035
+ },
1036
+ {
1037
+ "epoch": 2.85,
1038
+ "learning_rate": 2.9391891891891893e-06,
1039
+ "loss": 0.3322,
1040
+ "step": 1690
1041
+ },
1042
+ {
1043
+ "epoch": 2.87,
1044
+ "learning_rate": 2.6013513513513514e-06,
1045
+ "loss": 0.3587,
1046
+ "step": 1700
1047
+ },
1048
+ {
1049
+ "epoch": 2.89,
1050
+ "learning_rate": 2.263513513513514e-06,
1051
+ "loss": 0.3127,
1052
+ "step": 1710
1053
+ },
1054
+ {
1055
+ "epoch": 2.91,
1056
+ "learning_rate": 1.9256756756756756e-06,
1057
+ "loss": 0.3552,
1058
+ "step": 1720
1059
+ },
1060
+ {
1061
+ "epoch": 2.92,
1062
+ "learning_rate": 1.587837837837838e-06,
1063
+ "loss": 0.3377,
1064
+ "step": 1730
1065
+ },
1066
+ {
1067
+ "epoch": 2.94,
1068
+ "learning_rate": 1.2499999999999999e-06,
1069
+ "loss": 0.3269,
1070
+ "step": 1740
1071
+ },
1072
+ {
1073
+ "epoch": 2.96,
1074
+ "learning_rate": 9.121621621621622e-07,
1075
+ "loss": 0.3098,
1076
+ "step": 1750
1077
+ },
1078
+ {
1079
+ "epoch": 2.97,
1080
+ "learning_rate": 5.743243243243243e-07,
1081
+ "loss": 0.3506,
1082
+ "step": 1760
1083
+ },
1084
+ {
1085
+ "epoch": 2.99,
1086
+ "learning_rate": 2.3648648648648647e-07,
1087
+ "loss": 0.3808,
1088
+ "step": 1770
1089
+ },
1090
+ {
1091
+ "epoch": 3.0,
1092
+ "eval_accuracy": 0.8971089108910891,
1093
+ "eval_loss": 0.38559553027153015,
1094
+ "eval_runtime": 33.8726,
1095
+ "eval_samples_per_second": 745.441,
1096
+ "eval_steps_per_second": 5.845,
1097
+ "step": 1776
1098
+ },
1099
+ {
1100
+ "epoch": 3.0,
1101
+ "step": 1776,
1102
+ "total_flos": 1.76256801415296e+19,
1103
+ "train_loss": 0.3829323179549999,
1104
+ "train_runtime": 916.6333,
1105
+ "train_samples_per_second": 247.918,
1106
+ "train_steps_per_second": 1.938
1107
+ }
1108
+ ],
1109
+ "max_steps": 1776,
1110
+ "num_train_epochs": 3,
1111
+ "total_flos": 1.76256801415296e+19,
1112
+ "trial_name": null,
1113
+ "trial_params": null
1114
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99375d127eeeb0528bc62baffa4c5e80febdd57a4666587ea2223f1d43d64cf0
+ size 3771