anton-l (HF staff) committed on
Commit
172e214
1 Parent(s): e160ef4
optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:06bcda9875d78e84a67823f2816a0b70c9f4ef59eaaf3c751f57fc4c23e1bf7a
- size 4735250

rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d528da29fcd37f6a0dc448517a000f9d27057f065cda48d9c2f61cad3ea082b2
- size 14180

scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:807cda89e32cea0b443893509b253215295b99009aaa01c922bdd6035bfc2f66
- size 1064

src/README.md ADDED
@@ -0,0 +1,20 @@
+ # Educational value classifier
+
+ ### 1. Finetune a model for educational value regression
+
+ * edit `train_edu_bert.slurm`:
+ ```bash
+ --base_model_name="Snowflake/snowflake-arctic-embed-m" \  # BERT-like base model
+ --dataset_name="HuggingFaceTB/LLM_juries_fineweb_430k_annotations" \  # Llama3-annotated educational value dataset
+ --target_column="score"
+ ```
+ * run the training script on a SLURM cluster:
+ ```bash
+ sbatch train_edu_bert.slurm
+ ```
+
+ ### 2. Annotate a dataset with the educational scores predicted by the model
+
+ ```bash
+ sbatch run_edu_bert.slurm
+ ```
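
For reference, a minimal sketch (not part of this commit) of querying the resulting classifier on a single text. It assumes the published `HuggingFaceFW/fineweb-edu-classifier` checkpoint, the default model in `run_edu_bert.py`, and applies the same 0-5 clamping used there:

```python
# Sketch: score one text with the finetuned regression head (assumed checkpoint).
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_id = "HuggingFaceFW/fineweb-edu-classifier"  # default in run_edu_bert.py
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)

text = "Photosynthesis converts light energy into chemical energy stored in sugars."
inputs = tokenizer(text, return_tensors="pt", truncation=True)
with torch.no_grad():
    score = model(**inputs).logits.squeeze().item()  # single regression output
print(round(max(0, min(score, 5))))  # integer educational score in [0, 5]
```
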
src/run_edu_bert.py ADDED
@@ -0,0 +1,52 @@
+ import torch
+ import argparse
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from datasets import load_dataset
+
+
+ def main(args):
+     tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+     model = AutoModelForSequenceClassification.from_pretrained(args.model_name, torch_dtype=torch.bfloat16)
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     dataset = load_dataset(args.dataset_name, args.dataset_config,
+                            split="train", cache_dir="/scratch/cosmo/cache/", num_proc=12)
+     # Keep only this shard's rows: row index modulo num_shards must equal the shard id.
+     dataset = dataset.filter(lambda x, i: i % args.num_shards == args.shard, with_indices=True, num_proc=12)
+
+     def compute_scores(batch):
+         inputs = tokenizer(batch[args.text_column], return_tensors="pt", padding="longest", truncation=True).to(device)
+         with torch.no_grad():
+             outputs = model(**inputs)
+             logits = outputs.logits.squeeze(-1).float().cpu().numpy()
+
+         batch["score"] = logits.tolist()
+         # Integer score clamped to the 0-5 range used during training.
+         batch["int_score"] = [int(round(max(0, min(score, 5)))) for score in logits]
+         return batch
+
+     dataset = dataset.map(compute_scores, batched=True, batch_size=512)
+
+     # Retry until the push succeeds; Hub uploads of large shards can fail transiently.
+     while True:
+         try:
+             config_name = f"{args.output_dataset_config}_{args.shard}"
+             dataset.push_to_hub(args.output_dataset_name, config_name=config_name, private=True, max_shard_size="4096MB")
+             break
+         except Exception as e:
+             print(e)
+             continue
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument("--model_name", type=str, default="HuggingFaceFW/fineweb-edu-classifier")
+     parser.add_argument("--dataset_name", type=str, default="HuggingFaceFW/fineweb")
+     parser.add_argument("--dataset_config", type=str, default="default")
+     parser.add_argument("--output_dataset_name", type=str, default="HuggingFaceFW/fineweb-edu")
+     parser.add_argument("--output_dataset_config", type=str, default="default")
+     parser.add_argument("--text_column", type=str, default="text")
+     parser.add_argument("--shard", type=int, required=True)
+     parser.add_argument("--num_shards", type=int, required=True)
+
+     args = parser.parse_args()
+     main(args)
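
A small sketch (not part of this commit) of what the modulo filter in `main()` does; the shard and num_shards values here are hypothetical, for illustration only:

```python
# Sketch: with num_shards=4 and shard=1, only rows whose index is 1 mod 4 are kept.
from datasets import Dataset

toy = Dataset.from_dict({"text": [f"doc {i}" for i in range(10)]})
num_shards, shard = 4, 1  # hypothetical values
subset = toy.filter(lambda x, i: i % num_shards == shard, with_indices=True)
print(subset["text"])  # ['doc 1', 'doc 5', 'doc 9']
```
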
src/run_edu_bert.slurm ADDED
@@ -0,0 +1,29 @@
+ #!/bin/bash
+ #SBATCH --job-name=run_edu_bert
+ #SBATCH --partition hopper-prod
+ #SBATCH --qos=normal
+ #SBATCH --requeue
+ #SBATCH --nodes=1
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=12
+ #SBATCH --mem-per-cpu=20G
+ #SBATCH --gpus=1
+ #SBATCH -o %x_%j.out
+ #SBATCH -e %x_%j.err
+ #SBATCH --time=7-00:00:00
+ #SBATCH --array=0,1,2,3,4,6,7,8,26,29,31%32
+
+ set -x -e
+ source ~/.bashrc
+ source "/admin/home/anton/miniforge3/etc/profile.d/conda.sh"
+ source activate lighteval
+
+ python run_edu_bert.py \
+     --model_name="HuggingFaceFW/fineweb-edu-classifier" \
+     --dataset_name="HuggingFaceFW/fineweb" \
+     --dataset_config="CC-MAIN-2019-04" \
+     --output_dataset_name="HuggingFaceFW/fineweb-edu-annotations" \
+     --output_dataset_config="CC-MAIN-2019-04" \
+     --text_column="text" \
+     --shard ${SLURM_ARRAY_TASK_ID} \
+     --num_shards 32
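
Each array task pushes its shard to a separate dataset config named `CC-MAIN-2019-04_<shard>`. A sketch (not part of this commit, and assuming all 32 shard configs have been pushed and are accessible) of loading them back and concatenating:

```python
# Sketch: reassemble the per-shard configs produced by run_edu_bert.py.
from datasets import load_dataset, concatenate_datasets

num_shards = 32  # matches --num_shards above
shards = [
    load_dataset("HuggingFaceFW/fineweb-edu-annotations", f"CC-MAIN-2019-04_{i}", split="train")
    for i in range(num_shards)
]
annotated = concatenate_datasets(shards)
print(annotated)
```
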
src/train_edu_bert.py ADDED
@@ -0,0 +1,118 @@
+ from transformers import (
+     AutoTokenizer,
+     DataCollatorWithPadding,
+     TrainingArguments,
+     Trainer,
+     AutoModelForSequenceClassification,
+ )
+ from datasets import load_dataset, ClassLabel
+ import numpy as np
+ import evaluate
+ import argparse
+ import os
+ from sklearn.metrics import classification_report, confusion_matrix
+
+
+ def compute_metrics(eval_pred):
+     precision_metric = evaluate.load("precision")
+     recall_metric = evaluate.load("recall")
+     f1_metric = evaluate.load("f1")
+     accuracy_metric = evaluate.load("accuracy")
+
+     logits, labels = eval_pred
+     # Round the regression outputs to the nearest integer class in [0, 5] before scoring.
+     preds = np.round(logits.squeeze()).clip(0, 5).astype(int)
+     labels = np.round(labels.squeeze()).astype(int)
+     precision = precision_metric.compute(
+         predictions=preds, references=labels, average="macro"
+     )["precision"]
+     recall = recall_metric.compute(
+         predictions=preds, references=labels, average="macro"
+     )["recall"]
+     f1 = f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"]
+     accuracy = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]
+
+     report = classification_report(labels, preds)
+     cm = confusion_matrix(labels, preds)
+     print("Validation Report:\n" + report)
+     print("Confusion Matrix:\n" + str(cm))
+
+     return {
+         "precision": precision,
+         "recall": recall,
+         "f1_macro": f1,
+         "accuracy": accuracy,
+     }
+
+
+ def main(args):
+     dataset = load_dataset(
+         args.dataset_name, split="train", cache_dir="/scratch/cosmo/cache/", num_proc=8
+     )
+     dataset = dataset.map(
+         lambda x: {args.target_column: np.clip(int(x[args.target_column]), 0, 5)}, num_proc=8
+     )
+
+     # Cast the target to ClassLabel so the train/test split can be stratified on it.
+     dataset = dataset.cast_column(
+         args.target_column, ClassLabel(names=[str(i) for i in range(6)])
+     )
+     dataset = dataset.train_test_split(
+         train_size=0.9, seed=42, stratify_by_column=args.target_column
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(args.base_model_name)
+
+     def preprocess(examples):
+         batch = tokenizer(examples["text"], truncation=True)
+         batch["labels"] = np.float32(examples[args.target_column])
+         return batch
+
+     dataset = dataset.map(preprocess, batched=True)
+     data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+     # num_labels=1 turns the classification head into a single-output regression head.
+     model = AutoModelForSequenceClassification.from_pretrained(args.base_model_name, num_labels=1, classifier_dropout=0.0, hidden_dropout_prob=0.0)
+
+     # Freeze the embeddings and encoder so only the regression head is trained.
+     for param in model.bert.embeddings.parameters():
+         param.requires_grad = False
+     for param in model.bert.encoder.parameters():
+         param.requires_grad = False
+
+     training_args = TrainingArguments(
+         output_dir=args.checkpoint_dir,
+         evaluation_strategy="steps",
+         save_strategy="steps",
+         eval_steps=1000,
+         save_steps=1000,
+         logging_steps=100,
+         learning_rate=3e-4,
+         num_train_epochs=20,
+         seed=0,
+         per_device_train_batch_size=256,
+         per_device_eval_batch_size=128,
+         load_best_model_at_end=True,
+         metric_for_best_model="f1_macro",
+         greater_is_better=True,
+         bf16=True,
+     )
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=dataset["train"],
+         eval_dataset=dataset["test"],
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+         compute_metrics=compute_metrics,
+     )
+
+     trainer.train()
+     trainer.save_model(os.path.join(args.checkpoint_dir, "final"))
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--base_model_name", type=str, default="Snowflake/snowflake-arctic-embed-m")
+     parser.add_argument("--dataset_name", type=str, default="HuggingFaceTB/llama3_edu_500k_binary_labels")
+     parser.add_argument("--target_column", type=str, default="score")
+     parser.add_argument("--checkpoint_dir", type=str, default="/fsx/anton/cosmopedia/edu_score/bert_snowflake_regression")
+     args = parser.parse_args()
+
+     main(args)
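
Because the script freezes `bert.embeddings` and `bert.encoder`, only the regression head is updated during training. A quick sketch (not part of this commit) to confirm which parameters stay trainable, assuming the same base model as above:

```python
# Sketch: list trainable parameters after freezing, as in train_edu_bert.py.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "Snowflake/snowflake-arctic-embed-m", num_labels=1
)
for param in model.bert.embeddings.parameters():
    param.requires_grad = False
for param in model.bert.encoder.parameters():
    param.requires_grad = False

trainable = [name for name, p in model.named_parameters() if p.requires_grad]
print(trainable)  # expected: only the pooler and classifier (regression head) parameters
```
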
src/train_edu_bert.slurm ADDED
@@ -0,0 +1,22 @@
+ #!/bin/bash
+ #SBATCH --job-name=train_edu_bert
+ #SBATCH --partition hopper-prod
+ #SBATCH --nodes=1
+ #SBATCH --ntasks-per-node=1
+ #SBATCH --cpus-per-task=16
+ #SBATCH --mem-per-cpu=20G
+ #SBATCH --gpus=1
+ #SBATCH -o %x_%j.out
+ #SBATCH -e %x_%j.err
+ #SBATCH --time=1-00:00:00
+
+ set -x -e
+ source ~/.bashrc
+ source "/admin/home/anton/miniforge3/etc/profile.d/conda.sh"
+ source activate lighteval
+
+ python train_edu_bert.py \
+     --base_model_name="Snowflake/snowflake-arctic-embed-m" \
+     --dataset_name="HuggingFaceTB/LLM_juries_fineweb_430k_annotations" \
+     --target_column="median_score" \
+     --checkpoint_dir="/fsx/anton/cosmopedia/edu_score/snowflake_regression_median_jury"

trainer_state.json DELETED
@@ -1,2235 +0,0 @@
- {
-   "best_metric": 0.4959826756763837,
-   "best_model_checkpoint": "/fsx/anton/cosmopedia/edu_score/bert_snowflake_regression_4/checkpoint-27000",
-   "epoch": 16.383495145631066,
-   "eval_steps": 1000,
-   "global_step": 27000,
-   "is_hyper_param_search": false,
-   "is_local_process_zero": true,
-   "is_world_process_zero": true,
-   "log_history": [ ... training loss/learning-rate entries every 100 steps and eval metrics (accuracy, macro precision/recall/F1) every 1000 steps through global step 27000; the 2,235-line log is truncated in this view ... ]
- }
1873
- "grad_norm": 0.789397656917572,
1874
- "learning_rate": 9.156553398058251e-05,
1875
- "loss": 0.2401,
1876
- "step": 22900
1877
- },
1878
- {
1879
- "epoch": 13.95631067961165,
1880
- "grad_norm": 0.9811580777168274,
1881
- "learning_rate": 9.065533980582523e-05,
1882
- "loss": 0.2429,
1883
- "step": 23000
1884
- },
1885
- {
1886
- "epoch": 13.95631067961165,
1887
- "eval_accuracy": 0.7146819723899546,
1888
- "eval_f1_macro": 0.4950660124078677,
1889
- "eval_loss": 0.2540215849876404,
1890
- "eval_precision": 0.6269585302337652,
1891
- "eval_recall": 0.4640707919907739,
1892
- "eval_runtime": 63.4303,
1893
- "eval_samples_per_second": 738.874,
1894
- "eval_steps_per_second": 5.786,
1895
- "step": 23000
1896
- },
1897
- {
1898
- "epoch": 14.016990291262136,
1899
- "grad_norm": 0.6500277519226074,
1900
- "learning_rate": 8.974514563106796e-05,
1901
- "loss": 0.2399,
1902
- "step": 23100
1903
- },
1904
- {
1905
- "epoch": 14.077669902912621,
1906
- "grad_norm": 0.7648112177848816,
1907
- "learning_rate": 8.883495145631067e-05,
1908
- "loss": 0.2355,
1909
- "step": 23200
1910
- },
1911
- {
1912
- "epoch": 14.138349514563107,
1913
- "grad_norm": 0.6670911312103271,
1914
- "learning_rate": 8.792475728155339e-05,
1915
- "loss": 0.2371,
1916
- "step": 23300
1917
- },
1918
- {
1919
- "epoch": 14.199029126213592,
1920
- "grad_norm": 0.8316827416419983,
1921
- "learning_rate": 8.70145631067961e-05,
1922
- "loss": 0.2355,
1923
- "step": 23400
1924
- },
1925
- {
1926
- "epoch": 14.259708737864077,
1927
- "grad_norm": 0.4973256587982178,
1928
- "learning_rate": 8.610436893203884e-05,
1929
- "loss": 0.24,
1930
- "step": 23500
1931
- },
1932
- {
1933
- "epoch": 14.320388349514563,
1934
- "grad_norm": 0.6986468434333801,
1935
- "learning_rate": 8.519417475728155e-05,
1936
- "loss": 0.2319,
1937
- "step": 23600
1938
- },
1939
- {
1940
- "epoch": 14.381067961165048,
1941
- "grad_norm": 0.5316904783248901,
1942
- "learning_rate": 8.428398058252427e-05,
1943
- "loss": 0.2334,
1944
- "step": 23700
1945
- },
1946
- {
1947
- "epoch": 14.441747572815533,
1948
- "grad_norm": 0.5285237431526184,
1949
- "learning_rate": 8.337378640776698e-05,
1950
- "loss": 0.2372,
1951
- "step": 23800
1952
- },
1953
- {
1954
- "epoch": 14.50242718446602,
1955
- "grad_norm": 0.7617068290710449,
1956
- "learning_rate": 8.24635922330097e-05,
1957
- "loss": 0.2434,
1958
- "step": 23900
1959
- },
1960
- {
1961
- "epoch": 14.563106796116505,
1962
- "grad_norm": 0.4870689809322357,
1963
- "learning_rate": 8.155339805825241e-05,
1964
- "loss": 0.2388,
1965
- "step": 24000
1966
- },
1967
- {
1968
- "epoch": 14.563106796116505,
1969
- "eval_accuracy": 0.7128896665030832,
1970
- "eval_f1_macro": 0.48631927316876294,
1971
- "eval_loss": 0.2543109357357025,
1972
- "eval_precision": 0.6364589670810381,
1973
- "eval_recall": 0.451184525833245,
1974
- "eval_runtime": 63.5544,
1975
- "eval_samples_per_second": 737.432,
1976
- "eval_steps_per_second": 5.775,
1977
- "step": 24000
1978
- },
1979
- {
1980
- "epoch": 14.62378640776699,
1981
- "grad_norm": 0.48867112398147583,
1982
- "learning_rate": 8.064320388349515e-05,
1983
- "loss": 0.2424,
1984
- "step": 24100
1985
- },
1986
- {
1987
- "epoch": 14.684466019417476,
1988
- "grad_norm": 1.3717137575149536,
1989
- "learning_rate": 7.973300970873786e-05,
1990
- "loss": 0.2373,
1991
- "step": 24200
1992
- },
1993
- {
1994
- "epoch": 14.745145631067961,
1995
- "grad_norm": 0.5244446396827698,
1996
- "learning_rate": 7.882281553398058e-05,
1997
- "loss": 0.2406,
1998
- "step": 24300
1999
- },
2000
- {
2001
- "epoch": 14.805825242718447,
2002
- "grad_norm": 0.49141696095466614,
2003
- "learning_rate": 7.791262135922329e-05,
2004
- "loss": 0.2345,
2005
- "step": 24400
2006
- },
2007
- {
2008
- "epoch": 14.866504854368932,
2009
- "grad_norm": 0.5758472681045532,
2010
- "learning_rate": 7.700242718446602e-05,
2011
- "loss": 0.2372,
2012
- "step": 24500
2013
- },
2014
- {
2015
- "epoch": 14.927184466019417,
2016
- "grad_norm": 0.4706755578517914,
2017
- "learning_rate": 7.609223300970873e-05,
2018
- "loss": 0.2363,
2019
- "step": 24600
2020
- },
2021
- {
2022
- "epoch": 14.987864077669903,
2023
- "grad_norm": 1.1661343574523926,
2024
- "learning_rate": 7.518203883495146e-05,
2025
- "loss": 0.2363,
2026
- "step": 24700
2027
- },
2028
- {
2029
- "epoch": 15.048543689320388,
2030
- "grad_norm": 0.6530236005783081,
2031
- "learning_rate": 7.427184466019417e-05,
2032
- "loss": 0.2371,
2033
- "step": 24800
2034
- },
2035
- {
2036
- "epoch": 15.109223300970873,
2037
- "grad_norm": 0.6235638856887817,
2038
- "learning_rate": 7.336165048543688e-05,
2039
- "loss": 0.2365,
2040
- "step": 24900
2041
- },
2042
- {
2043
- "epoch": 15.169902912621358,
2044
- "grad_norm": 1.0099172592163086,
2045
- "learning_rate": 7.24514563106796e-05,
2046
- "loss": 0.2333,
2047
- "step": 25000
2048
- },
2049
- {
2050
- "epoch": 15.169902912621358,
2051
- "eval_accuracy": 0.7117374698615231,
2052
- "eval_f1_macro": 0.4930065435769275,
2053
- "eval_loss": 0.25366273522377014,
2054
- "eval_precision": 0.6254110142417604,
2055
- "eval_recall": 0.4636190323809964,
2056
- "eval_runtime": 63.6634,
2057
- "eval_samples_per_second": 736.168,
2058
- "eval_steps_per_second": 5.765,
2059
- "step": 25000
2060
- },
2061
- {
2062
- "epoch": 15.230582524271846,
2063
- "grad_norm": 0.4479919672012329,
2064
- "learning_rate": 7.154126213592233e-05,
2065
- "loss": 0.2363,
2066
- "step": 25100
2067
- },
2068
- {
2069
- "epoch": 15.29126213592233,
2070
- "grad_norm": 0.5574977993965149,
2071
- "learning_rate": 7.063106796116504e-05,
2072
- "loss": 0.2323,
2073
- "step": 25200
2074
- },
2075
- {
2076
- "epoch": 15.351941747572816,
2077
- "grad_norm": 0.815531849861145,
2078
- "learning_rate": 6.972087378640776e-05,
2079
- "loss": 0.2378,
2080
- "step": 25300
2081
- },
2082
- {
2083
- "epoch": 15.412621359223301,
2084
- "grad_norm": 0.47219938039779663,
2085
- "learning_rate": 6.881067961165048e-05,
2086
- "loss": 0.2363,
2087
- "step": 25400
2088
- },
2089
- {
2090
- "epoch": 15.473300970873787,
2091
- "grad_norm": 1.4584532976150513,
2092
- "learning_rate": 6.79004854368932e-05,
2093
- "loss": 0.2323,
2094
- "step": 25500
2095
- },
2096
- {
2097
- "epoch": 15.533980582524272,
2098
- "grad_norm": 0.5099394917488098,
2099
- "learning_rate": 6.699029126213592e-05,
2100
- "loss": 0.2343,
2101
- "step": 25600
2102
- },
2103
- {
2104
- "epoch": 15.594660194174757,
2105
- "grad_norm": 0.624839186668396,
2106
- "learning_rate": 6.608009708737864e-05,
2107
- "loss": 0.2357,
2108
- "step": 25700
2109
- },
2110
- {
2111
- "epoch": 15.655339805825243,
2112
- "grad_norm": 0.9318602085113525,
2113
- "learning_rate": 6.516990291262135e-05,
2114
- "loss": 0.2331,
2115
- "step": 25800
2116
- },
2117
- {
2118
- "epoch": 15.716019417475728,
2119
- "grad_norm": 0.6715418100357056,
2120
- "learning_rate": 6.425970873786407e-05,
2121
- "loss": 0.2365,
2122
- "step": 25900
2123
- },
2124
- {
2125
- "epoch": 15.776699029126213,
2126
- "grad_norm": 0.48603758215904236,
2127
- "learning_rate": 6.334951456310678e-05,
2128
- "loss": 0.2392,
2129
- "step": 26000
2130
- },
2131
- {
2132
- "epoch": 15.776699029126213,
2133
- "eval_accuracy": 0.7145326135660486,
2134
- "eval_f1_macro": 0.49345940490874235,
2135
- "eval_loss": 0.2531893253326416,
2136
- "eval_precision": 0.6264577154052439,
2137
- "eval_recall": 0.4631683363045007,
2138
- "eval_runtime": 63.9314,
2139
- "eval_samples_per_second": 733.082,
2140
- "eval_steps_per_second": 5.741,
2141
- "step": 26000
2142
- },
2143
- {
2144
- "epoch": 15.837378640776699,
2145
- "grad_norm": 1.03753662109375,
2146
- "learning_rate": 6.243932038834951e-05,
2147
- "loss": 0.234,
2148
- "step": 26100
2149
- },
2150
- {
2151
- "epoch": 15.898058252427184,
2152
- "grad_norm": 0.7353742718696594,
2153
- "learning_rate": 6.152912621359223e-05,
2154
- "loss": 0.2381,
2155
- "step": 26200
2156
- },
2157
- {
2158
- "epoch": 15.95873786407767,
2159
- "grad_norm": 0.8260138034820557,
2160
- "learning_rate": 6.061893203883495e-05,
2161
- "loss": 0.2394,
2162
- "step": 26300
2163
- },
2164
- {
2165
- "epoch": 16.019417475728154,
2166
- "grad_norm": 0.7960408926010132,
2167
- "learning_rate": 5.9708737864077663e-05,
2168
- "loss": 0.2377,
2169
- "step": 26400
2170
- },
2171
- {
2172
- "epoch": 16.08009708737864,
2173
- "grad_norm": 0.5640716552734375,
2174
- "learning_rate": 5.879854368932038e-05,
2175
- "loss": 0.2374,
2176
- "step": 26500
2177
- },
2178
- {
2179
- "epoch": 16.140776699029125,
2180
- "grad_norm": 0.8281972408294678,
2181
- "learning_rate": 5.78883495145631e-05,
2182
- "loss": 0.2369,
2183
- "step": 26600
2184
- },
2185
- {
2186
- "epoch": 16.20145631067961,
2187
- "grad_norm": 0.49466079473495483,
2188
- "learning_rate": 5.697815533980582e-05,
2189
- "loss": 0.2352,
2190
- "step": 26700
2191
- },
2192
- {
2193
- "epoch": 16.262135922330096,
2194
- "grad_norm": 0.5278394818305969,
2195
- "learning_rate": 5.6067961165048536e-05,
2196
- "loss": 0.2321,
2197
- "step": 26800
2198
- },
2199
- {
2200
- "epoch": 16.32281553398058,
2201
- "grad_norm": 1.5943635702133179,
2202
- "learning_rate": 5.515776699029126e-05,
2203
- "loss": 0.2368,
2204
- "step": 26900
2205
- },
2206
- {
2207
- "epoch": 16.383495145631066,
2208
- "grad_norm": 0.7204076647758484,
2209
- "learning_rate": 5.4247572815533976e-05,
2210
- "loss": 0.2344,
2211
- "step": 27000
2212
- },
2213
- {
2214
- "epoch": 16.383495145631066,
2215
- "eval_accuracy": 0.7135511127232381,
2216
- "eval_f1_macro": 0.4959826756763837,
2217
- "eval_loss": 0.2531072497367859,
2218
- "eval_precision": 0.5967354989597543,
2219
- "eval_recall": 0.4670103239766295,
2220
- "eval_runtime": 63.8776,
2221
- "eval_samples_per_second": 733.7,
2222
- "eval_steps_per_second": 5.745,
2223
- "step": 27000
2224
- }
2225
- ],
2226
- "logging_steps": 100,
2227
- "max_steps": 32960,
2228
- "num_input_tokens_seen": 0,
2229
- "num_train_epochs": 20,
2230
- "save_steps": 1000,
2231
- "total_flos": 1.8182410386600591e+18,
2232
- "train_batch_size": 256,
2233
- "trial_name": null,
2234
- "trial_params": null
2235
- }
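
The deleted `trainer_state.json` above is the Trainer's running log: a `log_history` array of per-step entries (training `loss`, `grad_norm`, `learning_rate`) interleaved with evaluation entries (`eval_accuracy`, `eval_f1_macro`, etc.), plus top-level settings such as `logging_steps`, `max_steps`, and `num_train_epochs`. A minimal sketch for inspecting such a file is shown below; the local path is an assumption for illustration, not part of this commit.

```python
import json

# Assumed local copy of a trainer_state.json checkpoint file (hypothetical path).
with open("trainer_state.json") as f:
    state = json.load(f)

# Evaluation entries are the log_history items that carry eval_* keys.
eval_entries = [e for e in state["log_history"] if "eval_accuracy" in e]

for entry in eval_entries:
    print(
        f'step {entry["step"]}: '
        f'accuracy={entry["eval_accuracy"]:.4f}, '
        f'macro F1={entry["eval_f1_macro"]:.4f}, '
        f'eval loss={entry["eval_loss"]:.4f}'
    )
```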