shuheng committed on
Commit
d18c8b0
verified
1 Parent(s): 9c644cb

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,8 @@ license: llama3.2
4
  base_model: meta-llama/Llama-3.2-1B
5
  tags:
6
  - generated_from_trainer
 
 
7
  model-index:
8
  - name: squad_llama_finetuned
9
  results: []
@@ -14,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # squad_llama_finetuned
16
 
17
- This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on an unknown dataset.
18
 
19
  ## Model description
20
 
 
4
  base_model: meta-llama/Llama-3.2-1B
5
  tags:
6
  - generated_from_trainer
7
+ datasets:
8
+ - squad
9
  model-index:
10
  - name: squad_llama_finetuned
11
  results: []
 
16
 
17
  # squad_llama_finetuned
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on the squad dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 2.0,
3
- "eval_exact_match": 0.40681173131504256,
4
- "eval_f1": 4.453179113928262,
5
- "eval_runtime": 250.6555,
6
  "eval_samples": 10787,
7
- "eval_samples_per_second": 43.035,
8
- "eval_steps_per_second": 5.382,
9
  "total_flos": 3.971230131335731e+17,
10
- "train_loss": 5.428521726861854,
11
- "train_runtime": 4331.0324,
12
  "train_samples": 88559,
13
- "train_samples_per_second": 40.895,
14
- "train_steps_per_second": 1.278
15
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "eval_exact_match": 0.3122043519394513,
4
+ "eval_f1": 4.956028770172976,
5
+ "eval_runtime": 247.7335,
6
  "eval_samples": 10787,
7
+ "eval_samples_per_second": 43.543,
8
+ "eval_steps_per_second": 5.445,
9
  "total_flos": 3.971230131335731e+17,
10
+ "train_loss": 5.790556044936869,
11
+ "train_runtime": 4316.8958,
12
  "train_samples": 88559,
13
+ "train_samples_per_second": 41.029,
14
+ "train_steps_per_second": 1.282
15
  }
eval_nbest_predictions.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:428e4e13e4d8c42b1bdd2b22237d109c38dd26e659db53e70fafd794f061c72a
3
- size 51596787
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7556ea17d1e93ba7ca10aaaba07c10da6224304790052cc0cd444714cb00b8c2
3
+ size 51674491
eval_predictions.json CHANGED
The diff for this file is too large to render. See raw diff
 
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.0,
3
- "eval_exact_match": 0.40681173131504256,
4
- "eval_f1": 4.453179113928262,
5
- "eval_runtime": 250.6555,
6
  "eval_samples": 10787,
7
- "eval_samples_per_second": 43.035,
8
- "eval_steps_per_second": 5.382
9
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "eval_exact_match": 0.3122043519394513,
4
+ "eval_f1": 4.956028770172976,
5
+ "eval_runtime": 247.7335,
6
  "eval_samples": 10787,
7
+ "eval_samples_per_second": 43.543,
8
+ "eval_steps_per_second": 5.445
9
  }
runs/Dec13_22-26-50_xgpi3/events.out.tfevents.1734105188.xgpi3.2488284.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45e95bb7c06400ab6ef597c40f38cea4851ad7c584379651b27e8ea0fbbb013f
3
+ size 412
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.0,
3
  "total_flos": 3.971230131335731e+17,
4
- "train_loss": 5.428521726861854,
5
- "train_runtime": 4331.0324,
6
  "train_samples": 88559,
7
- "train_samples_per_second": 40.895,
8
- "train_steps_per_second": 1.278
9
  }
 
1
  {
2
  "epoch": 2.0,
3
  "total_flos": 3.971230131335731e+17,
4
+ "train_loss": 5.790556044936869,
5
+ "train_runtime": 4316.8958,
6
  "train_samples": 88559,
7
+ "train_samples_per_second": 41.029,
8
+ "train_steps_per_second": 1.282
9
  }
trainer_state.json CHANGED
@@ -10,89 +10,89 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.18063583815028902,
13
- "grad_norm": 5.121304512023926,
14
- "learning_rate": 0.0009096820809248554,
15
- "loss": 5.5363,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.36127167630057805,
20
- "grad_norm": 4.908692836761475,
21
- "learning_rate": 0.000819364161849711,
22
- "loss": 5.4577,
23
  "step": 1000
24
  },
25
  {
26
  "epoch": 0.541907514450867,
27
- "grad_norm": 4.932715892791748,
28
- "learning_rate": 0.0007290462427745664,
29
- "loss": 5.434,
30
  "step": 1500
31
  },
32
  {
33
  "epoch": 0.7225433526011561,
34
- "grad_norm": 4.944921970367432,
35
- "learning_rate": 0.0006387283236994221,
36
- "loss": 5.4331,
37
  "step": 2000
38
  },
39
  {
40
  "epoch": 0.903179190751445,
41
- "grad_norm": 5.103818893432617,
42
- "learning_rate": 0.0005484104046242775,
43
- "loss": 5.4312,
44
  "step": 2500
45
  },
46
  {
47
  "epoch": 1.083815028901734,
48
- "grad_norm": 5.026283264160156,
49
- "learning_rate": 0.00045809248554913297,
50
- "loss": 5.4126,
51
  "step": 3000
52
  },
53
  {
54
  "epoch": 1.2644508670520231,
55
- "grad_norm": 4.974510669708252,
56
- "learning_rate": 0.00036777456647398845,
57
- "loss": 5.4098,
58
  "step": 3500
59
  },
60
  {
61
  "epoch": 1.4450867052023122,
62
- "grad_norm": 4.894430160522461,
63
- "learning_rate": 0.00027745664739884393,
64
- "loss": 5.4071,
65
  "step": 4000
66
  },
67
  {
68
  "epoch": 1.6257225433526012,
69
- "grad_norm": 4.973602294921875,
70
- "learning_rate": 0.00018713872832369944,
71
- "loss": 5.4051,
72
  "step": 4500
73
  },
74
  {
75
  "epoch": 1.80635838150289,
76
- "grad_norm": 4.884088039398193,
77
- "learning_rate": 9.682080924855491e-05,
78
- "loss": 5.3928,
79
  "step": 5000
80
  },
81
  {
82
  "epoch": 1.9869942196531793,
83
- "grad_norm": 4.870421409606934,
84
- "learning_rate": 6.502890173410405e-06,
85
- "loss": 5.3965,
86
  "step": 5500
87
  },
88
  {
89
  "epoch": 2.0,
90
  "step": 5536,
91
  "total_flos": 3.971230131335731e+17,
92
- "train_loss": 5.428521726861854,
93
- "train_runtime": 4331.0324,
94
- "train_samples_per_second": 40.895,
95
- "train_steps_per_second": 1.278
96
  }
97
  ],
98
  "logging_steps": 500,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.18063583815028902,
13
+ "grad_norm": 5.165233135223389,
14
+ "learning_rate": 0.009096820809248556,
15
+ "loss": 6.17,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.36127167630057805,
20
+ "grad_norm": 5.07661771774292,
21
+ "learning_rate": 0.00819364161849711,
22
+ "loss": 6.0936,
23
  "step": 1000
24
  },
25
  {
26
  "epoch": 0.541907514450867,
27
+ "grad_norm": 5.147761821746826,
28
+ "learning_rate": 0.007290462427745665,
29
+ "loss": 5.9967,
30
  "step": 1500
31
  },
32
  {
33
  "epoch": 0.7225433526011561,
34
+ "grad_norm": 5.032444953918457,
35
+ "learning_rate": 0.00638728323699422,
36
+ "loss": 5.9309,
37
  "step": 2000
38
  },
39
  {
40
  "epoch": 0.903179190751445,
41
+ "grad_norm": 5.182621479034424,
42
+ "learning_rate": 0.005484104046242775,
43
+ "loss": 5.862,
44
  "step": 2500
45
  },
46
  {
47
  "epoch": 1.083815028901734,
48
+ "grad_norm": 5.072362422943115,
49
+ "learning_rate": 0.00458092485549133,
50
+ "loss": 5.7864,
51
  "step": 3000
52
  },
53
  {
54
  "epoch": 1.2644508670520231,
55
+ "grad_norm": 5.014511585235596,
56
+ "learning_rate": 0.0036777456647398843,
57
+ "loss": 5.7157,
58
  "step": 3500
59
  },
60
  {
61
  "epoch": 1.4450867052023122,
62
+ "grad_norm": 4.894152641296387,
63
+ "learning_rate": 0.0027745664739884392,
64
+ "loss": 5.641,
65
  "step": 4000
66
  },
67
  {
68
  "epoch": 1.6257225433526012,
69
+ "grad_norm": 5.007359027862549,
70
+ "learning_rate": 0.0018713872832369944,
71
+ "loss": 5.5787,
72
  "step": 4500
73
  },
74
  {
75
  "epoch": 1.80635838150289,
76
+ "grad_norm": 4.906238555908203,
77
+ "learning_rate": 0.0009682080924855491,
78
+ "loss": 5.5,
79
  "step": 5000
80
  },
81
  {
82
  "epoch": 1.9869942196531793,
83
+ "grad_norm": 4.897428512573242,
84
+ "learning_rate": 6.502890173410405e-05,
85
+ "loss": 5.4477,
86
  "step": 5500
87
  },
88
  {
89
  "epoch": 2.0,
90
  "step": 5536,
91
  "total_flos": 3.971230131335731e+17,
92
+ "train_loss": 5.790556044936869,
93
+ "train_runtime": 4316.8958,
94
+ "train_samples_per_second": 41.029,
95
+ "train_steps_per_second": 1.282
96
  }
97
  ],
98
  "logging_steps": 500,