chansung committed on
Commit
3a2eccc
1 Parent(s): d90f718

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,13 @@
2
  license: gemma
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
 
8
  - generated_from_trainer
9
  base_model: google/gemma-2b
10
  datasets:
11
- - llama-duo/synth_summarize_dataset_dedup
12
  model-index:
13
  - name: gemma2b-summarize-gemini1_5flash-256k
14
  results: []
@@ -19,9 +19,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # gemma2b-summarize-gemini1_5flash-256k
21
 
22
- This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the llama-duo/synth_summarize_dataset_dedup dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 2.5669
25
 
26
  ## Model description
27
 
@@ -67,7 +67,7 @@ The following hyperparameters were used during training:
67
  | 0.8464 | 6.9976 | 1452 | 2.5513 |
68
  | 0.8353 | 8.0 | 1660 | 2.5615 |
69
  | 0.8267 | 8.9976 | 1867 | 2.5674 |
70
- | 0.827 | 9.9759 | 2070 | 2.5669 |
71
 
72
 
73
  ### Framework versions
 
2
  license: gemma
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
+ - alignment-handbook
8
  - generated_from_trainer
9
  base_model: google/gemma-2b
10
  datasets:
11
+ - generator
12
  model-index:
13
  - name: gemma2b-summarize-gemini1_5flash-256k
14
  results: []
 
19
 
20
  # gemma2b-summarize-gemini1_5flash-256k
21
 
22
+ This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
+ - Loss: 2.5681
25
 
26
  ## Model description
27
 
 
67
  | 0.8464 | 6.9976 | 1452 | 2.5513 |
68
  | 0.8353 | 8.0 | 1660 | 2.5615 |
69
  | 0.8267 | 8.9976 | 1867 | 2.5674 |
70
+ | 0.8289 | 9.9976 | 2070 | 2.5681 |
71
 
72
 
73
  ### Framework versions
adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "up_proj",
24
  "o_proj",
25
- "gate_proj",
26
  "k_proj",
27
- "down_proj",
28
  "v_proj",
29
- "q_proj"
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "o_proj",
 
24
  "k_proj",
25
+ "gate_proj",
26
  "v_proj",
27
+ "down_proj",
28
+ "q_proj",
29
+ "up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:465c1b3666bad23b46105166da89ba228655adfb0e551b2809da6f6c1f2df5f3
3
  size 78480320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:778b553eda2c7f1735b85277d6ed5dd9bc1d8df6648e3ab13383249ff9a901a0
3
  size 78480320
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 9.975903614457831,
3
  "eval_loss": 2.566892147064209,
4
  "eval_runtime": 0.495,
5
  "eval_samples": 25,
6
  "eval_samples_per_second": 20.203,
7
  "eval_steps_per_second": 2.02,
8
  "total_flos": 3.290190024938619e+18,
9
- "train_loss": 0.9333097650233099,
10
- "train_runtime": 14306.303,
11
  "train_samples": 253412,
12
- "train_samples_per_second": 18.532,
13
- "train_steps_per_second": 0.145
14
  }
 
1
  {
2
+ "epoch": 9.997590361445782,
3
  "eval_loss": 2.566892147064209,
4
  "eval_runtime": 0.495,
5
  "eval_samples": 25,
6
  "eval_samples_per_second": 20.203,
7
  "eval_steps_per_second": 2.02,
8
  "total_flos": 3.290190024938619e+18,
9
+ "train_loss": 0.02805145143886695,
10
+ "train_runtime": 534.5593,
11
  "train_samples": 253412,
12
+ "train_samples_per_second": 495.96,
13
+ "train_steps_per_second": 3.872
14
  }
runs/Jun10_15-56-44_6511c8cdb6e6/events.out.tfevents.1718035040.6511c8cdb6e6.3078.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe2d556b664b2e6d4aeb253ad5f3304790e343fb732889c7fc5c0e31cd2629da
3
+ size 9187
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 9.975903614457831,
3
  "total_flos": 3.290190024938619e+18,
4
- "train_loss": 0.9333097650233099,
5
- "train_runtime": 14306.303,
6
  "train_samples": 253412,
7
- "train_samples_per_second": 18.532,
8
- "train_steps_per_second": 0.145
9
  }
 
1
  {
2
+ "epoch": 9.997590361445782,
3
  "total_flos": 3.290190024938619e+18,
4
+ "train_loss": 0.02805145143886695,
5
+ "train_runtime": 534.5593,
6
  "train_samples": 253412,
7
+ "train_samples_per_second": 495.96,
8
+ "train_steps_per_second": 3.872
9
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.975903614457831,
5
  "eval_steps": 500,
6
  "global_step": 2070,
7
  "is_hyper_param_search": false,
@@ -2888,119 +2888,119 @@
2888
  "step": 2000
2889
  },
2890
  {
2891
- "epoch": 9.662650602409638,
2892
- "grad_norm": 0.19921875,
2893
  "learning_rate": 6.001169527811268e-07,
2894
- "loss": 0.8274,
2895
  "step": 2005
2896
  },
2897
  {
2898
- "epoch": 9.686746987951807,
2899
  "grad_norm": 0.20703125,
2900
  "learning_rate": 5.114180081645214e-07,
2901
- "loss": 0.8303,
2902
  "step": 2010
2903
  },
2904
  {
2905
- "epoch": 9.710843373493976,
2906
- "grad_norm": 0.212890625,
2907
  "learning_rate": 4.2979175500050817e-07,
2908
- "loss": 0.8301,
2909
  "step": 2015
2910
  },
2911
  {
2912
- "epoch": 9.734939759036145,
2913
- "grad_norm": 0.201171875,
2914
  "learning_rate": 3.552439961389431e-07,
2915
- "loss": 0.829,
2916
  "step": 2020
2917
  },
2918
  {
2919
- "epoch": 9.759036144578314,
2920
- "grad_norm": 0.212890625,
2921
  "learning_rate": 2.877800312160783e-07,
2922
- "loss": 0.8267,
2923
  "step": 2025
2924
  },
2925
  {
2926
- "epoch": 9.783132530120483,
2927
- "grad_norm": 0.20703125,
2928
  "learning_rate": 2.274046562778409e-07,
2929
- "loss": 0.8285,
2930
  "step": 2030
2931
  },
2932
  {
2933
- "epoch": 9.80722891566265,
2934
- "grad_norm": 0.20703125,
2935
  "learning_rate": 1.7412216343885014e-07,
2936
- "loss": 0.8311,
2937
  "step": 2035
2938
  },
2939
  {
2940
- "epoch": 9.831325301204819,
2941
- "grad_norm": 0.2119140625,
2942
  "learning_rate": 1.2793634057732818e-07,
2943
- "loss": 0.8311,
2944
  "step": 2040
2945
  },
2946
  {
2947
- "epoch": 9.855421686746988,
2948
- "grad_norm": 0.2109375,
2949
  "learning_rate": 8.885047106578227e-08,
2950
- "loss": 0.83,
2951
  "step": 2045
2952
  },
2953
  {
2954
- "epoch": 9.879518072289157,
2955
- "grad_norm": 0.205078125,
2956
  "learning_rate": 5.6867333537580226e-08,
2957
- "loss": 0.8322,
2958
  "step": 2050
2959
  },
2960
  {
2961
- "epoch": 9.903614457831326,
2962
- "grad_norm": 0.21484375,
2963
  "learning_rate": 3.1989201689452967e-08,
2964
- "loss": 0.8305,
2965
  "step": 2055
2966
  },
2967
  {
2968
- "epoch": 9.927710843373493,
2969
  "grad_norm": 0.2080078125,
2970
  "learning_rate": 1.4217844119857048e-08,
2971
- "loss": 0.8293,
2972
  "step": 2060
2973
  },
2974
  {
2975
- "epoch": 9.951807228915662,
2976
- "grad_norm": 0.2041015625,
2977
  "learning_rate": 3.554524203175369e-09,
2978
- "loss": 0.8287,
2979
  "step": 2065
2980
  },
2981
  {
2982
- "epoch": 9.975903614457831,
2983
- "grad_norm": 0.2177734375,
2984
  "learning_rate": 0.0,
2985
- "loss": 0.827,
2986
  "step": 2070
2987
  },
2988
  {
2989
- "epoch": 9.975903614457831,
2990
- "eval_loss": 2.566892147064209,
2991
- "eval_runtime": 0.4844,
2992
- "eval_samples_per_second": 20.643,
2993
- "eval_steps_per_second": 2.064,
2994
  "step": 2070
2995
  },
2996
  {
2997
- "epoch": 9.975903614457831,
2998
  "step": 2070,
2999
  "total_flos": 3.290190024938619e+18,
3000
- "train_loss": 0.9333097650233099,
3001
- "train_runtime": 14306.303,
3002
- "train_samples_per_second": 18.532,
3003
- "train_steps_per_second": 0.145
3004
  }
3005
  ],
3006
  "logging_steps": 5,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.997590361445782,
5
  "eval_steps": 500,
6
  "global_step": 2070,
7
  "is_hyper_param_search": false,
 
2888
  "step": 2000
2889
  },
2890
  {
2891
+ "epoch": 9.684337349397591,
2892
+ "grad_norm": 0.2099609375,
2893
  "learning_rate": 6.001169527811268e-07,
2894
+ "loss": 0.8287,
2895
  "step": 2005
2896
  },
2897
  {
2898
+ "epoch": 9.708433734939758,
2899
  "grad_norm": 0.20703125,
2900
  "learning_rate": 5.114180081645214e-07,
2901
+ "loss": 0.8321,
2902
  "step": 2010
2903
  },
2904
  {
2905
+ "epoch": 9.732530120481927,
2906
+ "grad_norm": 0.19921875,
2907
  "learning_rate": 4.2979175500050817e-07,
2908
+ "loss": 0.8297,
2909
  "step": 2015
2910
  },
2911
  {
2912
+ "epoch": 9.756626506024096,
2913
+ "grad_norm": 0.228515625,
2914
  "learning_rate": 3.552439961389431e-07,
2915
+ "loss": 0.8235,
2916
  "step": 2020
2917
  },
2918
  {
2919
+ "epoch": 9.780722891566265,
2920
+ "grad_norm": 0.203125,
2921
  "learning_rate": 2.877800312160783e-07,
2922
+ "loss": 0.8302,
2923
  "step": 2025
2924
  },
2925
  {
2926
+ "epoch": 9.804819277108434,
2927
+ "grad_norm": 0.2109375,
2928
  "learning_rate": 2.274046562778409e-07,
2929
+ "loss": 0.8331,
2930
  "step": 2030
2931
  },
2932
  {
2933
+ "epoch": 9.828915662650603,
2934
+ "grad_norm": 0.2119140625,
2935
  "learning_rate": 1.7412216343885014e-07,
2936
+ "loss": 0.8281,
2937
  "step": 2035
2938
  },
2939
  {
2940
+ "epoch": 9.85301204819277,
2941
+ "grad_norm": 0.2080078125,
2942
  "learning_rate": 1.2793634057732818e-07,
2943
+ "loss": 0.8321,
2944
  "step": 2040
2945
  },
2946
  {
2947
+ "epoch": 9.87710843373494,
2948
+ "grad_norm": 0.212890625,
2949
  "learning_rate": 8.885047106578227e-08,
2950
+ "loss": 0.832,
2951
  "step": 2045
2952
  },
2953
  {
2954
+ "epoch": 9.901204819277108,
2955
+ "grad_norm": 0.2080078125,
2956
  "learning_rate": 5.6867333537580226e-08,
2957
+ "loss": 0.8308,
2958
  "step": 2050
2959
  },
2960
  {
2961
+ "epoch": 9.925301204819277,
2962
+ "grad_norm": 0.2001953125,
2963
  "learning_rate": 3.1989201689452967e-08,
2964
+ "loss": 0.8283,
2965
  "step": 2055
2966
  },
2967
  {
2968
+ "epoch": 9.949397590361446,
2969
  "grad_norm": 0.2080078125,
2970
  "learning_rate": 1.4217844119857048e-08,
2971
+ "loss": 0.8307,
2972
  "step": 2060
2973
  },
2974
  {
2975
+ "epoch": 9.973493975903615,
2976
+ "grad_norm": 0.2080078125,
2977
  "learning_rate": 3.554524203175369e-09,
2978
+ "loss": 0.8252,
2979
  "step": 2065
2980
  },
2981
  {
2982
+ "epoch": 9.997590361445782,
2983
+ "grad_norm": 0.203125,
2984
  "learning_rate": 0.0,
2985
+ "loss": 0.8289,
2986
  "step": 2070
2987
  },
2988
  {
2989
+ "epoch": 9.997590361445782,
2990
+ "eval_loss": 2.5681488513946533,
2991
+ "eval_runtime": 0.4835,
2992
+ "eval_samples_per_second": 20.683,
2993
+ "eval_steps_per_second": 2.068,
2994
  "step": 2070
2995
  },
2996
  {
2997
+ "epoch": 9.997590361445782,
2998
  "step": 2070,
2999
  "total_flos": 3.290190024938619e+18,
3000
+ "train_loss": 0.02805145143886695,
3001
+ "train_runtime": 534.5593,
3002
+ "train_samples_per_second": 495.96,
3003
+ "train_steps_per_second": 3.872
3004
  }
3005
  ],
3006
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d0e8578086c62d31c0054e61eeb3aad589514780d2ee582bbd68270ce238f7b
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78a14829f27b48698fad520fcdb9f80df4e43a3e7e4532f4e22f6580ed774eb9
3
  size 5304