EtashGuha committed on
Commit
7719586
·
verified ·
1 Parent(s): 0747036

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # sft_a1_codeelo__Qwen3-8B
18
 
19
- This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on the /e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--exp_rpt_codeelo-v2_10k_glm_4.7_traces_jupiter_upsampled_10k/snapshots/3ca27692cf3d7f3fa6ed3b83e00b3df43ad80fdc_thinking_preprocessed dataset.
20
 
21
  ## Model description
22
 
 
16
 
17
  # sft_a1_codeelo__Qwen3-8B
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on the /e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--exp_rpt_codeelo-v2_10k_glm_4.7_traces_jupiter/snapshots/82252f3ec14c532dcb0a1154c26432b8bcd8b10e_thinking_preprocessed dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "achieved_tflops_per_gpu": 0.0037163499371083908,
3
- "achieved_tflops_per_gpu_theoretical": 1063.8020327264749,
4
  "epoch": 7.0,
5
  "loss_nan_ranks": 0,
6
- "loss_rank_avg": 0.22936706244945526,
7
- "mfu_percent": 0.00026263957152709475,
8
- "mfu_percent_theoretical": 75.18035566971554,
9
- "total_flos": 791537577689088.0,
10
- "train_loss": 0.17696809689205084,
11
- "train_runtime": 13311.7439,
12
- "train_samples_per_second": 5.027,
13
- "train_steps_per_second": 0.314,
14
- "valid_targets_mean": 3387.6,
15
- "valid_targets_min": 1204
16
  }
 
1
  {
2
+ "achieved_tflops_per_gpu": 0.0022911149705146998,
3
+ "achieved_tflops_per_gpu_theoretical": 423.488070352896,
4
  "epoch": 7.0,
5
  "loss_nan_ranks": 0,
6
+ "loss_rank_avg": 0.4637048840522766,
7
+ "mfu_percent": 0.0001619162523331943,
8
+ "mfu_percent_theoretical": 29.928485537307136,
9
+ "total_flos": 1105775565668352.0,
10
+ "train_loss": 0.48887504853286395,
11
+ "train_runtime": 30164.7773,
12
+ "train_samples_per_second": 1.994,
13
+ "train_steps_per_second": 0.125,
14
+ "valid_targets_mean": 7010.8,
15
+ "valid_targets_min": 805
16
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:085d2ef00968f85259608e7f4bae06bdebdeaad16dd78711045cf3f4fc02d385
3
  size 4902257696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aa7ce43de8f80cf45d174906fd9665c4cc3d4bd9f5710104a9b6a2b5d41de3f
3
  size 4902257696
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3eff0ab84d61efebaf58604a6690b456f54fa31aa78d055cddebba4e1c93f08
3
  size 4915960368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b794036ca219abc636ea3301feea6a4b17921859b3e4a2d3a4a7f9997d8f28f
3
  size 4915960368
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:024d9f4a23750d66eae0a6389c4c5cd29813ea39e76bd64565c0c0fb5e4aae25
3
  size 4983068496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8bcb76294a475a9ce0009b57282da54a052eaeba6643247f28bb83e130c7595
3
  size 4983068496
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:753be65a952a3db9de839d47e34638a8a385c51ebbe5c569552fbd2f9509ced8
3
  size 1580230264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dddd527fc9b9c187160b3974b35654b79c0785379ff59b3e8d14be862e217afd
3
  size 1580230264
run_summary.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "agent_name": "3ca27692cf3d7f3fa6ed3b83e00b3df43ad80fdc_thinking_preprocessed",
3
  "training_start": null,
4
  "training_end": null,
5
  "created_by": "raoof1",
6
  "base_model_name": "Qwen/Qwen3-8B",
7
- "dataset_name": "/e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--exp_rpt_codeelo-v2_10k_glm_4.7_traces_jupiter_upsampled_10k/snapshots/3ca27692cf3d7f3fa6ed3b83e00b3df43ad80fdc_thinking_preprocessed",
8
  "training_type": "SFT",
9
  "training_parameters": "https://huggingface.co/DCAgent/a1-codeelo/blob/main/config.json",
10
  "wandb_link": null,
 
1
  {
2
+ "agent_name": "82252f3ec14c532dcb0a1154c26432b8bcd8b10e_thinking_preprocessed",
3
  "training_start": null,
4
  "training_end": null,
5
  "created_by": "raoof1",
6
  "base_model_name": "Qwen/Qwen3-8B",
7
+ "dataset_name": "/e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--exp_rpt_codeelo-v2_10k_glm_4.7_traces_jupiter/snapshots/82252f3ec14c532dcb0a1154c26432b8bcd8b10e_thinking_preprocessed",
8
  "training_type": "SFT",
9
  "training_parameters": "https://huggingface.co/DCAgent/a1-codeelo/blob/main/config.json",
10
  "wandb_link": null,
train_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "achieved_tflops_per_gpu": 0.0037163499371083908,
3
- "achieved_tflops_per_gpu_theoretical": 1063.8020327264749,
4
  "epoch": 7.0,
5
  "loss_nan_ranks": 0,
6
- "loss_rank_avg": 0.22936706244945526,
7
- "mfu_percent": 0.00026263957152709475,
8
- "mfu_percent_theoretical": 75.18035566971554,
9
- "total_flos": 791537577689088.0,
10
- "train_loss": 0.17696809689205084,
11
- "train_runtime": 13311.7439,
12
- "train_samples_per_second": 5.027,
13
- "train_steps_per_second": 0.314,
14
- "valid_targets_mean": 3387.6,
15
- "valid_targets_min": 1204
16
  }
 
1
  {
2
+ "achieved_tflops_per_gpu": 0.0022911149705146998,
3
+ "achieved_tflops_per_gpu_theoretical": 423.488070352896,
4
  "epoch": 7.0,
5
  "loss_nan_ranks": 0,
6
+ "loss_rank_avg": 0.4637048840522766,
7
+ "mfu_percent": 0.0001619162523331943,
8
+ "mfu_percent_theoretical": 29.928485537307136,
9
+ "total_flos": 1105775565668352.0,
10
+ "train_loss": 0.48887504853286395,
11
+ "train_runtime": 30164.7773,
12
+ "train_samples_per_second": 1.994,
13
+ "train_steps_per_second": 0.125,
14
+ "valid_targets_mean": 7010.8,
15
+ "valid_targets_min": 805
16
  }
trainer_log.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
training_loss.png CHANGED