yhavinga committed
Commit c93d73c (1 parent: 312445c)

Update model

README.md CHANGED
@@ -1 +1 @@
- Logs at https://wandb.ai/yepster/long-t5-tglobal-small/runs/1s5jeq5q?workspace=user-yepster
+ Logs at https://wandb.ai/yepster/long-t5-tglobal-small/runs/2wiy76y6?workspace=user-yepster
eval_results.json CHANGED
@@ -1,4 +1,4 @@
  {
-   "eval_accuracy": 0.6216245889663696,
-   "eval_loss": 1.904692530632019
+   "eval_accuracy": 0.6504417657852173,
+   "eval_loss": 1.9302031993865967
  }
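
Between the two runs, eval_accuracy rises from 0.6216 to 0.6504 while eval_loss moves from 1.9047 to 1.9302. Assuming these metrics follow the Hugging Face Flax T5 MLM examples (per-token cross-entropy and greedy-argmax accuracy over the denoising targets), a minimal sketch; the label_mask padding handling is an assumption, not read from the training script:

    import jax.numpy as jnp
    import optax

    def eval_step_metrics(logits, labels, label_mask):
        # logits: [batch, seq, vocab]; labels: [batch, seq] integer targets;
        # label_mask: 1.0 where a real target token exists, 0.0 for padding.
        ce = optax.softmax_cross_entropy_with_integer_labels(logits, labels)
        denom = label_mask.sum()
        loss = (ce * label_mask).sum() / denom
        # "eval_accuracy": fraction of target tokens whose argmax prediction
        # matches the label exactly.
        hits = (jnp.argmax(logits, axis=-1) == labels) * label_mask
        return {"eval_loss": loss, "eval_accuracy": hits.sum() / denom}
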
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:95364c438eca009d45f05cf04a79d78d949c17c6855db8545983940388f205aa
+ oid sha256:2d22d88f2f0bee62ab83ce809dd14295a22da03baab455e4ae5797bc693f9ec8
  size 307750439
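
Only the Git LFS pointer changes here: the SHA-256 oid is new while the size, 307750439 bytes, is unchanged, consistent with the same parameter shapes holding updated weight values. A downloaded copy can be checked against the pointer's oid; a minimal Python sketch, with the path and digest taken from the pointer above:

    import hashlib

    expected_oid = "2d22d88f2f0bee62ab83ce809dd14295a22da03baab455e4ae5797bc693f9ec8"

    h = hashlib.sha256()
    with open("flax_model.msgpack", "rb") as f:
        # Hash in 1 MiB chunks so the ~300 MB file is not read into memory at once.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)

    assert h.hexdigest() == expected_oid, "LFS object does not match pointer oid"
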
run_longt5-tglobal-small-mc4.sh CHANGED
@@ -26,32 +26,33 @@ python ../train/run_t5_mlm_flax_pmap.py \
  --dataset_name="${DATASET}" \
  --dataset_config_name="${DATASET_CONFIG}" \
  --max_seq_length="1024" \
- --per_device_train_batch_size="32" \
- --per_device_eval_batch_size="32" \
- --gradient_accumulation_steps="4" \
+ --per_device_train_batch_size="64" \
+ --per_device_eval_batch_size="64" \
+ --gradient_accumulation_steps="1" \
  --mean_noise_span_length="3" \
- --dtype="float32" \
+ --gradient_checkpointing="false" \
+ --dtype="bfloat16" \
+ --z_loss="1e-4" \
  --optim="adafactor" \
  --learning_rate="0.005" \
- --lr_decay="linear" \
+ --lr_scheduler_type="linear" \
  --overwrite_output_dir \
- --num_train_epochs="4" \
- --logging_steps="20" \
- --save_steps="1000" \
- --eval_steps="1000" \
- --warmup_steps="300" \
- --validation_split_count="15000" \
+ --num_train_epochs="6" \
+ --logging_steps="80" \
+ --save_steps="4000" \
+ --eval_steps="4000" \
+ --warmup_steps="3000" \
  --wandb_project="long-t5-tglobal-small" \
  --wandb_job_type="pmap"


  # --max_train_samples="160000" \
  # --max_eval_samples="1000"
+ # --resume_from_checkpoint="${MODEL_PATH}" \

  # --model_name_or_path="${MODEL_PATH}" \

  # \
- # --resume_from_checkpoint="${MODEL_PATH}"

  # --lr_decay="exponential" \
  # --lr_transition_steps="400000" \
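
The new flags double the per-device batch size while dropping gradient accumulation, switch compute to bfloat16, add a small z-loss regularizer, rename --lr_decay to --lr_scheduler_type, and stretch the logging, save, eval, and warmup intervals for a six-epoch run. A sketch of what the schedule and z-loss plausibly correspond to in optax/JAX terms; the total step count and the loss-function shape are assumptions, not read from run_t5_mlm_flax_pmap.py:

    import jax
    import jax.numpy as jnp
    import optax

    # Linear warmup to 0.005 over 3000 steps, then linear decay to zero,
    # matching --learning_rate, --warmup_steps and --lr_scheduler_type="linear".
    total_steps = 84_000  # illustrative; the real horizon depends on dataset size
    lr_schedule = optax.join_schedules(
        schedules=[
            optax.linear_schedule(0.0, 0.005, transition_steps=3_000),
            optax.linear_schedule(0.005, 0.0, transition_steps=total_steps - 3_000),
        ],
        boundaries=[3_000],
    )
    optimizer = optax.adafactor(learning_rate=lr_schedule)  # --optim="adafactor"

    def cross_entropy_with_z_loss(logits, labels, z_loss=1e-4):
        # T5X-style auxiliary term: penalizing log(Z)^2 keeps the softmax
        # normalizer close to 1, which helps stabilize low-precision training
        # (cf. --z_loss="1e-4" alongside --dtype="bfloat16").
        log_z = jax.scipy.special.logsumexp(logits, axis=-1)
        ce = optax.softmax_cross_entropy_with_integer_labels(logits, labels)
        return jnp.mean(ce + z_loss * jnp.square(log_z))
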
training_state.json CHANGED
@@ -1 +1 @@
- {"step": 112001}
+ {"step": 84001}