add stuff

Browse files

Files changed (9) hide show

.gitignore +1 -0
config.json +2 -2
latest +1 -0
long-t5-tglobal-base-16384-booksum-V7.9-ft1-booksum_training_metadata.json +1 -0
pytorch_model.bin +1 -1
tokenizer_config.json +1 -1
trainer_state.json +845 -179
training_args.bin +1 -1
zero_to_fp32.py +484 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ checkpoint-*/

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "pszemraj/long-t5-tglobal-base-16384-booksum-V6-partial",
   "architectures": [
     "LongT5ForConditionalGeneration"
   ],
@@ -36,7 +36,7 @@
   "relative_attention_num_buckets": 32,
   "repetition_penalty": 3.5,
   "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
   "transformers_version": "4.20.1",
   "use_cache": false,
   "vocab_size": 32128

 {
+  "_name_or_path": "pszemraj/long-t5-tglobal-base-16384-booksum-V7.9",
   "architectures": [
     "LongT5ForConditionalGeneration"
   ],
   "relative_attention_num_buckets": 32,
   "repetition_penalty": 3.5,
   "tie_word_embeddings": false,
+  "torch_dtype": "float32",
   "transformers_version": "4.20.1",
   "use_cache": false,
   "vocab_size": 32128

latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step330

long-t5-tglobal-base-16384-booksum-V7.9-ft1-booksum_training_metadata.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"output_dir": "/content/drive/MyDrive/Programming/hf-trainer/long-t5-tglobal-base-16384-booksum-V7.9-ft1-booksum", "overwrite_output_dir": true, "do_train": false, "do_eval": false, "do_predict": false, "evaluation_strategy": "no", "prediction_loss_only": false, "per_device_train_batch_size": 1, "per_device_eval_batch_size": 1, "per_gpu_train_batch_size": "None", "per_gpu_eval_batch_size": "None", "gradient_accumulation_steps": 64, "eval_accumulation_steps": "None", "eval_delay": 0, "learning_rate": 0.001, "weight_decay": 0.05, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 0.5, "num_train_epochs": 2, "max_steps": -1, "lr_scheduler_type": "cosine", "warmup_ratio": 0.01, "warmup_steps": 0, "log_level": -1, "log_level_replica": -1, "log_on_each_node": true, "logging_dir": "/content/drive/MyDrive/Programming/hf-trainer/long-t5-tglobal-base-16384-booksum-V7.9-ft1-booksum/logs", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 2, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 500, "save_total_limit": 1, "save_on_each_node": false, "no_cuda": false, "seed": 42, "data_seed": "None", "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": true, "fp16_opt_level": "O1", "half_precision_backend": "cuda_amp", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": "None", "local_rank": 0, "xpu_backend": "None", "tpu_num_cores": "None", "tpu_metrics_debug": false, "debug": "[]", "dataloader_drop_last": false, "eval_steps": "None", "dataloader_num_workers": 0, "past_index": -1, "run_name": "/content/drive/MyDrive/Programming/hf-trainer/long-t5-tglobal-base-16384-booksum-V7.9-ft1-booksum", "disable_tqdm": false, "remove_unused_columns": true, "label_names": "None", "load_best_model_at_end": false, "metric_for_best_model": "None", "greater_is_better": "None", "ignore_data_skip": false, "sharded_ddp": "[]", "fsdp": "[]", "fsdp_min_num_params": 0, "deepspeed": 
"/content/ds_config_zero2.json", "label_smoothing_factor": 0.0, "optim": "adamw_hf", "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": "['tensorboard']", "ddp_find_unused_parameters": "None", "ddp_bucket_cap_mb": "None", "dataloader_pin_memory": true, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": true, "resume_from_checkpoint": "None", "hub_model_id": "long-t5-tglobal-base-16384-booksum-V7.9-ft1-booksum", "hub_strategy": "end", "hub_token": "<HUB_TOKEN>", "hub_private_repo": true, "gradient_checkpointing": true, "include_inputs_for_metrics": false, "fp16_backend": "auto", "push_to_hub_model_id": "None", "push_to_hub_organization": "None", "push_to_hub_token": "<PUSH_TO_HUB_TOKEN>", "_n_gpu": 1, "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": "None", "ray_scope": "last", "sortish_sampler": false, "predict_with_generate": false, "generation_max_length": "None", "generation_num_beams": "None", "train_batch_size": 1, "eval_batch_size": 1, "configs_src": "long-t5-tglobal-base-16384-booksum-V7.9-ft1-booksum"}

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4adb405b88e1acd599988a723bb03f622f2bc5b7b441d9a480ed5b5ed75fa190
 size 990388907

 version https://git-lfs.github.com/spec/v1
+oid sha256:b05f4b28e354b9cc1c758956764bfd54d590226a1dfbe604856ded1dbafd148e
 size 990388907

tokenizer_config.json CHANGED Viewed

@@ -103,7 +103,7 @@
   ],
   "eos_token": "</s>",
   "extra_ids": 100,
-  "name_or_path": "pszemraj/long-t5-tglobal-base-16384-booksum-V6-partial",
   "pad_token": "<pad>",
   "special_tokens_map_file": null,
   "tokenizer_class": "T5Tokenizer",

   ],
   "eos_token": "</s>",
   "extra_ids": 100,
+  "name_or_path": "pszemraj/long-t5-tglobal-base-16384-booksum-V7.9",
   "pad_token": "<pad>",
   "special_tokens_map_file": null,
   "tokenizer_class": "T5Tokenizer",

trainer_state.json CHANGED Viewed

@@ -1,349 +1,1015 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.991123701605288,
-  "global_step": 164,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.04,
-      "learning_rate": 0.00030000000000000003,
-      "loss": 2.5934,
-      "step": 3
     },
     {
       "epoch": 0.07,
-      "learning_rate": 0.0003998458072481446,
-      "loss": 2.5887,
-      "step": 6
     },
     {
       "epoch": 0.11,
-      "learning_rate": 0.0003990369453344394,
-      "loss": 2.5383,
-      "step": 9
     },
     {
       "epoch": 0.15,
-      "learning_rate": 0.00039753766811902755,
-      "loss": 2.5382,
-      "step": 12
     },
     {
       "epoch": 0.18,
-      "learning_rate": 0.0003953531762641745,
-      "loss": 2.5684,
-      "step": 15
     },
     {
       "epoch": 0.22,
-      "learning_rate": 0.00039249104729072946,
-      "loss": 2.5694,
-      "step": 18
     },
     {
       "epoch": 0.25,
-      "learning_rate": 0.00038896120929337566,
-      "loss": 2.5685,
-      "step": 21
     },
     {
       "epoch": 0.29,
-      "learning_rate": 0.0003847759065022574,
-      "loss": 2.5249,
-      "step": 24
     },
     {
       "epoch": 0.33,
-      "learning_rate": 0.00037994965681044433,
-      "loss": 2.5835,
-      "step": 27
     },
     {
       "epoch": 0.36,
-      "learning_rate": 0.00037449920141455944,
-      "loss": 2.5326,
-      "step": 30
     },
     {
       "epoch": 0.4,
-      "learning_rate": 0.00036844344674325733,
-      "loss": 2.564,
-      "step": 33
     },
     {
       "epoch": 0.44,
-      "learning_rate": 0.0003618033988749895,
-      "loss": 2.5717,
-      "step": 36
     },
     {
       "epoch": 0.47,
-      "learning_rate": 0.0003546020906725474,
-      "loss": 2.5149,
-      "step": 39
     },
     {
       "epoch": 0.51,
-      "learning_rate": 0.0003468645018871371,
-      "loss": 2.5695,
-      "step": 42
     },
     {
       "epoch": 0.54,
-      "learning_rate": 0.0003386174725091272,
-      "loss": 2.5374,
-      "step": 45
     },
     {
       "epoch": 0.58,
-      "learning_rate": 0.0003298896096660367,
-      "loss": 2.5413,
-      "step": 48
     },
     {
       "epoch": 0.62,
-      "learning_rate": 0.0003207111883907143,
-      "loss": 2.509,
-      "step": 51
     },
     {
       "epoch": 0.65,
-      "learning_rate": 0.00031111404660392046,
-      "loss": 2.5628,
-      "step": 54
     },
     {
       "epoch": 0.69,
-      "learning_rate": 0.00030113147467559695,
-      "loss": 2.5069,
-      "step": 57
     },
     {
       "epoch": 0.73,
-      "learning_rate": 0.00029079809994790937,
-      "loss": 2.5331,
-      "step": 60
     },
     {
       "epoch": 0.76,
-      "learning_rate": 0.0002801497666206282,
-      "loss": 2.5527,
-      "step": 63
     },
     {
       "epoch": 0.8,
-      "learning_rate": 0.0002692234114154986,
-      "loss": 2.6179,
-      "step": 66
     },
     {
       "epoch": 0.83,
-      "learning_rate": 0.00025805693545089247,
-      "loss": 2.5411,
-      "step": 69
     },
     {
       "epoch": 0.87,
-      "learning_rate": 0.00024668907277118114,
-      "loss": 2.5583,
-      "step": 72
     },
     {
       "epoch": 0.91,
-      "learning_rate": 0.00023515925598687094,
-      "loss": 2.534,
-      "step": 75
     },
     {
       "epoch": 0.94,
-      "learning_rate": 0.00022350747949156756,
-      "loss": 2.5433,
-      "step": 78
     },
     {
       "epoch": 0.98,
-      "learning_rate": 0.0002117741607302378,
-      "loss": 2.5487,
-      "step": 81
     },
     {
       "epoch": 1.02,
-      "learning_rate": 0.0002,
-      "loss": 3.1324,
-      "step": 84
     },
     {
-      "epoch": 1.06,
-      "learning_rate": 0.00018822583926976218,
-      "loss": 2.4641,
-      "step": 87
     },
     {
       "epoch": 1.1,
-      "learning_rate": 0.00017649252050843252,
-      "loss": 2.4806,
-      "step": 90
     },
     {
       "epoch": 1.13,
-      "learning_rate": 0.0001648407440131291,
-      "loss": 2.4693,
-      "step": 93
     },
     {
-      "epoch": 1.17,
-      "learning_rate": 0.000153310927228819,
-      "loss": 2.5011,
-      "step": 96
     },
     {
       "epoch": 1.21,
-      "learning_rate": 0.00014194306454910757,
-      "loss": 2.4595,
-      "step": 99
     },
     {
       "epoch": 1.24,
-      "learning_rate": 0.00013077658858450138,
-      "loss": 2.4893,
-      "step": 102
     },
     {
       "epoch": 1.28,
-      "learning_rate": 0.00011985023337937184,
-      "loss": 2.4932,
-      "step": 105
     },
     {
       "epoch": 1.31,
-      "learning_rate": 0.00010920190005209065,
-      "loss": 2.4871,
-      "step": 108
     },
     {
-      "epoch": 1.35,
-      "learning_rate": 9.886852532440312e-05,
-      "loss": 2.4672,
-      "step": 111
     },
     {
       "epoch": 1.39,
-      "learning_rate": 8.888595339607961e-05,
-      "loss": 2.4597,
-      "step": 114
     },
     {
       "epoch": 1.42,
-      "learning_rate": 7.928881160928572e-05,
-      "loss": 2.4531,
-      "step": 117
     },
     {
-      "epoch": 1.46,
-      "learning_rate": 7.011039033396329e-05,
-      "loss": 2.4749,
-      "step": 120
     },
     {
       "epoch": 1.5,
-      "learning_rate": 6.138252749087286e-05,
-      "loss": 2.4551,
-      "step": 123
     },
     {
       "epoch": 1.53,
-      "learning_rate": 5.313549811286293e-05,
-      "loss": 2.4796,
-      "step": 126
     },
     {
       "epoch": 1.57,
-      "learning_rate": 4.53979093274526e-05,
-      "loss": 2.4705,
-      "step": 129
     },
     {
       "epoch": 1.6,
-      "learning_rate": 3.819660112501053e-05,
-      "loss": 2.4915,
-      "step": 132
     },
     {
-      "epoch": 1.64,
-      "learning_rate": 3.1556553256742713e-05,
-      "loss": 2.4958,
-      "step": 135
     },
     {
       "epoch": 1.68,
-      "learning_rate": 2.5500798585440567e-05,
-      "loss": 2.4814,
-      "step": 138
     },
     {
       "epoch": 1.71,
-      "learning_rate": 2.0050343189555743e-05,
-      "loss": 2.5034,
-      "step": 141
     },
     {
-      "epoch": 1.75,
-      "learning_rate": 1.5224093497742653e-05,
-      "loss": 2.4671,
-      "step": 144
     },
     {
       "epoch": 1.79,
-      "learning_rate": 1.1038790706624391e-05,
-      "loss": 2.4987,
-      "step": 147
     },
     {
       "epoch": 1.82,
-      "learning_rate": 7.508952709270567e-06,
-      "loss": 2.4618,
-      "step": 150
     },
     {
       "epoch": 1.86,
-      "learning_rate": 4.646823735825523e-06,
-      "loss": 2.4815,
-      "step": 153
     },
     {
       "epoch": 1.89,
-      "learning_rate": 2.462331880972468e-06,
-      "loss": 2.4595,
-      "step": 156
     },
     {
-      "epoch": 1.93,
-      "learning_rate": 9.630546655606364e-07,
-      "loss": 2.4925,
-      "step": 159
     },
     {
       "epoch": 1.97,
-      "learning_rate": 1.5419275185541982e-07,
-      "loss": 2.4333,
-      "step": 162
     },
     {
       "epoch": 1.99,
-      "step": 164,
-      "total_flos": 4.620604962546647e+17,
-      "train_loss": 2.525294606278582,
-      "train_runtime": 28715.0205,
-      "train_samples_per_second": 0.738,
-      "train_steps_per_second": 0.006
     }
   ],
-  "max_steps": 164,
   "num_train_epochs": 2,
-  "total_flos": 4.620604962546647e+17,
   "trial_name": null,
   "trial_params": null
 }

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.9971671388101981,
+  "global_step": 330,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
+    {
+      "epoch": 0.01,
+      "learning_rate": 0.0005,
+      "loss": 2.3487,
+      "step": 2
+    },
+    {
+      "epoch": 0.02,
+      "learning_rate": 0.001,
+      "loss": 2.3946,
+      "step": 4
+    },
     {
       "epoch": 0.04,
+      "learning_rate": 0.0009999071352056674,
+      "loss": 2.4059,
+      "step": 6
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 0.00099962857531815,
+      "loss": 2.4061,
+      "step": 8
+    },
+    {
+      "epoch": 0.06,
+      "learning_rate": 0.000999164423811074,
+      "loss": 2.3801,
+      "step": 10
     },
     {
       "epoch": 0.07,
+      "learning_rate": 0.0009985148530977765,
+      "loss": 2.4389,
+      "step": 12
+    },
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0009976801044672607,
+      "loss": 2.4007,
+      "step": 14
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 0.0009966604879945657,
+      "loss": 2.4691,
+      "step": 16
     },
     {
       "epoch": 0.11,
+      "learning_rate": 0.0009954563824255878,
+      "loss": 2.4015,
+      "step": 18
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 0.0009940682350363913,
+      "loss": 2.4415,
+      "step": 20
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.000992496561467063,
+      "loss": 2.477,
+      "step": 22
     },
     {
       "epoch": 0.15,
+      "learning_rate": 0.000990741945530174,
+      "loss": 2.4429,
+      "step": 24
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0009888050389939172,
+      "loss": 2.4429,
+      "step": 26
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.0009866865613400006,
+      "loss": 2.4597,
+      "step": 28
     },
     {
       "epoch": 0.18,
+      "learning_rate": 0.0009843872994963912,
+      "loss": 2.4501,
+      "step": 30
+    },
+    {
+      "epoch": 0.19,
+      "learning_rate": 0.0009819081075450014,
+      "loss": 2.4307,
+      "step": 32
+    },
+    {
+      "epoch": 0.21,
+      "learning_rate": 0.0009792499064044343,
+      "loss": 2.4182,
+      "step": 34
     },
     {
       "epoch": 0.22,
+      "learning_rate": 0.0009764136834878986,
+      "loss": 2.4354,
+      "step": 36
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 0.0009734004923364257,
+      "loss": 2.4323,
+      "step": 38
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 0.0009702114522275216,
+      "loss": 2.4592,
+      "step": 40
     },
     {
       "epoch": 0.25,
+      "learning_rate": 0.000966847747759402,
+      "loss": 2.4242,
+      "step": 42
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 0.0009633106284109611,
+      "loss": 2.4355,
+      "step": 44
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 0.0009596014080776422,
+      "loss": 2.4379,
+      "step": 46
     },
     {
       "epoch": 0.29,
+      "learning_rate": 0.0009557214645833791,
+      "loss": 2.3786,
+      "step": 48
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 0.0009516722391687902,
+      "loss": 2.4303,
+      "step": 50
+    },
+    {
+      "epoch": 0.31,
+      "learning_rate": 0.0009474552359558167,
+      "loss": 2.3946,
+      "step": 52
     },
     {
       "epoch": 0.33,
+      "learning_rate": 0.000943072021389003,
+      "loss": 2.4104,
+      "step": 54
+    },
+    {
+      "epoch": 0.34,
+      "learning_rate": 0.0009385242236536259,
+      "loss": 2.4266,
+      "step": 56
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 0.0009338135320708912,
+      "loss": 2.5106,
+      "step": 58
     },
     {
       "epoch": 0.36,
+      "learning_rate": 0.0009289416964704185,
+      "loss": 2.4225,
+      "step": 60
+    },
+    {
+      "epoch": 0.37,
+      "learning_rate": 0.0009239105265402525,
+      "loss": 2.4745,
+      "step": 62
+    },
+    {
+      "epoch": 0.39,
+      "learning_rate": 0.0009187218911546363,
+      "loss": 2.4572,
+      "step": 64
     },
     {
       "epoch": 0.4,
+      "learning_rate": 0.0009133777176798013,
+      "loss": 2.4366,
+      "step": 66
+    },
+    {
+      "epoch": 0.41,
+      "learning_rate": 0.0009078799912580304,
+      "loss": 2.4021,
+      "step": 68
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 0.0009022307540702576,
+      "loss": 2.4054,
+      "step": 70
     },
     {
       "epoch": 0.44,
+      "learning_rate": 0.0008964321045774807,
+      "loss": 2.4628,
+      "step": 72
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 0.0008904861967412702,
+      "loss": 2.5038,
+      "step": 74
+    },
+    {
+      "epoch": 0.46,
+      "learning_rate": 0.0008843952392236594,
+      "loss": 2.3801,
+      "step": 76
     },
     {
       "epoch": 0.47,
+      "learning_rate": 0.0008781614945667169,
+      "loss": 2.4056,
+      "step": 78
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 0.0008717872783521047,
+      "loss": 2.3334,
+      "step": 80
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 0.0008652749583409339,
+      "loss": 2.3913,
+      "step": 82
     },
     {
       "epoch": 0.51,
+      "learning_rate": 0.0008586269535942384,
+      "loss": 2.3784,
+      "step": 84
+    },
+    {
+      "epoch": 0.52,
+      "learning_rate": 0.0008518457335743926,
+      "loss": 2.4436,
+      "step": 86
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 0.0008449338172278058,
+      "loss": 2.3735,
+      "step": 88
     },
     {
       "epoch": 0.54,
+      "learning_rate": 0.0008378937720492384,
+      "loss": 2.374,
+      "step": 90
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 0.0008307282131280805,
+      "loss": 2.4064,
+      "step": 92
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 0.000823439802176954,
+      "loss": 2.4124,
+      "step": 94
     },
     {
       "epoch": 0.58,
+      "learning_rate": 0.0008160312465429952,
+      "loss": 2.4181,
+      "step": 96
+    },
+    {
+      "epoch": 0.59,
+      "learning_rate": 0.0008085052982021848,
+      "loss": 2.4253,
+      "step": 98
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 0.0008008647527371022,
+      "loss": 2.4678,
+      "step": 100
     },
     {
       "epoch": 0.62,
+      "learning_rate": 0.0007931124482984802,
+      "loss": 2.4738,
+      "step": 102
+    },
+    {
+      "epoch": 0.63,
+      "learning_rate": 0.0007852512645509479,
+      "loss": 2.3738,
+      "step": 104
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 0.0007772841216033533,
+      "loss": 2.4081,
+      "step": 106
     },
     {
       "epoch": 0.65,
+      "learning_rate": 0.0007692139789240611,
+      "loss": 2.3738,
+      "step": 108
+    },
+    {
+      "epoch": 0.66,
+      "learning_rate": 0.0007610438342416319,
+      "loss": 2.3701,
+      "step": 110
+    },
+    {
+      "epoch": 0.68,
+      "learning_rate": 0.0007527767224312882,
+      "loss": 2.4355,
+      "step": 112
     },
     {
       "epoch": 0.69,
+      "learning_rate": 0.000744415714387582,
+      "loss": 2.4036,
+      "step": 114
+    },
+    {
+      "epoch": 0.7,
+      "learning_rate": 0.0007359639158836828,
+      "loss": 2.3746,
+      "step": 116
+    },
+    {
+      "epoch": 0.71,
+      "learning_rate": 0.0007274244664177097,
+      "loss": 2.4855,
+      "step": 118
     },
     {
       "epoch": 0.73,
+      "learning_rate": 0.0007188005380465365,
+      "loss": 2.379,
+      "step": 120
+    },
+    {
+      "epoch": 0.74,
+      "learning_rate": 0.000710095334207501,
+      "loss": 2.4178,
+      "step": 122
+    },
+    {
+      "epoch": 0.75,
+      "learning_rate": 0.0007013120885284599,
+      "loss": 2.4561,
+      "step": 124
     },
     {
       "epoch": 0.76,
+      "learning_rate": 0.0006924540636266272,
+      "loss": 2.4024,
+      "step": 126
+    },
+    {
+      "epoch": 0.77,
+      "learning_rate": 0.000683524549896646,
+      "loss": 2.4172,
+      "step": 128
+    },
+    {
+      "epoch": 0.79,
+      "learning_rate": 0.0006745268642883404,
+      "loss": 2.3858,
+      "step": 130
     },
     {
       "epoch": 0.8,
+      "learning_rate": 0.0006654643490746042,
+      "loss": 2.3547,
+      "step": 132
+    },
+    {
+      "epoch": 0.81,
+      "learning_rate": 0.0006563403706098833,
+      "loss": 2.4372,
+      "step": 134
+    },
+    {
+      "epoch": 0.82,
+      "learning_rate": 0.0006471583180797121,
+      "loss": 2.3785,
+      "step": 136
     },
     {
       "epoch": 0.83,
+      "learning_rate": 0.0006379216022417695,
+      "loss": 2.3815,
+      "step": 138
+    },
+    {
+      "epoch": 0.85,
+      "learning_rate": 0.0006286336541589224,
+      "loss": 2.4209,
+      "step": 140
+    },
+    {
+      "epoch": 0.86,
+      "learning_rate": 0.0006192979239247243,
+      "loss": 2.3962,
+      "step": 142
     },
     {
       "epoch": 0.87,
+      "learning_rate": 0.0006099178793818478,
+      "loss": 2.3626,
+      "step": 144
+    },
+    {
+      "epoch": 0.88,
+      "learning_rate": 0.0006004970048339225,
+      "loss": 2.3991,
+      "step": 146
+    },
+    {
+      "epoch": 0.89,
+      "learning_rate": 0.0005910387997512573,
+      "loss": 2.4396,
+      "step": 148
     },
     {
       "epoch": 0.91,
+      "learning_rate": 0.0005815467774709313,
+      "loss": 2.3816,
+      "step": 150
+    },
+    {
+      "epoch": 0.92,
+      "learning_rate": 0.0005720244638917323,
+      "loss": 2.3866,
+      "step": 152
+    },
+    {
+      "epoch": 0.93,
+      "learning_rate": 0.0005624753961644281,
+      "loss": 2.4035,
+      "step": 154
     },
     {
       "epoch": 0.94,
+      "learning_rate": 0.0005529031213778615,
+      "loss": 2.4063,
+      "step": 156
+    },
+    {
+      "epoch": 0.95,
+      "learning_rate": 0.0005433111952413496,
+      "loss": 2.3944,
+      "step": 158
+    },
+    {
+      "epoch": 0.97,
+      "learning_rate": 0.0005337031807638841,
+      "loss": 2.4192,
+      "step": 160
     },
     {
       "epoch": 0.98,
+      "learning_rate": 0.0005240826469306187,
+      "loss": 2.3603,
+      "step": 162
+    },
+    {
+      "epoch": 0.99,
+      "learning_rate": 0.0005144531673771364,
+      "loss": 2.4041,
+      "step": 164
+    },
+    {
+      "epoch": 1.01,
+      "learning_rate": 0.0005048183190619903,
+      "loss": 2.8813,
+      "step": 166
     },
     {
       "epoch": 1.02,
+      "learning_rate": 0.0004951816809380097,
+      "loss": 2.2786,
+      "step": 168
+    },
+    {
+      "epoch": 1.03,
+      "learning_rate": 0.0004855468326228638,
+      "loss": 2.2886,
+      "step": 170
     },
     {
+      "epoch": 1.04,
+      "learning_rate": 0.00047591735306938137,
+      "loss": 2.1822,
+      "step": 172
+    },
+    {
+      "epoch": 1.05,
+      "learning_rate": 0.00046629681923611606,
+      "loss": 2.2589,
+      "step": 174
+    },
+    {
+      "epoch": 1.07,
+      "learning_rate": 0.0004566888047586507,
+      "loss": 2.2625,
+      "step": 176
+    },
+    {
+      "epoch": 1.08,
+      "learning_rate": 0.00044709687862213866,
+      "loss": 2.2715,
+      "step": 178
+    },
+    {
+      "epoch": 1.09,
+      "learning_rate": 0.000437524603835572,
+      "loss": 2.1988,
+      "step": 180
     },
     {
       "epoch": 1.1,
+      "learning_rate": 0.000427975536108268,
+      "loss": 2.3257,
+      "step": 182
+    },
+    {
+      "epoch": 1.11,
+      "learning_rate": 0.00041845322252906863,
+      "loss": 2.3026,
+      "step": 184
     },
     {
       "epoch": 1.13,
+      "learning_rate": 0.00040896120024874283,
+      "loss": 2.2306,
+      "step": 186
     },
     {
+      "epoch": 1.14,
+      "learning_rate": 0.0003995029951660776,
+      "loss": 2.2269,
+      "step": 188
+    },
+    {
+      "epoch": 1.15,
+      "learning_rate": 0.00039008212061815206,
+      "loss": 2.3079,
+      "step": 190
+    },
+    {
+      "epoch": 1.16,
+      "learning_rate": 0.00038070207607527587,
+      "loss": 2.218,
+      "step": 192
+    },
+    {
+      "epoch": 1.18,
+      "learning_rate": 0.00037136634584107787,
+      "loss": 2.2667,
+      "step": 194
+    },
+    {
+      "epoch": 1.19,
+      "learning_rate": 0.0003620783977582305,
+      "loss": 2.2754,
+      "step": 196
+    },
+    {
+      "epoch": 1.2,
+      "learning_rate": 0.0003528416819202881,
+      "loss": 2.2835,
+      "step": 198
     },
     {
       "epoch": 1.21,
+      "learning_rate": 0.00034365962939011697,
+      "loss": 2.2843,
+      "step": 200
+    },
+    {
+      "epoch": 1.22,
+      "learning_rate": 0.00033453565092539584,
+      "loss": 2.2387,
+      "step": 202
     },
     {
       "epoch": 1.24,
+      "learning_rate": 0.0003254731357116597,
+      "loss": 2.254,
+      "step": 204
+    },
+    {
+      "epoch": 1.25,
+      "learning_rate": 0.000316475450103354,
+      "loss": 2.2686,
+      "step": 206
+    },
+    {
+      "epoch": 1.26,
+      "learning_rate": 0.00030754593637337277,
+      "loss": 2.2422,
+      "step": 208
+    },
+    {
+      "epoch": 1.27,
+      "learning_rate": 0.0002986879114715403,
+      "loss": 2.3003,
+      "step": 210
     },
     {
       "epoch": 1.28,
+      "learning_rate": 0.0002899046657924992,
+      "loss": 2.2619,
+      "step": 212
+    },
+    {
+      "epoch": 1.3,
+      "learning_rate": 0.00028119946195346375,
+      "loss": 2.3022,
+      "step": 214
     },
     {
       "epoch": 1.31,
+      "learning_rate": 0.00027257553358229033,
+      "loss": 2.2523,
+      "step": 216
+    },
+    {
+      "epoch": 1.32,
+      "learning_rate": 0.0002640360841163174,
+      "loss": 2.3098,
+      "step": 218
     },
     {
+      "epoch": 1.33,
+      "learning_rate": 0.0002555842856124182,
+      "loss": 2.235,
+      "step": 220
+    },
+    {
+      "epoch": 1.34,
+      "learning_rate": 0.00024722327756871186,
+      "loss": 2.2448,
+      "step": 222
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 0.0002389561657583681,
+      "loss": 2.2411,
+      "step": 224
+    },
+    {
+      "epoch": 1.37,
+      "learning_rate": 0.00023078602107593898,
+      "loss": 2.2485,
+      "step": 226
+    },
+    {
+      "epoch": 1.38,
+      "learning_rate": 0.0002227158783966467,
+      "loss": 2.2261,
+      "step": 228
     },
     {
       "epoch": 1.39,
+      "learning_rate": 0.00021474873544905204,
+      "loss": 2.2427,
+      "step": 230
+    },
+    {
+      "epoch": 1.4,
+      "learning_rate": 0.00020688755170151997,
+      "loss": 2.2961,
+      "step": 232
     },
     {
       "epoch": 1.42,
+      "learning_rate": 0.00019913524726289784,
+      "loss": 2.2272,
+      "step": 234
     },
     {
+      "epoch": 1.43,
+      "learning_rate": 0.00019149470179781532,
+      "loss": 2.2368,
+      "step": 236
+    },
+    {
+      "epoch": 1.44,
+      "learning_rate": 0.00018396875345700497,
+      "loss": 2.2846,
+      "step": 238
+    },
+    {
+      "epoch": 1.45,
+      "learning_rate": 0.000176560197823046,
+      "loss": 2.1709,
+      "step": 240
+    },
+    {
+      "epoch": 1.47,
+      "learning_rate": 0.0001692717868719195,
+      "loss": 2.2659,
+      "step": 242
+    },
+    {
+      "epoch": 1.48,
+      "learning_rate": 0.0001621062279507617,
+      "loss": 2.2655,
+      "step": 244
+    },
+    {
+      "epoch": 1.49,
+      "learning_rate": 0.0001550661827721941,
+      "loss": 2.2284,
+      "step": 246
     },
     {
       "epoch": 1.5,
+      "learning_rate": 0.00014815426642560752,
+      "loss": 2.2444,
+      "step": 248
+    },
+    {
+      "epoch": 1.51,
+      "learning_rate": 0.0001413730464057616,
+      "loss": 2.3102,
+      "step": 250
     },
     {
       "epoch": 1.53,
+      "learning_rate": 0.00013472504165906613,
+      "loss": 2.2287,
+      "step": 252
+    },
+    {
+      "epoch": 1.54,
+      "learning_rate": 0.00012821272164789544,
+      "loss": 2.2713,
+      "step": 254
+    },
+    {
+      "epoch": 1.55,
+      "learning_rate": 0.00012183850543328313,
+      "loss": 2.2127,
+      "step": 256
+    },
+    {
+      "epoch": 1.56,
+      "learning_rate": 0.00011560476077634069,
+      "loss": 2.1682,
+      "step": 258
     },
     {
       "epoch": 1.57,
+      "learning_rate": 0.00010951380325872979,
+      "loss": 2.2393,
+      "step": 260
+    },
+    {
+      "epoch": 1.59,
+      "learning_rate": 0.00010356789542251938,
+      "loss": 2.2259,
+      "step": 262
     },
     {
       "epoch": 1.6,
+      "learning_rate": 9.776924592974257e-05,
+      "loss": 2.2157,
+      "step": 264
     },
     {
+      "epoch": 1.61,
+      "learning_rate": 9.212000874196952e-05,
+      "loss": 2.2393,
+      "step": 266
+    },
+    {
+      "epoch": 1.62,
+      "learning_rate": 8.662228232019875e-05,
+      "loss": 2.2613,
+      "step": 268
+    },
+    {
+      "epoch": 1.63,
+      "learning_rate": 8.127810884536401e-05,
+      "loss": 2.1981,
+      "step": 270
+    },
+    {
+      "epoch": 1.65,
+      "learning_rate": 7.60894734597476e-05,
+      "loss": 2.2457,
+      "step": 272
+    },
+    {
+      "epoch": 1.66,
+      "learning_rate": 7.105830352958143e-05,
+      "loss": 2.2571,
+      "step": 274
+    },
+    {
+      "epoch": 1.67,
+      "learning_rate": 6.618646792910893e-05,
+      "loss": 2.1771,
+      "step": 276
     },
     {
       "epoch": 1.68,
+      "learning_rate": 6.147577634637414e-05,
+      "loss": 2.2243,
+      "step": 278
+    },
+    {
+      "epoch": 1.69,
+      "learning_rate": 5.692797861099719e-05,
+      "loss": 2.2427,
+      "step": 280
     },
     {
       "epoch": 1.71,
+      "learning_rate": 5.25447640441834e-05,
+      "loss": 2.2266,
+      "step": 282
     },
     {
+      "epoch": 1.72,
+      "learning_rate": 4.832776083120982e-05,
+      "loss": 2.3057,
+      "step": 284
+    },
+    {
+      "epoch": 1.73,
+      "learning_rate": 4.4278535416620916e-05,
+      "loss": 2.2225,
+      "step": 286
+    },
+    {
+      "epoch": 1.74,
+      "learning_rate": 4.039859192235778e-05,
+      "loss": 2.2665,
+      "step": 288
+    },
+    {
+      "epoch": 1.76,
+      "learning_rate": 3.668937158903901e-05,
+      "loss": 2.2807,
+      "step": 290
+    },
+    {
+      "epoch": 1.77,
+      "learning_rate": 3.315225224059809e-05,
+      "loss": 2.2165,
+      "step": 292
+    },
+    {
+      "epoch": 1.78,
+      "learning_rate": 2.9788547772478415e-05,
+      "loss": 2.2651,
+      "step": 294
     },
     {
       "epoch": 1.79,
+      "learning_rate": 2.6599507663574384e-05,
+      "loss": 2.2437,
+      "step": 296
+    },
+    {
+      "epoch": 1.8,
+      "learning_rate": 2.3586316512101414e-05,
+      "loss": 2.3066,
+      "step": 298
     },
     {
       "epoch": 1.82,
+      "learning_rate": 2.0750093595565732e-05,
+      "loss": 2.1727,
+      "step": 300
+    },
+    {
+      "epoch": 1.83,
+      "learning_rate": 1.8091892454998595e-05,
+      "loss": 2.2409,
+      "step": 302
+    },
+    {
+      "epoch": 1.84,
+      "learning_rate": 1.561270050360897e-05,
+      "loss": 2.2908,
+      "step": 304
+    },
+    {
+      "epoch": 1.85,
+      "learning_rate": 1.33134386599994e-05,
+      "loss": 2.2925,
+      "step": 306
     },
     {
       "epoch": 1.86,
+      "learning_rate": 1.1194961006082971e-05,
+      "loss": 2.2449,
+      "step": 308
+    },
+    {
+      "epoch": 1.88,
+      "learning_rate": 9.258054469825972e-06,
+      "loss": 2.235,
+      "step": 310
     },
     {
       "epoch": 1.89,
+      "learning_rate": 7.503438532937168e-06,
+      "loss": 2.2216,
+      "step": 312
     },
     {
+      "epoch": 1.9,
+      "learning_rate": 5.931764963608866e-06,
+      "loss": 2.2884,
+      "step": 314
+    },
+    {
+      "epoch": 1.91,
+      "learning_rate": 4.5436175744121845e-06,
+      "loss": 2.2124,
+      "step": 316
+    },
+    {
+      "epoch": 1.92,
+      "learning_rate": 3.3395120054343087e-06,
+      "loss": 2.2418,
+      "step": 318
+    },
+    {
+      "epoch": 1.94,
+      "learning_rate": 2.319895532739369e-06,
+      "loss": 2.2855,
+      "step": 320
+    },
+    {
+      "epoch": 1.95,
+      "learning_rate": 1.4851469022234e-06,
+      "loss": 2.2974,
+      "step": 322
+    },
+    {
+      "epoch": 1.96,
+      "learning_rate": 8.35576188926046e-07,
+      "loss": 2.2552,
+      "step": 324
     },
     {
       "epoch": 1.97,
+      "learning_rate": 3.71424681850141e-07,
+      "loss": 2.2209,
+      "step": 326
     },
     {
       "epoch": 1.99,
+      "learning_rate": 9.286479433257e-08,
+      "loss": 2.1935,
+      "step": 328
+    },
+    {
+      "epoch": 2.0,
+      "learning_rate": 0.0,
+      "loss": 2.2702,
+      "step": 330
+    },
+    {
+      "epoch": 2.0,
+      "step": 330,
+      "total_flos": 4.634629374287544e+17,
+      "train_loss": 2.336302039117524,
+      "train_runtime": 79791.9217,
+      "train_samples_per_second": 0.265,
+      "train_steps_per_second": 0.004
     }
   ],
+  "max_steps": 330,
   "num_train_epochs": 2,
+  "total_flos": 4.634629374287544e+17,
   "trial_name": null,
   "trial_params": null
 }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:673e02f4e38479f806c1a20434ea2efa4f9cf7a6aa07067907fde9dc0160405c
 size 4527

 version https://git-lfs.github.com/spec/v1
+oid sha256:4526ccf2486e6fb3048af4d26eb6228cf640199b02d5c9ab46e06e3bf549ec3a
 size 4527

zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,484 @@

+#!/usr/bin/env python
+# This script extracts consolidated fp32 weights from ZeRO stage 2 and stage 3 DeepSpeed checkpoints.
+# It gets copied into the top-level checkpoint dir, so the user can easily do the conversion at any
+# point in the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+import deepspeed
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION,
+                                            OPTIMIZER_STATE_DICT,
+                                            PARAM_SHAPES,
+                                            SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS,
+                                            ZERO_STAGE,
+                                            PARTITION_COUNT,
+                                            PARAM_SHAPES,
+                                            BUFFER_NAMES)
+debug = 0
+# load to cpu
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage == 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_optim_files(checkpoint_dir):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    optim_files = sorted(glob.glob(os.path.join(checkpoint_dir,
+                                                "*_optim_states.pt")),
+                         key=natural_keys)
+    if len(optim_files) == 0:
+        raise FileNotFoundError(
+            f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")
+    return optim_files
+def parse_model_state(file):
+    state_dict = torch.load(file, map_location=device)
+    if BUFFER_NAMES not in state_dict:
+        raise ValueError(f"{file} is not a model state checkpoint")
+    buffer_names = state_dict[BUFFER_NAMES]
+    if debug:
+        print("Found buffers:", buffer_names)
+    # recover just the buffers while restoring them to fp32 if they were saved in fp16
+    buffers = {
+        k: v.float()
+        for k,
+        v in state_dict["module"].items() if k in buffer_names
+    }
+    param_shapes = state_dict[PARAM_SHAPES]
+    ds_version = state_dict.get(DS_VERSION, None)
+    return buffers, param_shapes, ds_version
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dicts.append(torch.load(f, map_location=device))
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage == 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage == 2:
+        fp32_flat_groups = [
+            state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key]
+            for i in range(len(state_dicts))
+        ]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key],
+                      0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(
+        f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
+    buffers, param_shapes, ds_version = parse_model_state(model_file)
+    print(f'Parsing checkpoint created by deepspeed=={ds_version}')
+    if zero_stage == 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size,
+                                                          param_shapes,
+                                                          fp32_flat_groups,
+                                                          buffers)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size,
+                                                          param_shapes,
+                                                          fp32_flat_groups,
+                                                          buffers)
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
+                                               param_shapes,
+                                               fp32_flat_groups,
+                                               buffers):
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(
+                    f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum([
+        full_single_fp32_vector.numel()
+        for full_single_fp32_vector in merged_single_partition_of_fp32_groups
+    ])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum(
+            [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    state_dict = OrderedDict()
+    # buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            unpartitioned_numel = shape.numel()
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(
+                    f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
+                )
+            state_dict[name] = full_single_fp32_vector.narrow(
+                0,
+                offset,
+                unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(
+                f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(
+        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
+    )
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
+                                               param_shapes,
+                                               fp32_flat_groups,
+                                               buffers):
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    state_dict = OrderedDict()
+    # buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0,
+                                             offset,
+                                             partitioned_numel)
+                  for i in range(world_size)),
+            0).narrow(0,
+                      0,
+                      unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(
+            f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(
+        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
+    )
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "checkpoint_dir",
+        type=str,
+        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help=
+        "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
+    )
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)