amazingvince committed on
Commit 1d6d730 · verified · 1 Parent(s): a1481e0

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb filter=lfs diff=lfs merge=lfs -text
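The new rule keeps the raw W&B run file in Git LFS, so a plain clone only fetches a pointer. A minimal sketch of pulling a single LFS-backed file with huggingface_hub (the `repo_id` is a placeholder, not taken from this page; substitute the actual repo):

```python
from huggingface_hub import hf_hub_download

# repo_id is hypothetical; use this model repo's actual id
path = hf_hub_download(
    repo_id="amazingvince/<this-repo>",
    filename="checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb",
)
print(path)
```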
checkpoints/.hydra/config.yaml ADDED
@@ -0,0 +1,50 @@
+ mode: pt
+ device: gpu
+ precision: bf16
+ eval_only: false
+ predict_only: false
+ seed: 2137
+ tokenizer:
+   name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+ model:
+   klass: custom_seq2seq
+   name: google/t5-v1_1-base
+   overwrite: null
+   add_config: null
+   checkpoint_path: ''
+   random_init: true
+   compile: true
+ data:
+   input_length: 1024
+   mlm_probability: 0.15
+   mean_noise_span_length: 3.0
+   num_workers: 8
+ optim:
+   name: adamwscale
+   base_lr: 0.02
+   batch_size: 64
+   total_steps: 65536
+   epochs: -1
+   warmup_steps: 10000
+   lr_scheduler: cosine
+   weight_decay: 0.001
+   grad_clip: 1.0
+   grad_acc: 4
+   final_cosine: 1.0e-05
+ eval:
+   every_steps: 100000
+   steps: 500
+ checkpoint:
+   every_steps: 5000
+ logging:
+   every_steps: 100
+   grad_l2: true
+   weights_l2: true
+   use_wandb: true
+   wandb_config:
+     project: nano-custom-seq2seq
+     entity: amazingvince
+     tags:
+     - nanoT5
+     - my_tag
+     mode: online
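This is the resolved task config for the run. A minimal sketch of re-composing it with Hydra's compose API, assuming a checkout of nanoT5 with its configs at nanoT5/configs (the path and `config_name: default` are recorded in hydra.yaml below):

```python
from hydra import compose, initialize

# version_base "1.1" matches hydra.runtime.version_base recorded below
with initialize(version_base="1.1", config_path="nanoT5/configs"):
    cfg = compose(config_name="default")         # config_name from hydra.yaml
print(cfg.optim.base_lr, cfg.optim.total_steps)  # 0.02 65536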
checkpoints/.hydra/hydra.yaml ADDED
@@ -0,0 +1,156 @@
+ hydra:
+   run:
+     dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+   sweep:
+     dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+     subdir: ${hydra.job.num}
+   launcher:
+     _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+   sweeper:
+     _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+     max_batch_size: null
+     params: null
+   help:
+     app_name: ${hydra.job.name}
+     header: '${hydra.help.app_name} is powered by Hydra.
+
+       '
+     footer: 'Powered by Hydra (https://hydra.cc)
+
+       Use --hydra-help to view Hydra specific help
+
+       '
+     template: '${hydra.help.header}
+
+       == Configuration groups ==
+
+       Compose your configuration from those groups (group=option)
+
+
+       $APP_CONFIG_GROUPS
+
+
+       == Config ==
+
+       Override anything in the config (foo.bar=value)
+
+
+       $CONFIG
+
+
+       ${hydra.help.footer}
+
+       '
+   hydra_help:
+     template: 'Hydra (${hydra.runtime.version})
+
+       See https://hydra.cc for more info.
+
+
+       == Flags ==
+
+       $FLAGS_HELP
+
+
+       == Configuration groups ==
+
+       Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+       to command line)
+
+
+       $HYDRA_CONFIG_GROUPS
+
+       Use ''--cfg hydra'' to Show the Hydra config.
+
+       '
+     hydra_help: ???
+   hydra_logging:
+     version: 1
+     formatters:
+       simple:
+         format: '[%(asctime)s][HYDRA] %(message)s'
+     handlers:
+       console:
+         class: logging.StreamHandler
+         formatter: simple
+         stream: ext://sys.stdout
+     root:
+       level: INFO
+       handlers:
+       - console
+     loggers:
+       logging_example:
+         level: DEBUG
+     disable_existing_loggers: false
+   job_logging:
+     version: 1
+     formatters:
+       simple:
+         format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+     handlers:
+       console:
+         class: logging.StreamHandler
+         formatter: simple
+         stream: ext://sys.stdout
+       file:
+         class: logging.FileHandler
+         formatter: simple
+         filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+     root:
+       level: INFO
+       handlers:
+       - console
+       - file
+     disable_existing_loggers: false
+   env: {}
+   mode: RUN
+   searchpath: []
+   callbacks: {}
+   output_subdir: .hydra
+   overrides:
+     hydra:
+     - hydra.mode=RUN
+     task: []
+   job:
+     name: main
+     chdir: true
+     override_dirname: ''
+     id: ???
+     num: ???
+     config_name: default
+     env_set: {}
+     env_copy: []
+     config:
+       override_dirname:
+         kv_sep: '='
+         item_sep: ','
+         exclude_keys: []
+   runtime:
+     version: 1.3.2
+     version_base: '1.1'
+     cwd: /workspace/nanoT5
+     config_sources:
+     - path: hydra.conf
+       schema: pkg
+       provider: hydra
+     - path: /workspace/nanoT5/nanoT5/configs
+       schema: file
+       provider: main
+     - path: ''
+       schema: structured
+       provider: schema
+     output_dir: /workspace/nanoT5/logs/2024-09-02/17-03-02
+     choices:
+       local_env: default
+       task: pt
+       hydra/env: default
+       hydra/callbacks: null
+       hydra/job_logging: default
+       hydra/hydra_logging: default
+       hydra/hydra_help: default
+       hydra/help: default
+       hydra/sweeper: basic
+       hydra/launcher: basic
+       hydra/output: default
+   verbose: false
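The `${now:...}` resolver in `hydra.run.dir` uses strftime patterns, and the recorded `runtime.output_dir` shows how it expanded at launch time. A quick sketch of the same expansion in plain Python:

```python
from datetime import datetime

# launch time taken from runtime.output_dir above
ts = datetime(2024, 9, 2, 17, 3, 2)
print(f"./logs/{ts:%Y-%m-%d}/{ts:%H-%M-%S}")  # ./logs/2024-09-02/17-03-02
```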
checkpoints/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
+ []
checkpoints/checkpoint-pt-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de209ea75162ba234b5a0ac46f2434ad29c106e4770c8c587eba8ac390f7fede
+ size 2692370584
checkpoints/checkpoint-pt-10000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50b14a8e8e4cb1f87530bb13452da585006a1a54e1fa02069afa73d0775f0736
+ size 14344
checkpoints/checkpoint-pt-5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7134d1e0be9b58fd378c0b681679ae41855daa9cbb0dc80b24e826ef59861ce
+ size 2692370584
checkpoints/checkpoint-pt-5000/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50b14a8e8e4cb1f87530bb13452da585006a1a54e1fa02069afa73d0775f0736
+ size 14344
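The four files above are Git LFS pointers: each ~2.7 GB model.safetensors shard lives in LFS storage and only its hash and size are versioned here. A minimal loading sketch once the real files are fetched; the re-tying step and its key names are assumptions based on the accelerate warning in main.log below:

```python
from safetensors.torch import load_file

# assumes the LFS content has been pulled locally
state = load_file("checkpoints/checkpoint-pt-10000/model.safetensors")
print(len(state), "tensors")
# lm_head.weight was removed at save time as a shared tensor (see the
# accelerate warning in main.log), so it must be re-tied to the input
# embedding after loading, e.g. (key names are an assumption):
# state["lm_head.weight"] = state["shared.weight"]
```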
checkpoints/config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "attention_probs_dropout_prob": 0.0,
+   "bos_token_id": 1,
+   "decoder_start_token_id": 3,
+   "eos_token_id": 2,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_dropout_prob": 0.0,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 1024,
+   "num_attention_heads": 16,
+   "num_decoder_layers": 32,
+   "num_encoder_layers": 16,
+   "num_key_value_heads": 4,
+   "pad_token_id": 3,
+   "rotary_emb_base": 10000.0,
+   "rotary_emb_dim": 32,
+   "rotary_emb_interleaved": false,
+   "rotary_emb_scale_base": null,
+   "transformers_version": "4.44.2",
+   "use_cache": true,
+   "vocab_size": 48256
+ }
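Since config.json describes a custom architecture rather than a stock T5, the attention geometry is worth decoding. A small check in plain Python using only values from the file above; the partial-RoPE reading at the end is an interpretation, not something the config states:

```python
# Reading the attention geometry out of config.json (values copied from above).
hidden_size, heads, kv_heads, head_dim = 1024, 16, 4, 64
assert heads * head_dim == hidden_size  # full-width query projection
assert heads % kv_heads == 0            # grouped-query attention:
print(heads // kv_heads)                # 4 query heads share each KV head
# rotary_emb_dim 32 means rotary position embeddings cover half of each
# 64-dim head; the other half is left unrotated (a common partial-RoPE setup).
```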
checkpoints/main.log ADDED
@@ -0,0 +1,135 @@
+ [2024-09-02 17:03:02,219][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+ [2024-09-02 17:03:02,226][Main][INFO] - Distributed environment: DistributedType.NO
+ Num processes: 1
+ Process index: 0
+ Local process index: 0
+ Device: cuda
+
+ Mixed precision type: bf16
+
+ [2024-09-02 17:03:02,227][Main][INFO] - Working directory is /workspace/nanoT5/logs/2024-09-02/17-03-02
+ [2024-09-02 17:14:53,691][Main][INFO] - [train] Step 100 out of 65536 | Loss --> 51.971 | Grad_l2 --> 82.676 | Weights_l2 --> 7042.062 | Lr --> 0.010 | Seconds_per_step --> 6.760 |
+ [2024-09-02 17:20:23,699][Main][INFO] - [train] Step 200 out of 65536 | Loss --> 14.150 | Grad_l2 --> 19.390 | Weights_l2 --> 7034.376 | Lr --> 0.010 | Seconds_per_step --> 3.300 |
+ [2024-09-02 17:25:54,840][Main][INFO] - [train] Step 300 out of 65536 | Loss --> 9.006 | Grad_l2 --> 9.061 | Weights_l2 --> 7026.824 | Lr --> 0.010 | Seconds_per_step --> 3.311 |
+ [2024-09-02 17:31:26,095][Main][INFO] - [train] Step 400 out of 65536 | Loss --> 7.529 | Grad_l2 --> 5.889 | Weights_l2 --> 7019.014 | Lr --> 0.010 | Seconds_per_step --> 3.313 |
+ [2024-09-02 17:36:56,190][Main][INFO] - [train] Step 500 out of 65536 | Loss --> 6.618 | Grad_l2 --> 4.039 | Weights_l2 --> 7010.897 | Lr --> 0.011 | Seconds_per_step --> 3.301 |
+ [2024-09-02 17:42:27,693][Main][INFO] - [train] Step 600 out of 65536 | Loss --> 5.994 | Grad_l2 --> 2.962 | Weights_l2 --> 7002.549 | Lr --> 0.011 | Seconds_per_step --> 3.315 |
+ [2024-09-02 17:47:57,967][Main][INFO] - [train] Step 700 out of 65536 | Loss --> 5.703 | Grad_l2 --> 2.434 | Weights_l2 --> 6994.267 | Lr --> 0.011 | Seconds_per_step --> 3.303 |
+ [2024-09-02 17:53:29,228][Main][INFO] - [train] Step 800 out of 65536 | Loss --> 6.603 | Grad_l2 --> 6.221 | Weights_l2 --> 6985.927 | Lr --> 0.011 | Seconds_per_step --> 3.313 |
+ [2024-09-02 17:59:00,011][Main][INFO] - [train] Step 900 out of 65536 | Loss --> 5.408 | Grad_l2 --> 1.465 | Weights_l2 --> 6980.026 | Lr --> 0.011 | Seconds_per_step --> 3.308 |
+ [2024-09-02 18:04:30,275][Main][INFO] - [train] Step 1000 out of 65536 | Loss --> 5.311 | Grad_l2 --> 0.992 | Weights_l2 --> 6975.109 | Lr --> 0.011 | Seconds_per_step --> 3.303 |
+ [2024-09-02 18:10:01,468][Main][INFO] - [train] Step 1100 out of 65536 | Loss --> 5.241 | Grad_l2 --> 0.854 | Weights_l2 --> 6970.708 | Lr --> 0.011 | Seconds_per_step --> 3.312 |
+ [2024-09-02 18:15:33,362][Main][INFO] - [train] Step 1200 out of 65536 | Loss --> 5.180 | Grad_l2 --> 0.838 | Weights_l2 --> 6966.641 | Lr --> 0.011 | Seconds_per_step --> 3.319 |
+ [2024-09-02 18:21:03,902][Main][INFO] - [train] Step 1300 out of 65536 | Loss --> 5.126 | Grad_l2 --> 0.764 | Weights_l2 --> 6962.789 | Lr --> 0.011 | Seconds_per_step --> 3.305 |
+ [2024-09-02 18:26:35,349][Main][INFO] - [train] Step 1400 out of 65536 | Loss --> 5.088 | Grad_l2 --> 0.744 | Weights_l2 --> 6959.146 | Lr --> 0.011 | Seconds_per_step --> 3.314 |
+ [2024-09-02 18:32:06,048][Main][INFO] - [train] Step 1500 out of 65536 | Loss --> 5.046 | Grad_l2 --> 0.702 | Weights_l2 --> 6955.673 | Lr --> 0.012 | Seconds_per_step --> 3.307 |
+ [2024-09-02 18:37:37,903][Main][INFO] - [train] Step 1600 out of 65536 | Loss --> 5.007 | Grad_l2 --> 0.691 | Weights_l2 --> 6952.523 | Lr --> 0.012 | Seconds_per_step --> 3.319 |
+ [2024-09-02 18:43:09,723][Main][INFO] - [train] Step 1700 out of 65536 | Loss --> 4.973 | Grad_l2 --> 0.673 | Weights_l2 --> 6949.412 | Lr --> 0.012 | Seconds_per_step --> 3.318 |
+ [2024-09-02 18:48:40,909][Main][INFO] - [train] Step 1800 out of 65536 | Loss --> 4.943 | Grad_l2 --> 0.671 | Weights_l2 --> 6946.498 | Lr --> 0.012 | Seconds_per_step --> 3.312 |
+ [2024-09-02 18:54:13,524][Main][INFO] - [train] Step 1900 out of 65536 | Loss --> 4.929 | Grad_l2 --> 0.668 | Weights_l2 --> 6943.795 | Lr --> 0.012 | Seconds_per_step --> 3.326 |
+ [2024-09-02 18:59:45,500][Main][INFO] - [train] Step 2000 out of 65536 | Loss --> 4.894 | Grad_l2 --> 0.665 | Weights_l2 --> 6941.241 | Lr --> 0.012 | Seconds_per_step --> 3.320 |
+ [2024-09-02 19:05:16,395][Main][INFO] - [train] Step 2100 out of 65536 | Loss --> 4.881 | Grad_l2 --> 0.713 | Weights_l2 --> 6938.861 | Lr --> 0.012 | Seconds_per_step --> 3.309 |
+ [2024-09-02 19:10:48,520][Main][INFO] - [train] Step 2200 out of 65536 | Loss --> 4.853 | Grad_l2 --> 0.653 | Weights_l2 --> 6936.551 | Lr --> 0.012 | Seconds_per_step --> 3.321 |
+ [2024-09-02 19:16:19,278][Main][INFO] - [train] Step 2300 out of 65536 | Loss --> 4.829 | Grad_l2 --> 0.646 | Weights_l2 --> 6934.357 | Lr --> 0.012 | Seconds_per_step --> 3.308 |
+ [2024-09-02 19:21:51,370][Main][INFO] - [train] Step 2400 out of 65536 | Loss --> 4.790 | Grad_l2 --> 0.620 | Weights_l2 --> 6932.338 | Lr --> 0.012 | Seconds_per_step --> 3.321 |
+ [2024-09-02 19:27:23,544][Main][INFO] - [train] Step 2500 out of 65536 | Loss --> 4.784 | Grad_l2 --> 0.643 | Weights_l2 --> 6930.395 | Lr --> 0.013 | Seconds_per_step --> 3.322 |
+ [2024-09-02 19:32:54,341][Main][INFO] - [train] Step 2600 out of 65536 | Loss --> 4.755 | Grad_l2 --> 0.623 | Weights_l2 --> 6928.543 | Lr --> 0.013 | Seconds_per_step --> 3.308 |
+ [2024-09-02 19:38:25,942][Main][INFO] - [train] Step 2700 out of 65536 | Loss --> 4.743 | Grad_l2 --> 0.636 | Weights_l2 --> 6926.944 | Lr --> 0.013 | Seconds_per_step --> 3.316 |
+ [2024-09-02 19:43:57,708][Main][INFO] - [train] Step 2800 out of 65536 | Loss --> 4.722 | Grad_l2 --> 0.590 | Weights_l2 --> 6925.379 | Lr --> 0.013 | Seconds_per_step --> 3.318 |
+ [2024-09-02 19:49:28,285][Main][INFO] - [train] Step 2900 out of 65536 | Loss --> 4.715 | Grad_l2 --> 0.622 | Weights_l2 --> 6924.007 | Lr --> 0.013 | Seconds_per_step --> 3.306 |
+ [2024-09-02 19:54:59,957][Main][INFO] - [train] Step 3000 out of 65536 | Loss --> 4.694 | Grad_l2 --> 0.652 | Weights_l2 --> 6922.709 | Lr --> 0.013 | Seconds_per_step --> 3.317 |
+ [2024-09-02 20:00:31,072][Main][INFO] - [train] Step 3100 out of 65536 | Loss --> 4.678 | Grad_l2 --> 0.614 | Weights_l2 --> 6921.561 | Lr --> 0.013 | Seconds_per_step --> 3.311 |
+ [2024-09-02 20:06:02,747][Main][INFO] - [train] Step 3200 out of 65536 | Loss --> 4.633 | Grad_l2 --> 0.610 | Weights_l2 --> 6920.463 | Lr --> 0.013 | Seconds_per_step --> 3.317 |
+ [2024-09-02 20:11:34,607][Main][INFO] - [train] Step 3300 out of 65536 | Loss --> 4.599 | Grad_l2 --> 0.638 | Weights_l2 --> 6919.642 | Lr --> 0.013 | Seconds_per_step --> 3.319 |
+ [2024-09-02 20:17:05,731][Main][INFO] - [train] Step 3400 out of 65536 | Loss --> 4.549 | Grad_l2 --> 0.774 | Weights_l2 --> 6919.263 | Lr --> 0.013 | Seconds_per_step --> 3.311 |
+ [2024-09-02 20:22:37,601][Main][INFO] - [train] Step 3500 out of 65536 | Loss --> 4.420 | Grad_l2 --> 0.934 | Weights_l2 --> 6918.974 | Lr --> 0.014 | Seconds_per_step --> 3.319 |
+ [2024-09-02 20:28:09,554][Main][INFO] - [train] Step 3600 out of 65536 | Loss --> 4.256 | Grad_l2 --> 0.763 | Weights_l2 --> 6919.477 | Lr --> 0.014 | Seconds_per_step --> 3.319 |
+ [2024-09-02 20:33:40,654][Main][INFO] - [train] Step 3700 out of 65536 | Loss --> 4.131 | Grad_l2 --> 0.657 | Weights_l2 --> 6920.705 | Lr --> 0.014 | Seconds_per_step --> 3.311 |
+ [2024-09-02 20:39:13,064][Main][INFO] - [train] Step 3800 out of 65536 | Loss --> 4.021 | Grad_l2 --> 0.709 | Weights_l2 --> 6922.188 | Lr --> 0.014 | Seconds_per_step --> 3.324 |
+ [2024-09-02 20:44:45,663][Main][INFO] - [train] Step 3900 out of 65536 | Loss --> 3.909 | Grad_l2 --> 0.637 | Weights_l2 --> 6923.666 | Lr --> 0.014 | Seconds_per_step --> 3.326 |
+ [2024-09-02 20:50:16,811][Main][INFO] - [train] Step 4000 out of 65536 | Loss --> 3.855 | Grad_l2 --> 1.013 | Weights_l2 --> 6923.778 | Lr --> 0.014 | Seconds_per_step --> 3.311 |
+ [2024-09-02 20:55:49,235][Main][INFO] - [train] Step 4100 out of 65536 | Loss --> 3.770 | Grad_l2 --> 0.589 | Weights_l2 --> 6925.545 | Lr --> 0.014 | Seconds_per_step --> 3.324 |
+ [2024-09-02 21:01:20,500][Main][INFO] - [train] Step 4200 out of 65536 | Loss --> 3.710 | Grad_l2 --> 0.579 | Weights_l2 --> 6927.200 | Lr --> 0.014 | Seconds_per_step --> 3.313 |
+ [2024-09-02 21:06:53,406][Main][INFO] - [train] Step 4300 out of 65536 | Loss --> 3.651 | Grad_l2 --> 0.588 | Weights_l2 --> 6928.842 | Lr --> 0.014 | Seconds_per_step --> 3.329 |
+ [2024-09-02 21:12:26,298][Main][INFO] - [train] Step 4400 out of 65536 | Loss --> 3.614 | Grad_l2 --> 0.632 | Weights_l2 --> 6930.597 | Lr --> 0.014 | Seconds_per_step --> 3.329 |
+ [2024-09-02 21:17:57,623][Main][INFO] - [train] Step 4500 out of 65536 | Loss --> 3.582 | Grad_l2 --> 0.884 | Weights_l2 --> 6931.569 | Lr --> 0.015 | Seconds_per_step --> 3.313 |
+ [2024-09-02 21:23:30,116][Main][INFO] - [train] Step 4600 out of 65536 | Loss --> 3.527 | Grad_l2 --> 0.582 | Weights_l2 --> 6933.783 | Lr --> 0.015 | Seconds_per_step --> 3.325 |
+ [2024-09-02 21:29:02,417][Main][INFO] - [train] Step 4700 out of 65536 | Loss --> 3.476 | Grad_l2 --> 0.549 | Weights_l2 --> 6935.959 | Lr --> 0.015 | Seconds_per_step --> 3.323 |
+ [2024-09-02 21:34:33,535][Main][INFO] - [train] Step 4800 out of 65536 | Loss --> 3.430 | Grad_l2 --> 0.551 | Weights_l2 --> 6938.224 | Lr --> 0.015 | Seconds_per_step --> 3.311 |
+ [2024-09-02 21:40:05,905][Main][INFO] - [train] Step 4900 out of 65536 | Loss --> 3.395 | Grad_l2 --> 0.550 | Weights_l2 --> 6940.617 | Lr --> 0.015 | Seconds_per_step --> 3.324 |
+ [2024-09-02 21:45:36,944][Main][INFO] - [train] Step 5000 out of 65536 | Loss --> 3.366 | Grad_l2 --> 0.546 | Weights_l2 --> 6943.230 | Lr --> 0.015 | Seconds_per_step --> 3.310 |
+ [2024-09-02 21:45:36,947][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-5000
+ [2024-09-02 21:45:36,954][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
+ [2024-09-02 21:45:44,182][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-5000/model.safetensors
+ [2024-09-02 21:45:54,822][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-5000/optimizer.bin
+ [2024-09-02 21:45:54,827][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-5000/scheduler.bin
+ [2024-09-02 21:45:54,828][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-5000/sampler.bin
+ [2024-09-02 21:45:54,829][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-5000/sampler_1.bin
+ [2024-09-02 21:45:54,835][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-5000/random_states_0.pkl
+ [2024-09-02 21:51:26,402][Main][INFO] - [train] Step 5100 out of 65536 | Loss --> 3.302 | Grad_l2 --> 0.541 | Weights_l2 --> 6946.278 | Lr --> 0.015 | Seconds_per_step --> 3.495 |
+ [2024-09-02 21:56:58,321][Main][INFO] - [train] Step 5200 out of 65536 | Loss --> 3.248 | Grad_l2 --> 0.556 | Weights_l2 --> 6950.060 | Lr --> 0.015 | Seconds_per_step --> 3.319 |
+ [2024-09-02 22:02:29,452][Main][INFO] - [train] Step 5300 out of 65536 | Loss --> 3.194 | Grad_l2 --> 0.566 | Weights_l2 --> 6954.461 | Lr --> 0.015 | Seconds_per_step --> 3.311 |
+ [2024-09-02 22:08:01,594][Main][INFO] - [train] Step 5400 out of 65536 | Loss --> 3.144 | Grad_l2 --> 0.548 | Weights_l2 --> 6959.061 | Lr --> 0.015 | Seconds_per_step --> 3.321 |
+ [2024-09-02 22:13:33,473][Main][INFO] - [train] Step 5500 out of 65536 | Loss --> 3.099 | Grad_l2 --> 0.546 | Weights_l2 --> 6963.676 | Lr --> 0.016 | Seconds_per_step --> 3.319 |
+ [2024-09-02 22:19:04,763][Main][INFO] - [train] Step 5600 out of 65536 | Loss --> 3.044 | Grad_l2 --> 0.531 | Weights_l2 --> 6968.055 | Lr --> 0.016 | Seconds_per_step --> 3.313 |
+ [2024-09-02 22:24:37,024][Main][INFO] - [train] Step 5700 out of 65536 | Loss --> 3.023 | Grad_l2 --> 0.528 | Weights_l2 --> 6972.595 | Lr --> 0.016 | Seconds_per_step --> 3.323 |
+ [2024-09-02 22:30:08,010][Main][INFO] - [train] Step 5800 out of 65536 | Loss --> 2.999 | Grad_l2 --> 0.529 | Weights_l2 --> 6977.095 | Lr --> 0.016 | Seconds_per_step --> 3.310 |
+ [2024-09-02 22:35:40,260][Main][INFO] - [train] Step 5900 out of 65536 | Loss --> 2.953 | Grad_l2 --> 0.516 | Weights_l2 --> 6981.522 | Lr --> 0.016 | Seconds_per_step --> 3.322 |
+ [2024-09-02 22:41:12,494][Main][INFO] - [train] Step 6000 out of 65536 | Loss --> 2.924 | Grad_l2 --> 0.514 | Weights_l2 --> 6985.860 | Lr --> 0.016 | Seconds_per_step --> 3.322 |
+ [2024-09-02 22:46:43,439][Main][INFO] - [train] Step 6100 out of 65536 | Loss --> 2.904 | Grad_l2 --> 0.500 | Weights_l2 --> 6990.209 | Lr --> 0.016 | Seconds_per_step --> 3.309 |
+ [2024-09-02 22:52:15,361][Main][INFO] - [train] Step 6200 out of 65536 | Loss --> 2.885 | Grad_l2 --> 0.499 | Weights_l2 --> 6994.575 | Lr --> 0.016 | Seconds_per_step --> 3.319 |
+ [2024-09-02 22:57:47,371][Main][INFO] - [train] Step 6300 out of 65536 | Loss --> 2.860 | Grad_l2 --> 0.496 | Weights_l2 --> 6998.855 | Lr --> 0.016 | Seconds_per_step --> 3.320 |
+ [2024-09-02 23:03:18,243][Main][INFO] - [train] Step 6400 out of 65536 | Loss --> 2.828 | Grad_l2 --> 0.486 | Weights_l2 --> 7003.354 | Lr --> 0.016 | Seconds_per_step --> 3.309 |
+ [2024-09-02 23:08:50,256][Main][INFO] - [train] Step 6500 out of 65536 | Loss --> 2.823 | Grad_l2 --> 0.491 | Weights_l2 --> 7007.772 | Lr --> 0.017 | Seconds_per_step --> 3.320 |
+ [2024-09-02 23:14:21,254][Main][INFO] - [train] Step 6600 out of 65536 | Loss --> 2.801 | Grad_l2 --> 0.572 | Weights_l2 --> 7012.034 | Lr --> 0.017 | Seconds_per_step --> 3.310 |
+ [2024-09-02 23:19:53,383][Main][INFO] - [train] Step 6700 out of 65536 | Loss --> 2.776 | Grad_l2 --> 0.473 | Weights_l2 --> 7016.624 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
+ [2024-09-02 23:25:25,894][Main][INFO] - [train] Step 6800 out of 65536 | Loss --> 2.764 | Grad_l2 --> 0.489 | Weights_l2 --> 7021.128 | Lr --> 0.017 | Seconds_per_step --> 3.325 |
+ [2024-09-02 23:30:56,990][Main][INFO] - [train] Step 6900 out of 65536 | Loss --> 2.754 | Grad_l2 --> 0.467 | Weights_l2 --> 7025.909 | Lr --> 0.017 | Seconds_per_step --> 3.311 |
+ [2024-09-02 23:36:28,837][Main][INFO] - [train] Step 7000 out of 65536 | Loss --> 2.716 | Grad_l2 --> 0.469 | Weights_l2 --> 7030.583 | Lr --> 0.017 | Seconds_per_step --> 3.318 |
+ [2024-09-02 23:42:00,897][Main][INFO] - [train] Step 7100 out of 65536 | Loss --> 2.706 | Grad_l2 --> 0.470 | Weights_l2 --> 7035.338 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
+ [2024-09-02 23:47:31,913][Main][INFO] - [train] Step 7200 out of 65536 | Loss --> 2.685 | Grad_l2 --> 0.460 | Weights_l2 --> 7040.107 | Lr --> 0.017 | Seconds_per_step --> 3.310 |
+ [2024-09-02 23:53:04,028][Main][INFO] - [train] Step 7300 out of 65536 | Loss --> 2.675 | Grad_l2 --> 0.462 | Weights_l2 --> 7044.921 | Lr --> 0.017 | Seconds_per_step --> 3.321 |
+ [2024-09-02 23:58:35,224][Main][INFO] - [train] Step 7400 out of 65536 | Loss --> 2.670 | Grad_l2 --> 0.473 | Weights_l2 --> 7049.994 | Lr --> 0.017 | Seconds_per_step --> 3.312 |
+ [2024-09-03 00:04:07,495][Main][INFO] - [train] Step 7500 out of 65536 | Loss --> 2.653 | Grad_l2 --> 0.452 | Weights_l2 --> 7055.123 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
+ [2024-09-03 00:09:39,687][Main][INFO] - [train] Step 7600 out of 65536 | Loss --> 2.644 | Grad_l2 --> 0.499 | Weights_l2 --> 7060.263 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+ [2024-09-03 00:15:11,125][Main][INFO] - [train] Step 7700 out of 65536 | Loss --> 2.619 | Grad_l2 --> 0.451 | Weights_l2 --> 7065.593 | Lr --> 0.018 | Seconds_per_step --> 3.314 |
+ [2024-09-03 00:20:43,656][Main][INFO] - [train] Step 7800 out of 65536 | Loss --> 2.611 | Grad_l2 --> 0.444 | Weights_l2 --> 7071.016 | Lr --> 0.018 | Seconds_per_step --> 3.325 |
+ [2024-09-03 00:26:15,825][Main][INFO] - [train] Step 7900 out of 65536 | Loss --> 2.593 | Grad_l2 --> 0.444 | Weights_l2 --> 7076.338 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+ [2024-09-03 00:31:46,986][Main][INFO] - [train] Step 8000 out of 65536 | Loss --> 2.591 | Grad_l2 --> 0.707 | Weights_l2 --> 7081.619 | Lr --> 0.018 | Seconds_per_step --> 3.312 |
+ [2024-09-03 00:37:19,240][Main][INFO] - [train] Step 8100 out of 65536 | Loss --> 2.583 | Grad_l2 --> 0.504 | Weights_l2 --> 7087.303 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
+ [2024-09-03 00:42:50,497][Main][INFO] - [train] Step 8200 out of 65536 | Loss --> 2.572 | Grad_l2 --> 0.435 | Weights_l2 --> 7092.976 | Lr --> 0.018 | Seconds_per_step --> 3.313 |
+ [2024-09-03 00:48:22,669][Main][INFO] - [train] Step 8300 out of 65536 | Loss --> 2.550 | Grad_l2 --> 0.444 | Weights_l2 --> 7098.242 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+ [2024-09-03 00:53:54,859][Main][INFO] - [train] Step 8400 out of 65536 | Loss --> 2.533 | Grad_l2 --> 0.424 | Weights_l2 --> 7103.870 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
+ [2024-09-03 00:59:25,959][Main][INFO] - [train] Step 8500 out of 65536 | Loss --> 2.520 | Grad_l2 --> 0.415 | Weights_l2 --> 7109.426 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
+ [2024-09-03 01:04:58,102][Main][INFO] - [train] Step 8600 out of 65536 | Loss --> 2.512 | Grad_l2 --> 0.445 | Weights_l2 --> 7115.243 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
+ [2024-09-03 01:10:30,308][Main][INFO] - [train] Step 8700 out of 65536 | Loss --> 2.497 | Grad_l2 --> 0.416 | Weights_l2 --> 7120.917 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
+ [2024-09-03 01:16:01,412][Main][INFO] - [train] Step 8800 out of 65536 | Loss --> 2.503 | Grad_l2 --> 0.453 | Weights_l2 --> 7127.067 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
+ [2024-09-03 01:21:33,679][Main][INFO] - [train] Step 8900 out of 65536 | Loss --> 2.498 | Grad_l2 --> 0.519 | Weights_l2 --> 7133.268 | Lr --> 0.019 | Seconds_per_step --> 3.323 |
+ [2024-09-03 01:27:05,633][Main][INFO] - [train] Step 9000 out of 65536 | Loss --> 2.480 | Grad_l2 --> 0.413 | Weights_l2 --> 7139.449 | Lr --> 0.019 | Seconds_per_step --> 3.320 |
+ [2024-09-03 01:32:36,839][Main][INFO] - [train] Step 9100 out of 65536 | Loss --> 2.488 | Grad_l2 --> 0.429 | Weights_l2 --> 7145.663 | Lr --> 0.019 | Seconds_per_step --> 3.312 |
+ [2024-09-03 01:38:09,090][Main][INFO] - [train] Step 9200 out of 65536 | Loss --> 2.458 | Grad_l2 --> 0.651 | Weights_l2 --> 7151.751 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
+ [2024-09-03 01:43:40,183][Main][INFO] - [train] Step 9300 out of 65536 | Loss --> 2.481 | Grad_l2 --> 0.667 | Weights_l2 --> 7157.979 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
+ [2024-09-03 01:49:12,323][Main][INFO] - [train] Step 9400 out of 65536 | Loss --> 2.454 | Grad_l2 --> 0.500 | Weights_l2 --> 7164.722 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
+ [2024-09-03 01:54:44,360][Main][INFO] - [train] Step 9500 out of 65536 | Loss --> 2.434 | Grad_l2 --> 0.434 | Weights_l2 --> 7171.100 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
+ [2024-09-03 02:00:15,384][Main][INFO] - [train] Step 9600 out of 65536 | Loss --> 2.430 | Grad_l2 --> 0.459 | Weights_l2 --> 7177.669 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
+ [2024-09-03 02:05:47,653][Main][INFO] - [train] Step 9700 out of 65536 | Loss --> 2.435 | Grad_l2 --> 0.458 | Weights_l2 --> 7184.407 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
+ [2024-09-03 02:11:19,839][Main][INFO] - [train] Step 9800 out of 65536 | Loss --> 2.431 | Grad_l2 --> 0.796 | Weights_l2 --> 7190.992 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
+ [2024-09-03 02:16:50,929][Main][INFO] - [train] Step 9900 out of 65536 | Loss --> 2.403 | Grad_l2 --> 0.782 | Weights_l2 --> 7197.863 | Lr --> 0.020 | Seconds_per_step --> 3.311 |
+ [2024-09-03 02:22:23,236][Main][INFO] - [train] Step 10000 out of 65536 | Loss --> 2.445 | Grad_l2 --> 1.140 | Weights_l2 --> 7204.637 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
+ [2024-09-03 02:22:23,238][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-10000
+ [2024-09-03 02:22:23,245][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
+ [2024-09-03 02:22:29,395][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-10000/model.safetensors
+ [2024-09-03 02:22:38,780][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-10000/optimizer.bin
+ [2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-10000/scheduler.bin
+ [2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-10000/sampler.bin
+ [2024-09-03 02:22:38,785][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-10000/sampler_1.bin
+ [2024-09-03 02:22:38,790][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-10000/random_states_0.pkl
+ [2024-09-03 02:28:09,713][Main][INFO] - [train] Step 10100 out of 65536 | Loss --> 2.441 | Grad_l2 --> 1.063 | Weights_l2 --> 7212.671 | Lr --> 0.020 | Seconds_per_step --> 3.465 |
+ [2024-09-03 02:33:42,096][Main][INFO] - [train] Step 10200 out of 65536 | Loss --> 2.421 | Grad_l2 --> 1.135 | Weights_l2 --> 7219.539 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
+ [2024-09-03 02:39:14,331][Main][INFO] - [train] Step 10300 out of 65536 | Loss --> 2.408 | Grad_l2 --> 1.377 | Weights_l2 --> 7226.397 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
+ [2024-09-03 02:44:45,309][Main][INFO] - [train] Step 10400 out of 65536 | Loss --> 2.385 | Grad_l2 --> 1.568 | Weights_l2 --> 7232.973 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
+ [2024-09-03 02:50:17,356][Main][INFO] - [train] Step 10500 out of 65536 | Loss --> 2.383 | Grad_l2 --> 5.267 | Weights_l2 --> 7238.788 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
+ [2024-09-03 02:55:49,191][Main][INFO] - [train] Step 10600 out of 65536 | Loss --> 51.695 | Grad_l2 --> 2316.455 | Weights_l2 --> 7233.899 | Lr --> 0.020 | Seconds_per_step --> 3.318 |
+ [2024-09-03 03:01:20,350][Main][INFO] - [train] Step 10700 out of 65536 | Loss --> 19.189 | Grad_l2 --> 206.407 | Weights_l2 --> 7221.798 | Lr --> 0.020 | Seconds_per_step --> 3.312 |
+ [2024-09-03 03:06:52,743][Main][INFO] - [train] Step 10800 out of 65536 | Loss --> 6.908 | Grad_l2 --> 26.249 | Weights_l2 --> 7210.980 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
+ [2024-09-03 03:12:23,733][Main][INFO] - [train] Step 10900 out of 65536 | Loss --> 42.736 | Grad_l2 --> 1292.659 | Weights_l2 --> 7206.464 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
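The log ends in a divergence: at step 10600 the loss jumps from 2.38 to 51.7 and the gradient L2 spikes to 2316, right after the 10,000-step warmup tops out at the 0.02 base LR, which is presumably why the last clean state, checkpoint-pt-10000, is the one of interest. A minimal resume sketch with Accelerate, assuming the model, optimizer, scheduler, and dataloaders are rebuilt and prepared exactly as in training:

```python
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision="bf16")
# model, optimizer, scheduler, train_dl, eval_dl = accelerator.prepare(...)
accelerator.load_state("checkpoint-pt-10000")
# restores optimizer.bin, scheduler.bin, both sampler states, and
# random_states_0.pkl listed in the log above
```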
checkpoints/wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/debug.log ADDED
@@ -0,0 +1,28 @@
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Configure stats pid to 6499
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/settings
+ 2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+ 2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+ 2024-09-02 17:03:04,742 WARNING MainThread:6499 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying login settings: {}
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug.log
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():607] calling init triggers
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+ config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02'}
+ 2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():657] starting backend
+ 2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():661] setting up manager
+ 2024-09-02 17:03:04,760 INFO MainThread:6499 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-09-02 17:03:04,761 INFO MainThread:6499 [wandb_init.py:init():669] backend started and connected
+ 2024-09-02 17:03:04,776 INFO MainThread:6499 [wandb_init.py:init():767] updated telemetry
+ 2024-09-02 17:03:04,819 INFO MainThread:6499 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+ 2024-09-02 17:03:05,519 INFO MainThread:6499 [wandb_init.py:init():851] starting run threads in backend
+ 2024-09-02 17:03:05,817 INFO MainThread:6499 [wandb_run.py:_console_start():2463] atexit reg
+ 2024-09-02 17:03:05,818 INFO MainThread:6499 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+ 2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2374] Wrapping output streams.
+ 2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2399] Redirects installed.
+ 2024-09-02 17:03:05,822 INFO MainThread:6499 [wandb_init.py:init():894] run started, returning control to user process
+ 2024-09-02 17:03:35,512 INFO MainThread:6499 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02', 'n_all_param': 673076736}
+ 2024-09-03 03:17:10,763 WARNING MsgRouterThr:6499 [router.py:message_loop():77] message_loop has been closed
checkpoints/wandb/run-20240902_170304-v43qltex/files/config.yaml ADDED
@@ -0,0 +1,132 @@
+ wandb_version: 1
+
+ mode:
+   desc: null
+   value: pt
+ device:
+   desc: null
+   value: gpu
+ precision:
+   desc: null
+   value: bf16
+ eval_only:
+   desc: null
+   value: false
+ predict_only:
+   desc: null
+   value: false
+ seed:
+   desc: null
+   value: 2137
+ tokenizer:
+   desc: null
+   value:
+     name: BEE-spoke-data/slimpajama_tok-48128-BPE-forT5
+ model:
+   desc: null
+   value:
+     klass: custom_seq2seq
+     name: google/t5-v1_1-base
+     overwrite: null
+     add_config: null
+     checkpoint_path: ''
+     random_init: true
+     compile: true
+ data:
+   desc: null
+   value:
+     input_length: 1024
+     mlm_probability: 0.15
+     mean_noise_span_length: 3.0
+     num_workers: 8
+     before_mask_input_length: 1137
+     target_length: 229
+ optim:
+   desc: null
+   value:
+     name: adamwscale
+     base_lr: 0.02
+     batch_size: 64
+     total_steps: 65536
+     epochs: -1
+     warmup_steps: 10000
+     lr_scheduler: cosine
+     weight_decay: 0.001
+     grad_clip: 1.0
+     grad_acc: 4
+     final_cosine: 1.0e-05
+ eval:
+   desc: null
+   value:
+     every_steps: 100000
+     steps: 500
+     corrected_steps: 500
+ checkpoint:
+   desc: null
+   value:
+     every_steps: 5000
+ logging:
+   desc: null
+   value:
+     every_steps: 100
+     grad_l2: true
+     weights_l2: true
+     use_wandb: true
+     wandb_config:
+       project: nano-custom-seq2seq
+       entity: amazingvince
+       tags:
+       - nanoT5
+       - my_tag
+       mode: online
+ slurm_id:
+   desc: null
+   value: none
+ working_dir:
+   desc: null
+   value: /workspace/nanoT5/logs/2024-09-02/17-03-02
+ _wandb:
+   desc: null
+   value:
+     python_version: 3.11.9
+     cli_version: 0.17.8
+     framework: huggingface
+     huggingface_version: 4.44.2
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1725296584
+     t:
+       1:
+       - 1
+       - 11
+       - 41
+       - 49
+       - 50
+       - 51
+       - 55
+       - 71
+       - 100
+       2:
+       - 1
+       - 11
+       - 41
+       - 49
+       - 50
+       - 51
+       - 55
+       - 71
+       - 100
+       3:
+       - 15
+       - 16
+       - 23
+       - 61
+       4: 3.11.9
+       5: 0.17.8
+       6: 4.44.2
+       8:
+       - 5
+       13: linux-x86_64
+ n_all_param:
+   desc: null
+   value: 673076736
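Unlike the Hydra dump, this W&B snapshot records the derived packing lengths (`before_mask_input_length: 1137`, `target_length: 229`). They follow from `input_length`, `mlm_probability`, and `mean_noise_span_length` via the standard T5 span-corruption arithmetic; a sketch of that calculation (assuming, as in the original `random_spans_helper`, one sentinel token per noise span plus an EOS token):

```python
def corrupted_lengths(tokens_length, noise_density=0.15, mean_span_length=3.0):
    num_noise = int(round(tokens_length * noise_density))
    num_spans = int(round(num_noise / mean_span_length))
    num_nonnoise = tokens_length - num_noise
    # inputs: non-noise tokens + one sentinel per span + EOS
    # targets: noise tokens + one sentinel per span + EOS
    return num_nonnoise + num_spans + 1, num_noise + num_spans + 1

print(corrupted_lengths(1137))  # (1024, 229): 1137 raw tokens pack into
                                # exactly input_length=1024 and target_length=229
```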
checkpoints/wandb/run-20240902_170304-v43qltex/files/output.log ADDED
@@ -0,0 +1,253 @@
+ loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/tokenizer.model
+ loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/tokenizer.json
+ loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/added_tokens.json
+ loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/special_tokens_map.json
+ loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--BEE-spoke-data--slimpajama_tok-48128-BPE-forT5/snapshots/daab53ce6ef3ec52824b04e85fe8cf762739b407/tokenizer_config.json
+ Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+ Resolving data files: 100%|████████████████| 104/104 [00:00<00:00, 237.82it/s]
+ Resolving data files: 100%|████████████████| 234/234 [00:00<00:00, 158787.76it/s]
+ Resolving data files: 100%|████████████████| 17/17 [00:00<00:00, 45.39it/s]
+ Resolving data files: 100%|████████████████| 17/17 [00:00<00:00, 50142.87it/s]
+ Resolving data files: 100%|████████████████| 104/104 [00:00<00:00, 96827.44it/s]
+ Resolving data files: 100%|████████████████| 104/104 [00:00<00:00, 98644.87it/s]
+
+
+ Resolving data files: 100%|████████████████| 1024/1024 [00:04<00:00, 223.35it/s]
+ Resolving data files: 100%|████████████████| 1024/1024 [00:01<00:00, 908.08it/s]
+ Configuration saved in ./config.json
+ =========================================================================
+ Layer (type:depth-idx)          Output Shape          Param #     Trainable
+ =========================================================================
+ CustomSeq2SeqLLM                                      673,076,736 True
+ Embedding                                             49,414,144  True
+ CustomEncoder                                         193,012,736 True
+ ModuleList                                            193,011,712 True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ EncoderLayer                                          12,063,232  True
+ RMSNorm                                               1,024       True
+ CustomDecoder                                         430,649,856 True
+ ModuleList                                            430,648,832 True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          12,063,232  True
+ DecoderLayer                                          14,688,256  True
+ DecoderLayer                                          14,688,256  True
+ RMSNorm                                               1,024       True
+ Linear                                                49,414,144  True
+ LigerCrossEntropyLoss                                 --          False
+ =========================================================================
+ Total params: 673,076,736
+ Trainable params: 673,076,736
+ Non-trainable params: --
+ =========================================================================
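The layer counts and totals in this summary are internally consistent; a quick check in plain Python, using only numbers printed above and in config.json. The reading of the two DecoderLayer sizes is an assumption: the 17 larger layers presumably carry an extra (e.g. cross-attention) block that the 15 smaller ones lack.

```python
# Sanity-checking the layer summary against config.json.
emb = 48256 * 1024                 # vocab_size * hidden_size
assert emb == 49_414_144           # Embedding row (and the Linear lm_head row)

encoder = 16 * 12_063_232 + 1_024  # 16 EncoderLayers + final RMSNorm
assert encoder == 193_012_736      # CustomEncoder row

# 32 DecoderLayers in two sizes: 17 larger, 15 smaller (+ final RMSNorm).
decoder = 17 * 14_688_256 + 15 * 12_063_232 + 1_024
assert decoder == 430_649_856      # CustomDecoder row

# lm_head.weight is tied to the embedding (see the "Removed shared tensor"
# warning in main.log), so it is not counted twice in the grand total.
assert emb + encoder + decoder == 673_076_736
```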
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] Graph break from `Tensor.item()`, consider setting:
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] torch._dynamo.config.capture_scalar_outputs = True
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] or:
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0] to include these operations in the captured graph.
+ W0902 17:08:31.372000 140679957673088 torch/_dynamo/variables/tensor.py:715] [30/0]
173
+ [2024-09-03 00:09:39,687][Main][INFO] - [train] Step 7600 out of 65536 | Loss --> 2.644 | Grad_l2 --> 0.499 | Weights_l2 --> 7060.263 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
174
+ [2024-09-03 00:15:11,125][Main][INFO] - [train] Step 7700 out of 65536 | Loss --> 2.619 | Grad_l2 --> 0.451 | Weights_l2 --> 7065.593 | Lr --> 0.018 | Seconds_per_step --> 3.314 |
175
+ [2024-09-03 00:20:43,656][Main][INFO] - [train] Step 7800 out of 65536 | Loss --> 2.611 | Grad_l2 --> 0.444 | Weights_l2 --> 7071.016 | Lr --> 0.018 | Seconds_per_step --> 3.325 |
176
+ [2024-09-03 00:26:15,825][Main][INFO] - [train] Step 7900 out of 65536 | Loss --> 2.593 | Grad_l2 --> 0.444 | Weights_l2 --> 7076.338 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
177
+ [2024-09-03 00:31:46,986][Main][INFO] - [train] Step 8000 out of 65536 | Loss --> 2.591 | Grad_l2 --> 0.707 | Weights_l2 --> 7081.619 | Lr --> 0.018 | Seconds_per_step --> 3.312 |
178
+ [2024-09-03 00:37:19,240][Main][INFO] - [train] Step 8100 out of 65536 | Loss --> 2.583 | Grad_l2 --> 0.504 | Weights_l2 --> 7087.303 | Lr --> 0.018 | Seconds_per_step --> 3.323 |
179
+ [2024-09-03 00:42:50,497][Main][INFO] - [train] Step 8200 out of 65536 | Loss --> 2.572 | Grad_l2 --> 0.435 | Weights_l2 --> 7092.976 | Lr --> 0.018 | Seconds_per_step --> 3.313 |
180
+ [2024-09-03 00:48:22,669][Main][INFO] - [train] Step 8300 out of 65536 | Loss --> 2.550 | Grad_l2 --> 0.444 | Weights_l2 --> 7098.242 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
181
+ [2024-09-03 00:53:54,859][Main][INFO] - [train] Step 8400 out of 65536 | Loss --> 2.533 | Grad_l2 --> 0.424 | Weights_l2 --> 7103.870 | Lr --> 0.018 | Seconds_per_step --> 3.322 |
182
+ [2024-09-03 00:59:25,959][Main][INFO] - [train] Step 8500 out of 65536 | Loss --> 2.520 | Grad_l2 --> 0.415 | Weights_l2 --> 7109.426 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
183
+ [2024-09-03 01:04:58,102][Main][INFO] - [train] Step 8600 out of 65536 | Loss --> 2.512 | Grad_l2 --> 0.445 | Weights_l2 --> 7115.243 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
184
+ [2024-09-03 01:10:30,308][Main][INFO] - [train] Step 8700 out of 65536 | Loss --> 2.497 | Grad_l2 --> 0.416 | Weights_l2 --> 7120.917 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
185
+ [2024-09-03 01:16:01,412][Main][INFO] - [train] Step 8800 out of 65536 | Loss --> 2.503 | Grad_l2 --> 0.453 | Weights_l2 --> 7127.067 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
186
+ [2024-09-03 01:21:33,679][Main][INFO] - [train] Step 8900 out of 65536 | Loss --> 2.498 | Grad_l2 --> 0.519 | Weights_l2 --> 7133.268 | Lr --> 0.019 | Seconds_per_step --> 3.323 |
187
+ [2024-09-03 01:27:05,633][Main][INFO] - [train] Step 9000 out of 65536 | Loss --> 2.480 | Grad_l2 --> 0.413 | Weights_l2 --> 7139.449 | Lr --> 0.019 | Seconds_per_step --> 3.320 |
188
+ [2024-09-03 01:32:36,839][Main][INFO] - [train] Step 9100 out of 65536 | Loss --> 2.488 | Grad_l2 --> 0.429 | Weights_l2 --> 7145.663 | Lr --> 0.019 | Seconds_per_step --> 3.312 |
189
+ [2024-09-03 01:38:09,090][Main][INFO] - [train] Step 9200 out of 65536 | Loss --> 2.458 | Grad_l2 --> 0.651 | Weights_l2 --> 7151.751 | Lr --> 0.019 | Seconds_per_step --> 3.322 |
190
+ [2024-09-03 01:43:40,183][Main][INFO] - [train] Step 9300 out of 65536 | Loss --> 2.481 | Grad_l2 --> 0.667 | Weights_l2 --> 7157.979 | Lr --> 0.019 | Seconds_per_step --> 3.311 |
191
+ [2024-09-03 01:49:12,323][Main][INFO] - [train] Step 9400 out of 65536 | Loss --> 2.454 | Grad_l2 --> 0.500 | Weights_l2 --> 7164.722 | Lr --> 0.019 | Seconds_per_step --> 3.321 |
192
+ [2024-09-03 01:54:44,360][Main][INFO] - [train] Step 9500 out of 65536 | Loss --> 2.434 | Grad_l2 --> 0.434 | Weights_l2 --> 7171.100 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
193
+ [2024-09-03 02:00:15,384][Main][INFO] - [train] Step 9600 out of 65536 | Loss --> 2.430 | Grad_l2 --> 0.459 | Weights_l2 --> 7177.669 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
194
+ [2024-09-03 02:05:47,653][Main][INFO] - [train] Step 9700 out of 65536 | Loss --> 2.435 | Grad_l2 --> 0.458 | Weights_l2 --> 7184.407 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
195
+ [2024-09-03 02:11:19,839][Main][INFO] - [train] Step 9800 out of 65536 | Loss --> 2.431 | Grad_l2 --> 0.796 | Weights_l2 --> 7190.992 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
196
+ [2024-09-03 02:16:50,929][Main][INFO] - [train] Step 9900 out of 65536 | Loss --> 2.403 | Grad_l2 --> 0.782 | Weights_l2 --> 7197.863 | Lr --> 0.020 | Seconds_per_step --> 3.311 |
197
+ [2024-09-03 02:22:23,236][Main][INFO] - [train] Step 10000 out of 65536 | Loss --> 2.445 | Grad_l2 --> 1.140 | Weights_l2 --> 7204.637 | Lr --> 0.020 | Seconds_per_step --> 3.323 |
198
+ [2024-09-03 02:22:23,238][accelerate.accelerator][INFO] - Saving current state to checkpoint-pt-10000
199
+ [2024-09-03 02:22:23,245][accelerate.utils.other][WARNING] - Removed shared tensor {'lm_head.weight'} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading
200
+ [2024-09-03 02:22:29,395][accelerate.checkpointing][INFO] - Model weights saved in checkpoint-pt-10000/model.safetensors
201
+ [2024-09-03 02:22:38,780][accelerate.checkpointing][INFO] - Optimizer state saved in checkpoint-pt-10000/optimizer.bin
202
+ [2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Scheduler state saved in checkpoint-pt-10000/scheduler.bin
203
+ [2024-09-03 02:22:38,784][accelerate.checkpointing][INFO] - Sampler state for dataloader 0 saved in checkpoint-pt-10000/sampler.bin
204
+ [2024-09-03 02:22:38,785][accelerate.checkpointing][INFO] - Sampler state for dataloader 1 saved in checkpoint-pt-10000/sampler_1.bin
205
+ [2024-09-03 02:22:38,790][accelerate.checkpointing][INFO] - Random states saved in checkpoint-pt-10000/random_states_0.pkl
206
+ [2024-09-03 02:28:09,713][Main][INFO] - [train] Step 10100 out of 65536 | Loss --> 2.441 | Grad_l2 --> 1.063 | Weights_l2 --> 7212.671 | Lr --> 0.020 | Seconds_per_step --> 3.465 |
207
+ [2024-09-03 02:33:42,096][Main][INFO] - [train] Step 10200 out of 65536 | Loss --> 2.421 | Grad_l2 --> 1.135 | Weights_l2 --> 7219.539 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
208
+ [2024-09-03 02:39:14,331][Main][INFO] - [train] Step 10300 out of 65536 | Loss --> 2.408 | Grad_l2 --> 1.377 | Weights_l2 --> 7226.397 | Lr --> 0.020 | Seconds_per_step --> 3.322 |
209
+ [2024-09-03 02:44:45,309][Main][INFO] - [train] Step 10400 out of 65536 | Loss --> 2.385 | Grad_l2 --> 1.568 | Weights_l2 --> 7232.973 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
210
+ [2024-09-03 02:50:17,356][Main][INFO] - [train] Step 10500 out of 65536 | Loss --> 2.383 | Grad_l2 --> 5.267 | Weights_l2 --> 7238.788 | Lr --> 0.020 | Seconds_per_step --> 3.320 |
211
+ [2024-09-03 02:55:49,191][Main][INFO] - [train] Step 10600 out of 65536 | Loss --> 51.695 | Grad_l2 --> 2316.455 | Weights_l2 --> 7233.899 | Lr --> 0.020 | Seconds_per_step --> 3.318 |
212
+ [2024-09-03 03:01:20,350][Main][INFO] - [train] Step 10700 out of 65536 | Loss --> 19.189 | Grad_l2 --> 206.407 | Weights_l2 --> 7221.798 | Lr --> 0.020 | Seconds_per_step --> 3.312 |
213
+ [2024-09-03 03:06:52,743][Main][INFO] - [train] Step 10800 out of 65536 | Loss --> 6.908 | Grad_l2 --> 26.249 | Weights_l2 --> 7210.980 | Lr --> 0.020 | Seconds_per_step --> 3.324 |
214
+ [2024-09-03 03:12:23,733][Main][INFO] - [train] Step 10900 out of 65536 | Loss --> 42.736 | Grad_l2 --> 1292.659 | Weights_l2 --> 7206.464 | Lr --> 0.020 | Seconds_per_step --> 3.310 |
215
+ Traceback (most recent call last):
216
+ File "<frozen runpy>", line 198, in _run_module_as_main
217
+ File "<frozen runpy>", line 88, in _run_code
218
+ File "/workspace/nanoT5/nanoT5/main.py", line 92, in <module>
219
+ main()
220
+ File "/usr/local/lib/python3.11/dist-packages/hydra/main.py", line 94, in decorated_main
221
+ _run_hydra(
222
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 394, in _run_hydra
223
+ _run_app(
224
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 457, in _run_app
225
+ run_and_report(
226
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 220, in run_and_report
227
+ return func()
228
+ ^^^^^^
229
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/utils.py", line 458, in <lambda>
230
+ lambda: hydra.run(
231
+ ^^^^^^^^^^
232
+ File "/usr/local/lib/python3.11/dist-packages/hydra/_internal/hydra.py", line 119, in run
233
+ ret = run_job(
234
+ ^^^^^^^^
235
+ File "/usr/local/lib/python3.11/dist-packages/hydra/core/utils.py", line 186, in run_job
236
+ ret.return_value = task_function(task_cfg)
237
+ ^^^^^^^^^^^^^^^^^^^^^^^
238
+ File "/workspace/nanoT5/nanoT5/main.py", line 75, in main
239
+ train(
240
+ File "/workspace/nanoT5/nanoT5/utils/train_utils.py", line 197, in train
241
+ for batch_id, batch in enumerate(train_dataloader, start=1):
242
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/data_loader.py", line 685, in __iter__
243
+ batch = send_to_device(batch, self.state.device, non_blocking=self._non_blocking)
244
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
245
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py", line 183, in send_to_device
246
+ {
247
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py", line 184, in <dictcomp>
248
+ k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
249
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
250
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/utils/operations.py", line 155, in send_to_device
251
+ return tensor.to(device, non_blocking=non_blocking)
252
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
253
+ KeyboardInterrupt
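A note on the log above: the loss descends smoothly from 3.710 at step 4200 to 2.383 at step 10500, then blows up at step 10600 (Loss 51.695, Grad_l2 2316.455) just as the 10000-step warmup hands over to the peak learning rate of 0.02, and the process is finally stopped by hand (the KeyboardInterrupt). Since checkpoint-pt-10000 was written minutes before the divergence, the natural restart point is Accelerate's `load_state`. The sketch below is a self-contained stand-in, not the nanoT5 code: a toy Linear model demonstrates the same `save_state`/`load_state` round-trip that produced the model.safetensors/optimizer.bin files in the log.

```python
# Self-contained stand-in, not the nanoT5 code: a toy model demonstrating the
# accelerate save_state/load_state round-trip seen in the log above.
import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(8, 8)                      # stand-in for the T5 model
optimizer = torch.optim.AdamW(model.parameters())  # stand-in optimizer
model, optimizer = accelerator.prepare(model, optimizer)

accelerator.save_state("checkpoint-pt-demo")  # writes model.safetensors, optimizer.bin, ...
accelerator.load_state("checkpoint-pt-demo")  # restores weights, optimizer and RNG states
```

A real resume would instead point `load_state` at checkpoint-pt-10000 after preparing the original model, optimizer, scheduler, and dataloaders.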
checkpoints/wandb/run-20240902_170304-v43qltex/files/requirements.txt ADDED
@@ -0,0 +1,195 @@
+ GitPython==3.1.43
+ Jinja2==3.1.4
+ MarkupSafe==2.1.5
+ PyGObject==3.42.1
+ PyJWT==2.3.0
+ PyYAML==6.0.2
+ Pygments==2.18.0
+ SecretStorage==3.3.1
+ Send2Trash==1.8.3
+ absl-py==2.1.0
+ accelerate==0.33.0
+ aiohappyeyeballs==2.4.0
+ aiohttp==3.10.5
+ aiosignal==1.3.1
+ antlr4-python3-runtime==4.9.3
+ anyio==4.4.0
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ attrs==24.2.0
+ babel==2.16.0
+ beautifulsoup4==4.12.3
+ bleach==6.1.0
+ blinker==1.4
+ certifi==2024.7.4
+ cffi==1.17.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ comm==0.2.2
+ cryptography==3.4.8
+ datasets==2.21.0
+ dbus-python==1.2.18
+ debugpy==1.8.5
+ decorator==5.1.1
+ defusedxml==0.7.1
+ dill==0.3.8
+ distro==1.7.0
+ docker-pycreds==0.4.0
+ einops==0.8.0
+ entrypoints==0.4
+ evaluate==0.4.2
+ executing==2.0.1
+ fancycompleter==0.9.1
+ fastjsonschema==2.20.0
+ filelock==3.15.4
+ flash-attn==2.6.3
+ fqdn==1.5.1
+ frozenlist==1.4.1
+ fsspec==2024.6.1
+ gitdb==4.0.11
+ h11==0.14.0
+ httpcore==1.0.5
+ httplib2==0.20.2
+ httpx==0.27.0
+ huggingface-hub==0.24.6
+ hydra-core==1.3.2
+ idna==3.7
+ importlib-metadata==4.6.4
+ ipykernel==6.29.5
+ ipython-genutils==0.2.0
+ ipython==8.26.0
+ ipywidgets==8.1.3
+ isoduration==20.11.0
+ jedi==0.19.1
+ jeepney==0.7.1
+ joblib==1.4.2
+ json5==0.9.25
+ jsonpointer==3.0.0
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.23.0
+ jupyter-archive==3.4.0
+ jupyter-events==0.10.0
+ jupyter-highlight-selected-word==0.2.0
+ jupyter-lsp==2.2.5
+ jupyter_client==7.4.9
+ jupyter_contrib_core==0.4.2
+ jupyter_contrib_nbextensions==0.7.0
+ jupyter_core==5.7.2
+ jupyter_nbextensions_configurator==0.6.4
+ jupyter_server==2.14.2
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.2.4
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.11
+ keyring==23.5.0
+ launchpadlib==1.10.16
+ lazr.restfulclient==0.14.4
+ lazr.uri==1.0.6
+ liger-kernel==0.2.1
+ lxml==5.3.0
+ matplotlib-inline==0.1.7
+ mistune==3.0.2
+ more-itertools==8.10.0
+ mpmath==1.3.0
+ multidict==6.0.5
+ multiprocess==0.70.16
+ nbclassic==1.1.0
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.3
+ nltk==3.9.1
+ notebook==6.5.5
+ notebook_shim==0.2.4
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.6.20
+ nvidia-nvtx-cu12==12.1.105
+ oauthlib==3.2.0
+ omegaconf==2.3.0
+ overrides==7.7.0
+ packaging==24.1
+ pandas==2.2.2
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pdbpp==0.10.3
+ pexpect==4.9.0
+ pillow==10.4.0
+ pip==24.2
+ platformdirs==4.2.2
+ prometheus_client==0.20.0
+ prompt_toolkit==3.0.47
+ protobuf==3.20.3
+ psutil==6.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==17.0.0
+ pycparser==2.22
+ pynvml==11.5.3
+ pyparsing==2.4.7
+ pyrepl==0.9.0
+ python-apt==2.4.0+ubuntu3
+ python-dateutil==2.9.0.post0
+ python-json-logger==2.0.7
+ pytz==2024.1
+ pyzmq==24.0.1
+ referencing==0.35.1
+ regex==2024.7.24
+ requests==2.32.3
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rouge_score==0.1.2
+ rpds-py==0.20.0
+ safetensors==0.4.4
+ sentencepiece==0.2.0
+ sentry-sdk==2.13.0
+ setproctitle==1.3.3
+ setuptools==73.0.1
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ soupsieve==2.6
+ stack-data==0.6.3
+ sympy==1.13.2
+ terminado==0.18.1
+ tinycss2==1.3.0
+ tokenizers==0.19.1
+ torch==2.4.0
+ torchaudio==2.4.0
+ torchvision==0.19.0
+ tornado==6.4.1
+ tqdm==4.66.5
+ traitlets==5.14.3
+ transformers==4.44.2
+ triton==3.0.0
+ types-python-dateutil==2.9.0.20240821
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ uri-template==1.3.0
+ urllib3==2.2.2
+ wadllib==1.3.6
+ wandb==0.17.8
+ wcwidth==0.2.13
+ webcolors==24.8.0
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ wheel==0.44.0
+ widgetsnbextension==4.0.11
+ wmctrl==0.5
+ xxhash==3.5.0
+ yarl==1.9.7
+ zipp==1.0.0
checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-metadata.json ADDED
@@ -0,0 +1,527 @@
+ {
+ "os": "Linux-5.4.0-171-generic-x86_64-with-glibc2.35",
+ "python": "3.11.9",
+ "heartbeatAt": "2024-09-02T17:03:05.625568",
+ "startedAt": "2024-09-02T17:03:04.729800",
+ "docker": null,
+ "cuda": null,
+ "args": [],
+ "state": "running",
+ "program": "-m nanoT5.main",
+ "codePathLocal": null,
+ "git": {
+ "remote": "https://github.com/pszemraj/nanoT5.git",
+ "commit": "7e55b4db2270303afebba4e0d389b68979943c0c"
+ },
+ "email": null,
+ "root": "/workspace/nanoT5",
+ "host": "f8d7d6f6310f",
+ "username": "root",
+ "executable": "/usr/bin/python",
+ "cpu_count": 48,
+ "cpu_count_logical": 96,
+ "cpu_freq": {
+ "current": 1001.9331562499998,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.402,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.398,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 1037.385,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 872.196,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 1072.759,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 882.854,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 823.861,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 823.212,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.39,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 823.065,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 1398.392,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.401,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 824.357,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2022.695,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3300.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 823.663,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 804.641,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 873.253,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 872.742,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.845,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.422,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.957,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 1100.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3351.928,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.426,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 870.565,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.192,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.057,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2799.997,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 959.262,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2801.291,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.425,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.683,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.697,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.876,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.741,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 942.364,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.344,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.272,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3300.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3304.817,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.103,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.363,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.727,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 811.831,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.938,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 801.226,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.947,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 900.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 980.682,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 3308.926,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 801.074,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.709,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 804.122,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 801.051,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 805.622,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 799.951,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 800.0,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 2802.488,
+ "min": 800.0,
+ "max": 2801.0
+ },
+ {
+ "current": 801.049,
+ "min": 800.0,
+ "max": 2801.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 200.0,
+ "used": 1.4021186828613281
+ }
+ },
+ "gpu": "NVIDIA A40",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A40",
+ "memory_total": 48305799168
+ }
+ ],
+ "memory": {
+ "total": 503.5313262939453
+ }
+ }
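The hardware snapshot above (cpu_count, per-core cpu_freq, memory, disk) is the kind of data psutil exposes. A rough sketch that mirrors those fields, not wandb's actual collector:

```python
# Rough sketch mirroring the metadata fields above with psutil; wandb's own
# collection code differs, this only reproduces the same values.
import psutil

print(psutil.cpu_count(logical=False))        # "cpu_count": 48
print(psutil.cpu_count(logical=True))         # "cpu_count_logical": 96
print(psutil.cpu_freq(percpu=True)[0])        # one "cpu_freq_per_core" entry (MHz)
print(psutil.virtual_memory().total / 2**30)  # "memory": {"total": ...} in GiB
```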
checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"train/loss": 42.73570467710495, "train/grad_l2": 1292.6591796875, "train/weights_l2": 7206.464049045997, "train/lr": 0.019987049260593165, "train/seconds_per_step": 3.309874153137207, "_timestamp": 1725333143.7322862, "_runtime": 36558.970363140106, "_step": 10900, "_wandb": {"runtime": 36838}}
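The summary freezes the run's final logged step, i.e. the diverged state at step 10900 rather than the best one. A minimal sketch for reading it back (the relative path assumes the repo layout shown in this commit):

```python
# Minimal sketch: read the final run summary shown above.
import json

path = "checkpoints/wandb/run-20240902_170304-v43qltex/files/wandb-summary.json"
with open(path) as f:
    summary = json.load(f)

# Prints: 10900 42.73570467710495 1292.6591796875
print(summary["_step"], summary["train/loss"], summary["train/grad_l2"])
```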
checkpoints/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/wandb/run-20240902_170304-v43qltex/logs/debug.log ADDED
@@ -0,0 +1,28 @@
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Current SDK version is 0.17.8
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Configure stats pid to 6499
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /root/.config/wandb/settings
+ 2024-09-02 17:03:04,741 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/settings
+ 2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Loading settings from environment variables: {}
+ 2024-09-02 17:03:04,742 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying setup settings: {'_disable_service': False}
+ 2024-09-02 17:03:04,742 WARNING MainThread:6499 [wandb_setup.py:_flush():77] Could not find program at -m nanoT5.main
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m nanoT5.main'}
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_setup.py:_flush():77] Applying login settings: {}
+ 2024-09-02 17:03:04,743 INFO MainThread:6499 [wandb_init.py:_log_setup():524] Logging user logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug.log
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:_log_setup():525] Logging internal logs to /workspace/nanoT5/logs/2024-09-02/17-03-02/wandb/run-20240902_170304-v43qltex/logs/debug-internal.log
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():607] calling init triggers
+ 2024-09-02 17:03:04,744 INFO MainThread:6499 [wandb_init.py:init():614] wandb.init called with sweep_config: {}
+ config: {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02'}
+ 2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():657] starting backend
+ 2024-09-02 17:03:04,745 INFO MainThread:6499 [wandb_init.py:init():661] setting up manager
+ 2024-09-02 17:03:04,760 INFO MainThread:6499 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-09-02 17:03:04,761 INFO MainThread:6499 [wandb_init.py:init():669] backend started and connected
+ 2024-09-02 17:03:04,776 INFO MainThread:6499 [wandb_init.py:init():767] updated telemetry
+ 2024-09-02 17:03:04,819 INFO MainThread:6499 [wandb_init.py:init():800] communicating run to backend with 90.0 second timeout
+ 2024-09-02 17:03:05,519 INFO MainThread:6499 [wandb_init.py:init():851] starting run threads in backend
+ 2024-09-02 17:03:05,817 INFO MainThread:6499 [wandb_run.py:_console_start():2463] atexit reg
+ 2024-09-02 17:03:05,818 INFO MainThread:6499 [wandb_run.py:_redirect():2309] redirect: wrap_raw
+ 2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2374] Wrapping output streams.
+ 2024-09-02 17:03:05,819 INFO MainThread:6499 [wandb_run.py:_redirect():2399] Redirects installed.
+ 2024-09-02 17:03:05,822 INFO MainThread:6499 [wandb_init.py:init():894] run started, returning control to user process
+ 2024-09-02 17:03:35,512 INFO MainThread:6499 [wandb_run.py:_config_callback():1392] config_cb None None {'mode': 'pt', 'device': 'gpu', 'precision': 'bf16', 'eval_only': False, 'predict_only': False, 'seed': 2137, 'tokenizer': {'name': 'BEE-spoke-data/slimpajama_tok-48128-BPE-forT5'}, 'model': {'klass': 'custom_seq2seq', 'name': 'google/t5-v1_1-base', 'overwrite': None, 'add_config': None, 'checkpoint_path': '', 'random_init': True, 'compile': True}, 'data': {'input_length': 1024, 'mlm_probability': 0.15, 'mean_noise_span_length': 3.0, 'num_workers': 8, 'before_mask_input_length': 1137, 'target_length': 229}, 'optim': {'name': 'adamwscale', 'base_lr': 0.02, 'batch_size': 64, 'total_steps': 65536, 'epochs': -1, 'warmup_steps': 10000, 'lr_scheduler': 'cosine', 'weight_decay': 0.001, 'grad_clip': 1.0, 'grad_acc': 4, 'final_cosine': 1e-05}, 'eval': {'every_steps': 100000, 'steps': 500, 'corrected_steps': 500}, 'checkpoint': {'every_steps': 5000}, 'logging': {'every_steps': 100, 'grad_l2': True, 'weights_l2': True, 'use_wandb': True, 'wandb_config': {'project': 'nano-custom-seq2seq', 'entity': 'amazingvince', 'tags': ['nanoT5', 'my_tag'], 'mode': 'online'}}, 'slurm_id': 'none', 'working_dir': '/workspace/nanoT5/logs/2024-09-02/17-03-02', 'n_all_param': 673076736}
+ 2024-09-03 03:17:10,763 WARNING MsgRouterThr:6499 [router.py:message_loop():77] message_loop has been closed
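The debug log records the effective wandb settings and run config. A hedged reconstruction of the equivalent wandb.init call, not the nanoT5 source: project/entity/tags/mode come from the logged wandb_config, and the config dict is deliberately truncated to a few of the logged keys.

```python
# Hedged reconstruction of the init call the debug log implies; values are
# taken from the logged config above, everything else is an assumption.
import wandb

run = wandb.init(
    project="nano-custom-seq2seq",
    entity="amazingvince",
    tags=["nanoT5", "my_tag"],
    mode="online",
    config={"seed": 2137, "optim": {"base_lr": 0.02, "total_steps": 65536}},
)
run.finish()
```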
checkpoints/wandb/run-20240902_170304-v43qltex/run-v43qltex.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c992f1fa0f2152d9b2336fede3ebcc8b26d60d54a1945da3d2cecab44ce3ab70
+ size 4157001
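The .wandb file itself is stored as the Git LFS pointer above (oid and size, not the bytes). A short sketch for verifying a downloaded copy against the pointer; the local filename is an assumption, while oid and size come from the pointer file:

```python
# Sketch: verify a downloaded copy of run-v43qltex.wandb against the LFS
# pointer above; oid and size come from the pointer, the path is assumed.
import hashlib
from pathlib import Path

blob = Path("run-v43qltex.wandb").read_bytes()
assert len(blob) == 4157001, "size mismatch"
assert hashlib.sha256(blob).hexdigest() == (
    "c992f1fa0f2152d9b2336fede3ebcc8b26d60d54a1945da3d2cecab44ce3ab70"
), "sha256 mismatch"
print("LFS object verified")
```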