koichi12 committed on
Commit 09e5c81 · verified · 1 Parent(s): d3983b6

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. wandb/run-20240804_021032-cd2cg2ui/files/output.log +11 -0
  2. wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json +215 -0
  3. wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json +1 -0
  4. wandb/run-20240804_021444-pk5j08lr/files/config.yaml +335 -0
  5. wandb/run-20240804_021444-pk5j08lr/files/output.log +103 -0
  6. wandb/run-20240804_021444-pk5j08lr/files/requirements.txt +271 -0
  7. wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json +215 -0
  8. wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json +1 -0
  9. wandb/run-20240804_021444-pk5j08lr/logs/debug-internal.log +191 -0
  10. wandb/run-20240804_021444-pk5j08lr/logs/debug.log +30 -0
  11. wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb +0 -0
  12. wandb/run-20240804_144007-dds6qqbt/files/config.yaml +335 -0
  13. wandb/run-20240804_144007-dds6qqbt/files/output.log +135 -0
  14. wandb/run-20240804_144007-dds6qqbt/files/requirements.txt +271 -0
  15. wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json +215 -0
  16. wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json +1 -0
  17. wandb/run-20240804_144007-dds6qqbt/logs/debug-internal.log +186 -0
  18. wandb/run-20240804_144007-dds6qqbt/logs/debug.log +30 -0
  19. wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb +0 -0
  20. wandb/run-20240804_222226-kh5katc1/files/config.yaml +335 -0
  21. wandb/run-20240804_222226-kh5katc1/files/output.log +468 -0
  22. wandb/run-20240804_222226-kh5katc1/files/requirements.txt +271 -0
  23. wandb/run-20240804_222226-kh5katc1/files/wandb-metadata.json +215 -0
  24. wandb/run-20240804_222226-kh5katc1/files/wandb-summary.json +1 -0
  25. wandb/run-20240804_222226-kh5katc1/logs/debug-internal.log +0 -0
  26. wandb/run-20240804_222226-kh5katc1/logs/debug.log +30 -0
  27. wandb/run-20240812_063447-whqmtxyq/files/config.yaml +335 -0
  28. wandb/run-20240812_063447-whqmtxyq/files/output.log +144 -0
  29. wandb/run-20240812_063447-whqmtxyq/files/requirements.txt +271 -0
  30. wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json +215 -0
  31. wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json +1 -0
  32. wandb/run-20240812_063447-whqmtxyq/logs/debug-internal.log +359 -0
  33. wandb/run-20240812_063447-whqmtxyq/logs/debug.log +30 -0
  34. wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb +0 -0
  35. wandb/run-20240815_031216-0szn78ph/files/config.yaml +335 -0
  36. wandb/run-20240815_031216-0szn78ph/files/output.log +92 -0
  37. wandb/run-20240815_031216-0szn78ph/files/requirements.txt +293 -0
  38. wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json +215 -0
  39. wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json +1 -0
  40. wandb/run-20240815_031216-0szn78ph/logs/debug-internal.log +260 -0
  41. wandb/run-20240815_031216-0szn78ph/logs/debug.log +29 -0
  42. wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb +0 -0
  43. wandb/run-20240823_162543-eroprw00/files/config.yaml +342 -0
  44. wandb/run-20240823_162543-eroprw00/files/output.log +116 -0
  45. wandb/run-20240823_162543-eroprw00/files/requirements.txt +375 -0
  46. wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json +220 -0
  47. wandb/run-20240823_162543-eroprw00/files/wandb-summary.json +1 -0
  48. wandb/run-20240823_162543-eroprw00/logs/debug-internal.log +188 -0
  49. wandb/run-20240823_162543-eroprw00/logs/debug.log +30 -0
  50. wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb +0 -0
wandb/run-20240804_021032-cd2cg2ui/files/output.log ADDED
@@ -0,0 +1,11 @@
+ Created Hugging Face repository with ID koichi12/tiny-mistral-sample.
+ Clearing GPU cache for all ranks
+ --> Running with torch torch_distributed debug set to detail
+ Traceback (most recent call last):
+ File "/project/examples/finetuning.py", line 13, in <module>
+ main()
+ File "/project/src/llama_recipes/finetuning.py", line 103, in main
+ model = get_model(
+ File "/project/src/llama_recipes/get_models.py", line 71, in get_model
+ assert sliding_window == 4096
+ AssertionError
wandb/run-20240804_021032-cd2cg2ui/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
+ {
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
+ "python": "3.10.12",
+ "heartbeatAt": "2024-08-03T17:10:33.458421",
+ "startedAt": "2024-08-03T17:10:32.395506",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--seq-length",
+ "1024",
+ "--sliding-window-size",
+ "8192",
+ "--micro-batch-size",
+ "8",
+ "--global-batch-size",
+ "320",
+ "--train-iters",
+ "20000",
+ "--tokenizer-type",
+ "Llama2Tokenizer",
+ "--tokenizer-model",
+ "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
+ "--train-data-path",
+ "4013541",
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+ "--valid-data-path",
+ "4013541",
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+ "--test-data-path",
+ "4013541",
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
+ "--lr",
+ "2e-5",
+ "--min-lr",
+ "1e-6",
+ "--lr-decay-style",
+ "cosine",
+ "--lr-warmup-iters",
+ "500",
+ "--lr-decay-iters",
+ "20000",
+ "--weight-decay",
+ "0.1",
+ "--grad-clip-norm",
+ "1.0",
+ "--optimizer",
+ "adam",
+ "--adam-beta1",
+ "0.9",
+ "--adam-beta2",
+ "0.95",
+ "--adam-eps",
+ "1e-6",
+ "--save-interval",
+ "200",
+ "--eval-interval",
+ "200",
+ "--eval-iters",
+ "10",
+ "--bf16",
+ "--mixed-precision",
+ "--base-model",
+ "/share/pretrained_lm/custom/tiny-mistral",
+ "--save",
+ "/work/llm_recipes/models/tiny-mistral-sample",
+ "--load",
+ "/work/llm_recipes/models/tiny-mistral-sample",
+ "--fsdp-activation-checkpointing",
+ "--sharding-strategy",
+ "FULL_SHARD",
+ "--checkpoint-type",
+ "LOCAL_STATE_DICT",
+ "--save-n-checkpoints",
+ "10",
+ "--hf-upload-retry-limit",
+ "2",
+ "--hf-repo-id",
+ "koichi12/tiny-mistral-sample",
+ "--wandb-entity",
+ "iwakawa-koichi-q5-tohoku-nlp6723",
+ "--wandb-project",
+ "llm_tutorial",
+ "--wandb-name",
+ "tiny-mistral-sample_train_2024-08-04-02:10:14"
+ ],
+ "state": "running",
+ "program": "/project/examples/finetuning.py",
+ "codePathLocal": "examples/finetuning.py",
+ "codePath": "examples/finetuning.py",
+ "git": {
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
+ },
+ "email": null,
+ "root": "/project",
+ "host": "gpu-koiwa-00",
+ "username": "koiwa",
+ "executable": "/usr/bin/python",
+ "cpu_count": 18,
+ "cpu_count_logical": 18,
+ "cpu_freq": {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2400.034,
+ "min": 0.0,
+ "max": 0.0
+ }
+ ],
+ "disk": {
+ "/": {
+ "total": 0.0625,
+ "used": 1.1444091796875e-05
+ }
+ },
+ "gpu": "NVIDIA A100-SXM4-40GB",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A100-SXM4-40GB",
+ "memory_total": 42949672960
+ }
+ ],
+ "memory": {
+ "total": 56.48782730102539
+ }
+ }
wandb/run-20240804_021032-cd2cg2ui/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 0}}
wandb/run-20240804_021444-pk5j08lr/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 1024
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-mistral-sample_train_2024-08-04-02:14:34
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-mistral-sample
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-mistral-sample
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/custom/tiny-mistral
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-mistral-sample
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32768
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722705284.714592
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 256
321
+ model_type:
322
+ desc: null
323
+ value: mistral
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 1024
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 4
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 4
333
+ model_architecture:
334
+ desc: null
335
+ value: MistralForCausalLM
wandb/run-20240804_021444-pk5j08lr/files/output.log ADDED
@@ -0,0 +1,103 @@
+ Created Hugging Face repository with ID koichi12/tiny-mistral-sample.
+ Clearing GPU cache for all ranks
+ --> Running with torch torch_distributed debug set to detail
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
+ Loading model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
+ Loaded model state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
+ --> Model /share/pretrained_lm/custom/tiny-mistral
+ --> /share/pretrained_lm/custom/tiny-mistral has 19.925248 Million params
+ BFloat16 enabled for mixed precision - using bfSixteen policy
+ --> applying fsdp activation checkpointing...
+ > datasets target sizes (minimum size):
+ train: 6400000
+ validation: 323200
+ test: 3200
+ > building train, validation, and test datasets for GPT ...
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
+ warnings.warn(
+ Let split = None
+ > finished creating GPT datasets ...
+ Loading optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
+ Loaded optimizer state dict from /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
+ model info: FullyShardedDataParallel(
+ (_fsdp_wrapped_module): MistralForCausalLM(
+ (model): MistralModel(
+ (embed_tokens): Embedding(32768, 256)
+ (layers): ModuleList(
+ (0-3): 4 x FullyShardedDataParallel(
+ (_fsdp_wrapped_module): CheckpointWrapper(
+ (_checkpoint_wrapped_module): MistralDecoderLayer(
+ (self_attn): MistralFlashAttention2(
+ (q_proj): Linear(in_features=256, out_features=512, bias=False)
+ (k_proj): Linear(in_features=256, out_features=256, bias=False)
+ (v_proj): Linear(in_features=256, out_features=256, bias=False)
+ (o_proj): Linear(in_features=512, out_features=256, bias=False)
+ (rotary_emb): MistralRotaryEmbedding()
+ )
+ (mlp): MistralMLP(
+ (gate_proj): Linear(in_features=256, out_features=512, bias=False)
+ (up_proj): Linear(in_features=256, out_features=512, bias=False)
+ (down_proj): Linear(in_features=512, out_features=256, bias=False)
+ (act_fn): SiLU()
+ )
+ (input_layernorm): MistralRMSNorm()
+ (post_attention_layernorm): MistralRMSNorm()
+ )
+ )
+ )
+ )
+ (norm): MistralRMSNorm()
+ )
+ (lm_head): Linear(in_features=256, out_features=32768, bias=False)
+ )
+ )
+ model config: MistralConfig {
+ "_name_or_path": "/share/pretrained_lm/custom/tiny-mistral",
+ "architectures": [
+ "MistralForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 256,
+ "initializer_range": 0.02,
+ "intermediate_size": 512,
+ "label_smoothing": 0.0,
+ "max_position_embeddings": 1024,
+ "model_type": "mistral",
+ "num_attention_heads": 4,
+ "num_hidden_layers": 4,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 1000000.0,
+ "sliding_window": 4096,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.43.3",
+ "use_cache": false,
+ "vocab_size": 32768
+ }
+ Saving checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000
+ Saving model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
+ Saved model state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/model.pt
+ Saving optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
+ Saved optimizer state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/optimizer.pt
+ Saving scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
+ Saved scheduler state dict to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/scheduler.pt
+ Saving RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
+ Saved RNG states to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000/rng.pt
+ Saved checkpoint to /work/llm_recipes/models/tiny-mistral-sample/iter_0020000, took 0.17s
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ Building a BlendedDataset for a single MegatronDataset
+ Unable to save the indexes because path_to_cache is None
+ [rank0]:[2024-08-04 02:14:50,842] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
+ warnings.warn(
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
+ warnings.warn(
+ [rank0]:[2024-08-04 02:14:50,959] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.0010300300018570852, 'preprocessing_with_comm': 0.0005270100009511225, 'state_converting': 0.021121047997439746, <Type.ALL: 'all'>: 0.022993901999143418})
wandb/run-20240804_021444-pk5j08lr/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-03T17:14:45.302596",
5
+ "startedAt": "2024-08-03T17:14:44.702200",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "1024",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/custom/tiny-mistral",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-mistral-sample",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-mistral-sample",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-mistral-sample",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-mistral-sample_train_2024-08-04-02:14:34"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.034,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.034,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.034,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.034,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.034,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.034,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.034,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.034,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.034,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.034,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.034,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.034,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.034,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.034,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.034,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.034,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.034,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.034,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.034,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 5}}
wandb/run-20240804_021444-pk5j08lr/logs/debug-internal.log ADDED
@@ -0,0 +1,191 @@
1
+ 2024-08-04 02:14:44,716 INFO StreamThr :11553 [internal.py:wandb_internal():86] W&B internal server running at pid: 11553, started at: 2024-08-04 02:14:44.715209
2
+ 2024-08-04 02:14:44,717 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 02:14:44,719 INFO WriterThread:11553 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb
4
+ 2024-08-04 02:14:44,720 DEBUG SenderThread:11553 [sender.py:send():382] send: header
5
+ 2024-08-04 02:14:44,733 DEBUG SenderThread:11553 [sender.py:send():382] send: run
6
+ 2024-08-04 02:14:45,190 INFO SenderThread:11553 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_021444-pk5j08lr/files
7
+ 2024-08-04 02:14:45,190 INFO SenderThread:11553 [sender.py:_start_run_threads():1136] run started: pk5j08lr with start time 1722705284.714592
8
+ 2024-08-04 02:14:45,195 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 02:14:45,195 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 02:14:45,280 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 02:14:45,286 DEBUG HandlerThread:11553 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 02:14:45,286 DEBUG HandlerThread:11553 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 02:14:45,286 INFO HandlerThread:11553 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 02:14:45,286 INFO SystemMonitor:11553 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 02:14:45,287 INFO HandlerThread:11553 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 02:14:45,287 INFO SystemMonitor:11553 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 02:14:45,288 INFO SystemMonitor:11553 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 02:14:45,289 INFO SystemMonitor:11553 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 02:14:45,289 INFO SystemMonitor:11553 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 02:14:45,290 INFO SystemMonitor:11553 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 02:14:45,302 DEBUG HandlerThread:11553 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 02:14:45,304 DEBUG HandlerThread:11553 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 02:14:45,315 DEBUG HandlerThread:11553 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 02:14:45,315 DEBUG HandlerThread:11553 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 02:14:45,315 DEBUG HandlerThread:11553 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-03T17:14:45.302596', 'startedAt': '2024-08-03T17:14:44.702200', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1024', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/custom/tiny-mistral', '--save', '/work/llm_recipes/models/tiny-mistral-sample', '--load', '/work/llm_recipes/models/tiny-mistral-sample', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-mistral-sample', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-mistral-sample_train_2024-08-04-02:14:34'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.034, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}, {'current': 2400.034, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 
'memory': {'total': 56.48782730102539}}
26
+ 2024-08-04 02:14:45,315 INFO HandlerThread:11553 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 02:14:45,316 INFO HandlerThread:11553 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 02:14:45,317 INFO HandlerThread:11553 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 02:14:45,323 DEBUG SenderThread:11553 [sender.py:send():382] send: files
30
+ 2024-08-04 02:14:45,323 INFO SenderThread:11553 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 02:14:45,332 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 02:14:45,332 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 02:14:45,332 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-04 02:14:45,333 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-04 02:14:45,334 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 02:14:45,580 DEBUG SenderThread:11553 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 02:14:46,067 INFO wandb-upload_0:11553 [upload_job.py:push():131] Uploaded file /tmp/tmp8oqwu4dewandb/gzg3ga4a-wandb-metadata.json
38
+ 2024-08-04 02:14:46,191 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json
39
+ 2024-08-04 02:14:46,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
40
+ 2024-08-04 02:14:46,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/requirements.txt
41
+ 2024-08-04 02:14:48,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
42
+ 2024-08-04 02:14:49,192 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
43
+ 2024-08-04 02:14:50,179 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-04 02:14:50,193 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
45
+ 2024-08-04 02:14:50,882 DEBUG SenderThread:11553 [sender.py:send():382] send: config
46
+ 2024-08-04 02:14:50,882 DEBUG SenderThread:11553 [sender.py:send():382] send: config
47
+ 2024-08-04 02:14:51,067 DEBUG SenderThread:11553 [sender.py:send():382] send: exit
48
+ 2024-08-04 02:14:51,067 INFO SenderThread:11553 [sender.py:send_exit():589] handling exit code: 0
49
+ 2024-08-04 02:14:51,067 INFO SenderThread:11553 [sender.py:send_exit():591] handling runtime: 5
50
+ 2024-08-04 02:14:51,068 INFO SenderThread:11553 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
51
+ 2024-08-04 02:14:51,068 INFO SenderThread:11553 [sender.py:send_exit():597] send defer
52
+ 2024-08-04 02:14:51,068 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
53
+ 2024-08-04 02:14:51,068 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 0
54
+ 2024-08-04 02:14:51,069 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
55
+ 2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 0
56
+ 2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 1
57
+ 2024-08-04 02:14:51,069 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
58
+ 2024-08-04 02:14:51,069 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 1
59
+ 2024-08-04 02:14:51,069 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
60
+ 2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 1
61
+ 2024-08-04 02:14:51,069 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 2
62
+ 2024-08-04 02:14:51,069 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
63
+ 2024-08-04 02:14:51,069 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 2
64
+ 2024-08-04 02:14:51,069 INFO HandlerThread:11553 [system_monitor.py:finish():203] Stopping system monitor
65
+ 2024-08-04 02:14:51,069 DEBUG SystemMonitor:11553 [system_monitor.py:_start():172] Starting system metrics aggregation loop
66
+ 2024-08-04 02:14:51,070 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined cpu monitor
67
+ 2024-08-04 02:14:51,070 DEBUG SystemMonitor:11553 [system_monitor.py:_start():179] Finished system metrics aggregation loop
68
+ 2024-08-04 02:14:51,070 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined disk monitor
69
+ 2024-08-04 02:14:51,070 DEBUG SystemMonitor:11553 [system_monitor.py:_start():183] Publishing last batch of metrics
70
+ 2024-08-04 02:14:51,103 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined gpu monitor
71
+ 2024-08-04 02:14:51,103 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined memory monitor
72
+ 2024-08-04 02:14:51,103 INFO HandlerThread:11553 [interfaces.py:finish():202] Joined network monitor
73
+ 2024-08-04 02:14:51,104 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
74
+ 2024-08-04 02:14:51,104 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 2
75
+ 2024-08-04 02:14:51,104 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 3
76
+ 2024-08-04 02:14:51,104 DEBUG SenderThread:11553 [sender.py:send():382] send: stats
77
+ 2024-08-04 02:14:51,104 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
78
+ 2024-08-04 02:14:51,104 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 3
79
+ 2024-08-04 02:14:51,104 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
80
+ 2024-08-04 02:14:51,104 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 3
81
+ 2024-08-04 02:14:51,105 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 4
82
+ 2024-08-04 02:14:51,105 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
83
+ 2024-08-04 02:14:51,105 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 4
84
+ 2024-08-04 02:14:51,105 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
85
+ 2024-08-04 02:14:51,105 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 4
86
+ 2024-08-04 02:14:51,105 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 5
87
+ 2024-08-04 02:14:51,105 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
88
+ 2024-08-04 02:14:51,105 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 5
89
+ 2024-08-04 02:14:51,105 DEBUG SenderThread:11553 [sender.py:send():382] send: summary
90
+ 2024-08-04 02:14:51,106 INFO SenderThread:11553 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
91
+ 2024-08-04 02:14:51,106 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
92
+ 2024-08-04 02:14:51,106 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 5
93
+ 2024-08-04 02:14:51,106 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 6
94
+ 2024-08-04 02:14:51,106 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
95
+ 2024-08-04 02:14:51,106 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 6
96
+ 2024-08-04 02:14:51,107 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
97
+ 2024-08-04 02:14:51,107 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 6
98
+ 2024-08-04 02:14:51,109 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: status_report
99
+ 2024-08-04 02:14:51,194 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
100
+ 2024-08-04 02:14:51,194 INFO Thread-12 :11553 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json
101
+ 2024-08-04 02:14:51,396 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 7
102
+ 2024-08-04 02:14:51,396 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
103
+ 2024-08-04 02:14:51,396 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 7
104
+ 2024-08-04 02:14:51,396 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
105
+ 2024-08-04 02:14:51,396 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 7
106
+ 2024-08-04 02:14:52,066 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
107
+ 2024-08-04 02:14:52,195 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/config.yaml
108
+ 2024-08-04 02:14:52,195 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
109
+ 2024-08-04 02:14:52,692 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 8
110
+ 2024-08-04 02:14:52,692 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
111
+ 2024-08-04 02:14:52,692 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
112
+ 2024-08-04 02:14:52,692 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 8
113
+ 2024-08-04 02:14:52,692 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
114
+ 2024-08-04 02:14:52,693 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 8
115
+ 2024-08-04 02:14:52,693 INFO SenderThread:11553 [job_builder.py:build():296] Attempting to build job artifact
116
+ 2024-08-04 02:14:52,693 INFO SenderThread:11553 [job_builder.py:_get_source_type():426] is repo sourced job
117
+ 2024-08-04 02:14:52,707 INFO SenderThread:11553 [job_builder.py:build():402] adding wandb-job metadata file
118
+ 2024-08-04 02:14:52,715 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 9
119
+ 2024-08-04 02:14:52,716 DEBUG SenderThread:11553 [sender.py:send():382] send: artifact
120
+ 2024-08-04 02:14:52,716 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
121
+ 2024-08-04 02:14:52,717 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 9
122
+ 2024-08-04 02:14:53,067 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
123
+ 2024-08-04 02:14:53,195 INFO Thread-12 :11553 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
124
+ 2024-08-04 02:14:53,655 INFO SenderThread:11553 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
125
+ 2024-08-04 02:14:53,655 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
126
+ 2024-08-04 02:14:53,655 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 9
127
+ 2024-08-04 02:14:53,655 INFO SenderThread:11553 [dir_watcher.py:finish():358] shutting down directory watcher
128
+ 2024-08-04 02:14:54,196 INFO SenderThread:11553 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_021444-pk5j08lr/files
129
+ 2024-08-04 02:14:54,197 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/requirements.txt requirements.txt
130
+ 2024-08-04 02:14:54,197 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/config.yaml config.yaml
131
+ 2024-08-04 02:14:54,198 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-metadata.json wandb-metadata.json
132
+ 2024-08-04 02:14:54,198 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json wandb-summary.json
133
+ 2024-08-04 02:14:54,200 INFO SenderThread:11553 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_021444-pk5j08lr/files/output.log output.log
134
+ 2024-08-04 02:14:54,200 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 10
135
+ 2024-08-04 02:14:54,202 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
136
+ 2024-08-04 02:14:54,202 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
137
+ 2024-08-04 02:14:54,205 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 10
138
+ 2024-08-04 02:14:54,206 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
139
+ 2024-08-04 02:14:54,206 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 10
140
+ 2024-08-04 02:14:54,206 INFO SenderThread:11553 [file_pusher.py:finish():172] shutting down file pusher
141
+ 2024-08-04 02:14:54,605 INFO wandb-upload_1:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/config.yaml
142
+ 2024-08-04 02:14:54,711 INFO wandb-upload_0:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/requirements.txt
143
+ 2024-08-04 02:14:54,762 INFO wandb-upload_2:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/wandb-summary.json
144
+ 2024-08-04 02:14:54,792 INFO wandb-upload_3:11553 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_021444-pk5j08lr/files/output.log
145
+ 2024-08-04 02:14:54,992 INFO Thread-11 (_thread_body):11553 [sender.py:transition_state():617] send defer: 11
146
+ 2024-08-04 02:14:54,992 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
147
+ 2024-08-04 02:14:54,992 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 11
148
+ 2024-08-04 02:14:54,992 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
149
+ 2024-08-04 02:14:54,992 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 11
150
+ 2024-08-04 02:14:54,993 INFO SenderThread:11553 [file_pusher.py:join():178] waiting for file pusher
151
+ 2024-08-04 02:14:54,993 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 12
152
+ 2024-08-04 02:14:54,993 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
153
+ 2024-08-04 02:14:54,993 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 12
154
+ 2024-08-04 02:14:54,993 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
155
+ 2024-08-04 02:14:54,993 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 12
156
+ 2024-08-04 02:14:54,993 INFO SenderThread:11553 [file_stream.py:finish():595] file stream finish called
157
+ 2024-08-04 02:14:55,067 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
158
+ 2024-08-04 02:14:55,176 INFO SenderThread:11553 [file_stream.py:finish():599] file stream finish is done
159
+ 2024-08-04 02:14:55,176 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 13
160
+ 2024-08-04 02:14:55,176 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
161
+ 2024-08-04 02:14:55,176 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-04 02:14:55,177 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 13
163
+ 2024-08-04 02:14:55,177 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
164
+ 2024-08-04 02:14:55,177 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 13
165
+ 2024-08-04 02:14:55,177 INFO SenderThread:11553 [sender.py:transition_state():617] send defer: 14
166
+ 2024-08-04 02:14:55,177 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: defer
167
+ 2024-08-04 02:14:55,177 DEBUG SenderThread:11553 [sender.py:send():382] send: final
168
+ 2024-08-04 02:14:55,177 INFO HandlerThread:11553 [handler.py:handle_request_defer():172] handle defer: 14
169
+ 2024-08-04 02:14:55,177 DEBUG SenderThread:11553 [sender.py:send():382] send: footer
170
+ 2024-08-04 02:14:55,178 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: defer
171
+ 2024-08-04 02:14:55,178 INFO SenderThread:11553 [sender.py:send_request_defer():613] handle sender defer: 14
172
+ 2024-08-04 02:14:55,178 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
173
+ 2024-08-04 02:14:55,178 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
174
+ 2024-08-04 02:14:55,178 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: poll_exit
175
+ 2024-08-04 02:14:55,179 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: poll_exit
176
+ 2024-08-04 02:14:55,179 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: server_info
177
+ 2024-08-04 02:14:55,179 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: server_info
178
+ 2024-08-04 02:14:55,180 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: get_summary
179
+ 2024-08-04 02:14:55,181 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: sampled_history
180
+ 2024-08-04 02:14:55,181 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: internal_messages
181
+ 2024-08-04 02:14:55,181 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: job_info
182
+ 2024-08-04 02:14:55,346 DEBUG SenderThread:11553 [sender.py:send_request():409] send_request: job_info
183
+ 2024-08-04 02:14:55,346 INFO MainThread:11553 [wandb_run.py:_footer_history_summary_info():3866] rendering history
184
+ 2024-08-04 02:14:55,346 INFO MainThread:11553 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
185
+ 2024-08-04 02:14:55,346 INFO MainThread:11553 [wandb_run.py:_footer_sync_info():3825] logging synced files
186
+ 2024-08-04 02:14:55,346 DEBUG HandlerThread:11553 [handler.py:handle_request():146] handle_request: shutdown
187
+ 2024-08-04 02:14:55,346 INFO HandlerThread:11553 [handler.py:finish():869] shutting down handler
188
+ 2024-08-04 02:14:56,181 INFO WriterThread:11553 [datastore.py:close():296] close: /project/wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb
189
+ 2024-08-04 02:14:56,346 INFO SenderThread:11553 [sender.py:finish():1572] shutting down sender
190
+ 2024-08-04 02:14:56,346 INFO SenderThread:11553 [file_pusher.py:finish():172] shutting down file pusher
191
+ 2024-08-04 02:14:56,346 INFO SenderThread:11553 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_021444-pk5j08lr/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Configure stats pid to 11482
3
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tuny llama sample'}
6
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_021444-pk5j08lr/logs/debug.log
9
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_021444-pk5j08lr/logs/debug-internal.log
10
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1024, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/custom/tiny-mistral/tokenizer.model.v3', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-mistral-sample_train_2024-08-04-02:14:34', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-mistral-sample', 'save': '/work/llm_recipes/models/tiny-mistral-sample', 'base_model': '/share/pretrained_lm/custom/tiny-mistral', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-mistral-sample', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32768, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 02:14:44,708 INFO MainThread:11482 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 02:14:44,713 INFO MainThread:11482 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 02:14:44,714 INFO MainThread:11482 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 02:14:44,719 INFO MainThread:11482 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 02:14:44,729 INFO MainThread:11482 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 02:14:45,194 INFO MainThread:11482 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 02:14:45,273 INFO MainThread:11482 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 02:14:45,274 INFO MainThread:11482 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 02:14:45,331 INFO MainThread:11482 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 02:14:45,331 INFO MainThread:11482 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 02:14:45,332 INFO MainThread:11482 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 02:14:45,332 INFO MainThread:11482 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 02:14:45,333 INFO MainThread:11482 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 02:14:50,881 INFO MainThread:11482 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 256, 'model_type': 'mistral', 'max_position_embeddings': 1024, 'num_attention_heads': 4, 'num_hidden_layers': 4, 'model_architecture': 'MistralForCausalLM'}
29
+ 2024-08-04 02:14:50,881 INFO MainThread:11482 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 02:14:56,347 WARNING MsgRouterThr:11482 [router.py:message_loop():77] message_loop has been closed
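The config dict logged above (debug.log line 12) records global_batch_size 320, micro_batch_size 8, world_size 1 and gradient_accumulation_steps 40. A minimal Python sketch of the bookkeeping these values imply, assuming the conventional relation global = micro-batch x accumulation steps x data-parallel size; the helper name is illustrative and not taken from the repository:

# Sanity-check the batch-size bookkeeping recorded in the config above
# (global_batch_size=320, micro_batch_size=8, world_size=1,
# gradient_accumulation_steps=40). Assumes the usual relation
#   global_batch_size = micro_batch_size * gradient_accumulation_steps * world_size
def infer_grad_accum_steps(global_batch_size: int, micro_batch_size: int, world_size: int) -> int:
    per_step = micro_batch_size * world_size          # samples consumed per micro-step
    assert global_batch_size % per_step == 0, "global batch must divide evenly"
    return global_batch_size // per_step

print(infer_grad_accum_steps(320, 8, 1))  # -> 40, matching the logged value

With world_size 1, the entire 320-sample global batch is accumulated on a single GPU over 40 micro-steps before each optimizer update.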
wandb/run-20240804_021444-pk5j08lr/run-pk5j08lr.wandb ADDED
Binary file (17.1 kB). View file
 
wandb/run-20240804_144007-dds6qqbt/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '4013541'
31
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '4013541'
36
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '4013541'
41
+ - /work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 512
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: Llama2Tokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: tiny-llama_train_2024-08-04-14:39:57
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/tiny-llama
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/tiny-llama
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/meta-llama/TinyLlama_v1.1
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 2000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 2000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 8
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/tiny-llama
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 32000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 40
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722750007.607754
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: silu
318
+ hidden_size:
319
+ desc: null
320
+ value: 2048
321
+ model_type:
322
+ desc: null
323
+ value: llama
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 2048
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 32
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 22
333
+ model_architecture:
334
+ desc: null
335
+ value: LlamaForCausalLM
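The optimizer section of this config (lr 2.0e-05, min_lr 1.0e-06, lr_warmup_iters 500, lr_decay_iters 2000, lr_decay_style cosine) implies a warmup-then-cosine learning-rate curve. A minimal sketch of one common formulation of that schedule using the logged values; the exact shape implemented by the training code may differ in details:

import math

# Warmup + cosine decay using the values recorded in the config above.
# One common formulation, shown for orientation only.
def lr_at(iteration: int, lr: float = 2e-5, min_lr: float = 1e-6,
          warmup_iters: int = 500, decay_iters: int = 2000) -> float:
    if iteration < warmup_iters:
        return lr * iteration / warmup_iters              # linear warmup
    if iteration >= decay_iters:
        return min_lr                                     # floor after the decay window
    progress = (iteration - warmup_iters) / (decay_iters - warmup_iters)
    return min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * progress))

for it in (0, 250, 500, 1250, 2000):
    print(it, lr_at(it))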
wandb/run-20240804_144007-dds6qqbt/files/output.log ADDED
@@ -0,0 +1,135 @@
1
+ Created Hugging Face repository with ID koichi12/tiny-llama.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
8
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
9
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
10
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping model loading
11
+ --> Model /share/pretrained_lm/meta-llama/TinyLlama_v1.1
12
+ --> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params
13
+ You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
14
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
15
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
16
+ Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
17
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
18
+ warnings.warn(
19
+ BFloat16 enabled for mixed precision - using bfSixteen policy
20
+ --> applying fsdp activation checkpointing...
21
+ > datasets target sizes (minimum size):
22
+ train: 640000
23
+ validation: 35200
24
+ test: 3200
25
+ > building train, validation, and test datasets for GPT ...
26
+ > finished creating GPT datasets ...
27
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
28
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
29
+ No checkpoint found in /work/llm_recipes/models/tiny-llama, skipping optimizer loading
30
+ File not found: /work/llm_recipes/models/tiny-llama/latest_iteration.txt
31
+ Unable to read latest iteration from /work/llm_recipes/models/tiny-llama/latest_iteration.txt
32
+ model info: FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): LlamaForCausalLM(
34
+ (model): LlamaModel(
35
+ (embed_tokens): Embedding(32000, 2048)
36
+ (layers): ModuleList(
37
+ (0-21): 22 x FullyShardedDataParallel(
38
+ (_fsdp_wrapped_module): CheckpointWrapper(
39
+ (_checkpoint_wrapped_module): LlamaDecoderLayer(
40
+ (self_attn): LlamaFlashAttention2(
41
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
42
+ (k_proj): Linear(in_features=2048, out_features=256, bias=False)
43
+ (v_proj): Linear(in_features=2048, out_features=256, bias=False)
44
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
45
+ (rotary_emb): LlamaRotaryEmbedding()
46
+ )
47
+ (mlp): LlamaMLP(
48
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
49
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
50
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
51
+ (act_fn): SiLU()
52
+ )
53
+ (input_layernorm): LlamaRMSNorm()
54
+ (post_attention_layernorm): LlamaRMSNorm()
55
+ )
56
+ )
57
+ )
58
+ )
59
+ (norm): LlamaRMSNorm()
60
+ (rotary_emb): LlamaRotaryEmbedding()
61
+ )
62
+ (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
63
+ )
64
+ )
65
+ model config: LlamaConfig {
66
+ "_name_or_path": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
67
+ "architectures": [
68
+ "LlamaForCausalLM"
69
+ ],
70
+ "attention_bias": false,
71
+ "attention_dropout": 0.0,
72
+ "bos_token_id": 1,
73
+ "eos_token_id": 2,
74
+ "hidden_act": "silu",
75
+ "hidden_size": 2048,
76
+ "initializer_range": 0.02,
77
+ "intermediate_size": 5632,
78
+ "label_smoothing": 0.0,
79
+ "max_position_embeddings": 2048,
80
+ "mlp_bias": false,
81
+ "model_type": "llama",
82
+ "num_attention_heads": 32,
83
+ "num_hidden_layers": 22,
84
+ "num_key_value_heads": 4,
85
+ "pretraining_tp": 1,
86
+ "rms_norm_eps": 1e-05,
87
+ "rope_scaling": null,
88
+ "rope_theta": 10000.0,
89
+ "tie_word_embeddings": false,
90
+ "torch_dtype": "float32",
91
+ "transformers_version": "4.43.3",
92
+ "use_cache": false,
93
+ "vocab_size": 32000
94
+ }
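The "--> /share/pretrained_lm/meta-llama/TinyLlama_v1.1 has 1100.048384 Million params" line above can be reproduced arithmetically from the LlamaConfig fields just printed. A back-of-the-envelope sketch (rotary embeddings carry no learnable parameters and are ignored):

# Recompute the reported parameter count from the config dump above:
# hidden_size=2048, intermediate_size=5632, num_hidden_layers=22,
# num_attention_heads=32, num_key_value_heads=4, vocab_size=32000,
# tie_word_embeddings=false (so the lm_head is a separate matrix).
hidden, inter, layers, heads, kv_heads, vocab = 2048, 5632, 22, 32, 4, 32000
head_dim = hidden // heads
kv_dim = kv_heads * head_dim

attn = 2 * hidden * hidden + 2 * kv_dim * hidden   # q/o projections + k/v projections
mlp = 3 * hidden * inter                           # gate, up, down projections
norms = 2 * hidden                                 # input + post-attention RMSNorm
per_layer = attn + mlp + norms

total = layers * per_layer + 2 * vocab * hidden + hidden  # + embeddings, lm_head, final norm
print(total)  # 1100048384 -> 1100.048384 million, matching the log line

The result, 1,100,048,384, matches the reported count exactly.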
95
+ Let split = None
96
+ Building a BlendedDataset for a single MegatronDataset
97
+ Unable to save the indexes because path_to_cache is None
98
+ Building a BlendedDataset for a single MegatronDataset
99
+ Unable to save the indexes because path_to_cache is None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Traceback (most recent call last):
103
+ File "/project/examples/finetuning.py", line 13, in <module>
104
+ main()
105
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
106
+ train(
107
+ File "/project/src/llama_recipes/utils/train_utils.py", line 104, in train
108
+ batch = next(train_dataloader)
109
+ File "/project/src/llama_recipes/utils/train_utils.py", line 24, in cyclic_iter
110
+ for x in iter:
111
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
112
+ data = self._next_data()
113
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
114
+ return self._process_data(data)
115
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
116
+ data.reraise()
117
+ File "/usr/local/lib/python3.10/dist-packages/torch/_utils.py", line 705, in reraise
118
+ raise exception
119
+ RuntimeError: Caught RuntimeError in DataLoader worker process 0.
120
+ Original Traceback (most recent call last):
121
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
122
+ data = fetcher.fetch(index)
123
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
124
+ return self.collate_fn(data)
125
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
126
+ return collate(batch, collate_fn_map=default_collate_fn_map)
127
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in collate
128
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
129
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 129, in <dictcomp>
130
+ return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
131
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 121, in collate
132
+ return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
133
+ File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
134
+ return torch.stack(batch, 0, out=out)
135
+ RuntimeError: stack expects each tensor to be equal size, but got [513] at entry 0 and [543] at entry 1
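This run aborted because the default DataLoader collate tried to torch.stack examples of different lengths (513 vs 543 tokens), which happens when the examples handed to the worker are not truncated or padded to a common length. A minimal sketch of a padding collate_fn that would sidestep the stack error; this is illustrative only: the repository's actual remedy (for example, making every example exactly seq_length tokens) may differ, pad_id=0 is an assumption, and label tensors would normally be padded with the loss ignore index (-100) instead of pad_id:

import torch

# Pad every field of every example up to the longest sequence in the batch,
# so torch.stack sees equal-sized tensors. Sketch only; not the repository's code.
def pad_collate(batch, pad_id: int = 0):
    out = {}
    for key in batch[0]:
        tensors = [torch.as_tensor(sample[key]) for sample in batch]
        max_len = max(t.shape[0] for t in tensors)
        padded = [
            torch.cat([t, torch.full((max_len - t.shape[0],), pad_id, dtype=t.dtype)])
            for t in tensors
        ]
        out[key] = torch.stack(padded)
    return out

# Hypothetical usage: DataLoader(dataset, batch_size=8, collate_fn=pad_collate)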
wandb/run-20240804_144007-dds6qqbt/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T05:40:08.224323",
5
+ "startedAt": "2024-08-04T05:40:07.595226",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "512",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "8",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "2000",
19
+ "--tokenizer-type",
20
+ "Llama2Tokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model",
23
+ "--train-data-path",
24
+ "4013541",
25
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
26
+ "--valid-data-path",
27
+ "4013541",
28
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
29
+ "--test-data-path",
30
+ "4013541",
31
+ "/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "2000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
64
+ "--save",
65
+ "/work/llm_recipes/models/tiny-llama",
66
+ "--load",
67
+ "/work/llm_recipes/models/tiny-llama",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/tiny-llama",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "tiny-llama_train_2024-08-04-14:39:57"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "3be5353210a678dc7008f237fa16b99f2bdf36ea"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48781967163086
214
+ }
215
+ }
wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 2}}
wandb/run-20240804_144007-dds6qqbt/logs/debug-internal.log ADDED
@@ -0,0 +1,186 @@
1
+ 2024-08-04 14:40:07,609 INFO StreamThr :11999 [internal.py:wandb_internal():86] W&B internal server running at pid: 11999, started at: 2024-08-04 14:40:07.608480
2
+ 2024-08-04 14:40:07,610 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-04 14:40:07,612 INFO WriterThread:11999 [datastore.py:open_for_write():87] open: /project/wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb
4
+ 2024-08-04 14:40:07,613 DEBUG SenderThread:11999 [sender.py:send():382] send: header
5
+ 2024-08-04 14:40:07,627 DEBUG SenderThread:11999 [sender.py:send():382] send: run
6
+ 2024-08-04 14:40:08,110 INFO SenderThread:11999 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240804_144007-dds6qqbt/files
7
+ 2024-08-04 14:40:08,111 INFO SenderThread:11999 [sender.py:_start_run_threads():1136] run started: dds6qqbt with start time 1722750007.607754
8
+ 2024-08-04 14:40:08,116 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-04 14:40:08,116 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-04 14:40:08,204 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-04 14:40:08,210 DEBUG HandlerThread:11999 [system_info.py:__init__():27] System info init
12
+ 2024-08-04 14:40:08,210 DEBUG HandlerThread:11999 [system_info.py:__init__():42] System info init done
13
+ 2024-08-04 14:40:08,211 INFO HandlerThread:11999 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-04 14:40:08,211 INFO SystemMonitor:11999 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-04 14:40:08,211 INFO HandlerThread:11999 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-04 14:40:08,211 INFO SystemMonitor:11999 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-04 14:40:08,212 INFO SystemMonitor:11999 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-04 14:40:08,213 INFO SystemMonitor:11999 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-04 14:40:08,214 INFO SystemMonitor:11999 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-04 14:40:08,214 INFO SystemMonitor:11999 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-04 14:40:08,224 DEBUG HandlerThread:11999 [system_info.py:probe():151] Probing system
22
+ 2024-08-04 14:40:08,226 DEBUG HandlerThread:11999 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-04 14:40:08,238 DEBUG HandlerThread:11999 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-04 14:40:08,238 DEBUG HandlerThread:11999 [system_info.py:probe():199] Probing system done
25
+ 2024-08-04 14:40:08,238 DEBUG HandlerThread:11999 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-04T05:40:08.224323', 'startedAt': '2024-08-04T05:40:07.595226', 'docker': None, 'cuda': None, 'args': ('--seq-length', '512', '--sliding-window-size', '4096', '--micro-batch-size', '8', '--global-batch-size', '320', '--train-iters', '2000', '--tokenizer-type', 'Llama2Tokenizer', '--tokenizer-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', '--train-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--valid-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--test-data-path', '4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '2000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '200', '--eval-interval', '200', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', '--save', '/work/llm_recipes/models/tiny-llama', '--load', '/work/llm_recipes/models/tiny-llama', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/tiny-llama', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'tiny-llama_train_2024-08-04-14:39:57'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '3be5353210a678dc7008f237fa16b99f2bdf36ea'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 
56.48781967163086}}
26
+ 2024-08-04 14:40:08,238 INFO HandlerThread:11999 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-04 14:40:08,238 INFO HandlerThread:11999 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-04 14:40:08,239 INFO HandlerThread:11999 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-04 14:40:08,245 DEBUG SenderThread:11999 [sender.py:send():382] send: files
30
+ 2024-08-04 14:40:08,246 INFO SenderThread:11999 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-04 14:40:08,255 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-04 14:40:08,255 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: stop_status
33
+ 2024-08-04 14:40:08,255 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: internal_messages
34
+ 2024-08-04 14:40:08,255 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-04 14:40:08,257 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-04 14:40:08,521 DEBUG SenderThread:11999 [sender.py:send():382] send: telemetry
37
+ 2024-08-04 14:40:08,889 INFO wandb-upload_0:11999 [upload_job.py:push():131] Uploaded file /tmp/tmp5bbx13axwandb/8bl0rtdu-wandb-metadata.json
38
+ 2024-08-04 14:40:09,112 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/requirements.txt
39
+ 2024-08-04 14:40:09,113 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
40
+ 2024-08-04 14:40:09,113 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json
41
+ 2024-08-04 14:40:10,756 DEBUG SenderThread:11999 [sender.py:send():382] send: config
42
+ 2024-08-04 14:40:10,756 DEBUG SenderThread:11999 [sender.py:send():382] send: config
43
+ 2024-08-04 14:40:10,842 DEBUG SenderThread:11999 [sender.py:send():382] send: exit
44
+ 2024-08-04 14:40:10,842 INFO SenderThread:11999 [sender.py:send_exit():589] handling exit code: 1
45
+ 2024-08-04 14:40:10,842 INFO SenderThread:11999 [sender.py:send_exit():591] handling runtime: 2
46
+ 2024-08-04 14:40:10,843 INFO SenderThread:11999 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
47
+ 2024-08-04 14:40:10,843 INFO SenderThread:11999 [sender.py:send_exit():597] send defer
48
+ 2024-08-04 14:40:10,844 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
49
+ 2024-08-04 14:40:10,844 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 0
50
+ 2024-08-04 14:40:10,844 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
51
+ 2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 0
52
+ 2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 1
53
+ 2024-08-04 14:40:10,844 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
54
+ 2024-08-04 14:40:10,844 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 1
55
+ 2024-08-04 14:40:10,844 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
56
+ 2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 1
57
+ 2024-08-04 14:40:10,844 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 2
58
+ 2024-08-04 14:40:10,844 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
59
+ 2024-08-04 14:40:10,844 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 2
60
+ 2024-08-04 14:40:10,844 INFO HandlerThread:11999 [system_monitor.py:finish():203] Stopping system monitor
61
+ 2024-08-04 14:40:10,845 DEBUG SystemMonitor:11999 [system_monitor.py:_start():172] Starting system metrics aggregation loop
62
+ 2024-08-04 14:40:10,845 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined cpu monitor
63
+ 2024-08-04 14:40:10,845 DEBUG SystemMonitor:11999 [system_monitor.py:_start():179] Finished system metrics aggregation loop
64
+ 2024-08-04 14:40:10,845 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined disk monitor
65
+ 2024-08-04 14:40:10,845 DEBUG SystemMonitor:11999 [system_monitor.py:_start():183] Publishing last batch of metrics
66
+ 2024-08-04 14:40:10,878 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined gpu monitor
67
+ 2024-08-04 14:40:10,878 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined memory monitor
68
+ 2024-08-04 14:40:10,878 INFO HandlerThread:11999 [interfaces.py:finish():202] Joined network monitor
69
+ 2024-08-04 14:40:10,878 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
70
+ 2024-08-04 14:40:10,878 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 2
71
+ 2024-08-04 14:40:10,878 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 3
72
+ 2024-08-04 14:40:10,879 DEBUG SenderThread:11999 [sender.py:send():382] send: stats
73
+ 2024-08-04 14:40:10,879 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
74
+ 2024-08-04 14:40:10,879 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 3
75
+ 2024-08-04 14:40:10,879 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
76
+ 2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 3
77
+ 2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 4
78
+ 2024-08-04 14:40:10,879 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
79
+ 2024-08-04 14:40:10,879 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 4
80
+ 2024-08-04 14:40:10,879 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
81
+ 2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 4
82
+ 2024-08-04 14:40:10,879 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 5
83
+ 2024-08-04 14:40:10,879 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
84
+ 2024-08-04 14:40:10,880 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 5
85
+ 2024-08-04 14:40:10,880 DEBUG SenderThread:11999 [sender.py:send():382] send: summary
86
+ 2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
87
+ 2024-08-04 14:40:10,881 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
88
+ 2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 5
89
+ 2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 6
90
+ 2024-08-04 14:40:10,881 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
91
+ 2024-08-04 14:40:10,881 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 6
92
+ 2024-08-04 14:40:10,881 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
93
+ 2024-08-04 14:40:10,881 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 6
94
+ 2024-08-04 14:40:10,884 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-04 14:40:11,083 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 7
96
+ 2024-08-04 14:40:11,083 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
97
+ 2024-08-04 14:40:11,083 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 7
98
+ 2024-08-04 14:40:11,083 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
99
+ 2024-08-04 14:40:11,083 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 7
100
+ 2024-08-04 14:40:11,113 INFO Thread-12 :11999 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
101
+ 2024-08-04 14:40:11,114 INFO Thread-12 :11999 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_144007-dds6qqbt/files/config.yaml
102
+ 2024-08-04 14:40:11,114 INFO Thread-12 :11999 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json
103
+ 2024-08-04 14:40:11,842 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
104
+ 2024-08-04 14:40:12,953 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 8
105
+ 2024-08-04 14:40:12,953 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
106
+ 2024-08-04 14:40:12,953 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
107
+ 2024-08-04 14:40:12,954 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 8
108
+ 2024-08-04 14:40:12,954 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
109
+ 2024-08-04 14:40:12,954 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 8
110
+ 2024-08-04 14:40:12,954 INFO SenderThread:11999 [job_builder.py:build():296] Attempting to build job artifact
111
+ 2024-08-04 14:40:12,955 INFO SenderThread:11999 [job_builder.py:_get_source_type():426] is repo sourced job
112
+ 2024-08-04 14:40:12,969 INFO SenderThread:11999 [job_builder.py:build():402] adding wandb-job metadata file
113
+ 2024-08-04 14:40:12,987 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 9
114
+ 2024-08-04 14:40:12,987 DEBUG SenderThread:11999 [sender.py:send():382] send: artifact
115
+ 2024-08-04 14:40:12,988 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
116
+ 2024-08-04 14:40:12,989 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 9
117
+ 2024-08-04 14:40:13,115 INFO Thread-12 :11999 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
118
+ 2024-08-04 14:40:13,842 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
119
+ 2024-08-04 14:40:13,848 INFO SenderThread:11999 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTA5MTk2NTkzOA==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTA5MzUzODM4NQ==', 'versionIndex': 3}}}
120
+ 2024-08-04 14:40:13,848 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
121
+ 2024-08-04 14:40:13,848 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 9
122
+ 2024-08-04 14:40:13,848 INFO SenderThread:11999 [dir_watcher.py:finish():358] shutting down directory watcher
123
+ 2024-08-04 14:40:14,116 INFO SenderThread:11999 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240804_144007-dds6qqbt/files
124
+ 2024-08-04 14:40:14,116 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/requirements.txt requirements.txt
125
+ 2024-08-04 14:40:14,116 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/config.yaml config.yaml
126
+ 2024-08-04 14:40:14,118 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-metadata.json wandb-metadata.json
127
+ 2024-08-04 14:40:14,118 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json wandb-summary.json
128
+ 2024-08-04 14:40:14,119 INFO SenderThread:11999 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240804_144007-dds6qqbt/files/output.log output.log
129
+ 2024-08-04 14:40:14,121 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 10
130
+ 2024-08-04 14:40:14,121 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
131
+ 2024-08-04 14:40:14,121 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
132
+ 2024-08-04 14:40:14,121 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 10
133
+ 2024-08-04 14:40:14,123 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
134
+ 2024-08-04 14:40:14,123 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 10
135
+ 2024-08-04 14:40:14,123 INFO SenderThread:11999 [file_pusher.py:finish():172] shutting down file pusher
136
+ 2024-08-04 14:40:14,515 INFO wandb-upload_0:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/requirements.txt
137
+ 2024-08-04 14:40:14,617 INFO wandb-upload_1:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/config.yaml
138
+ 2024-08-04 14:40:14,698 INFO wandb-upload_2:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/wandb-summary.json
139
+ 2024-08-04 14:40:14,843 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
140
+ 2024-08-04 14:40:14,843 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
141
+ 2024-08-04 14:40:15,184 INFO wandb-upload_3:11999 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240804_144007-dds6qqbt/files/output.log
142
+ 2024-08-04 14:40:15,384 INFO Thread-11 (_thread_body):11999 [sender.py:transition_state():617] send defer: 11
143
+ 2024-08-04 14:40:15,384 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
144
+ 2024-08-04 14:40:15,385 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 11
145
+ 2024-08-04 14:40:15,385 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
146
+ 2024-08-04 14:40:15,385 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 11
147
+ 2024-08-04 14:40:15,385 INFO SenderThread:11999 [file_pusher.py:join():178] waiting for file pusher
148
+ 2024-08-04 14:40:15,385 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 12
149
+ 2024-08-04 14:40:15,385 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
150
+ 2024-08-04 14:40:15,385 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 12
151
+ 2024-08-04 14:40:15,385 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
152
+ 2024-08-04 14:40:15,386 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 12
153
+ 2024-08-04 14:40:15,386 INFO SenderThread:11999 [file_stream.py:finish():595] file stream finish called
154
+ 2024-08-04 14:40:15,573 INFO SenderThread:11999 [file_stream.py:finish():599] file stream finish is done
155
+ 2024-08-04 14:40:15,573 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 13
156
+ 2024-08-04 14:40:15,573 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
157
+ 2024-08-04 14:40:15,573 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 13
158
+ 2024-08-04 14:40:15,573 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
159
+ 2024-08-04 14:40:15,574 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 13
160
+ 2024-08-04 14:40:15,574 INFO SenderThread:11999 [sender.py:transition_state():617] send defer: 14
161
+ 2024-08-04 14:40:15,574 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: defer
162
+ 2024-08-04 14:40:15,574 DEBUG SenderThread:11999 [sender.py:send():382] send: final
163
+ 2024-08-04 14:40:15,574 INFO HandlerThread:11999 [handler.py:handle_request_defer():172] handle defer: 14
164
+ 2024-08-04 14:40:15,574 DEBUG SenderThread:11999 [sender.py:send():382] send: footer
165
+ 2024-08-04 14:40:15,574 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: defer
166
+ 2024-08-04 14:40:15,574 INFO SenderThread:11999 [sender.py:send_request_defer():613] handle sender defer: 14
167
+ 2024-08-04 14:40:15,575 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
168
+ 2024-08-04 14:40:15,575 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
169
+ 2024-08-04 14:40:15,575 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: poll_exit
170
+ 2024-08-04 14:40:15,576 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: poll_exit
171
+ 2024-08-04 14:40:15,576 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: server_info
172
+ 2024-08-04 14:40:15,576 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: get_summary
173
+ 2024-08-04 14:40:15,576 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: server_info
174
+ 2024-08-04 14:40:15,578 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: sampled_history
175
+ 2024-08-04 14:40:15,578 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: internal_messages
176
+ 2024-08-04 14:40:15,578 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: job_info
177
+ 2024-08-04 14:40:15,734 DEBUG SenderThread:11999 [sender.py:send_request():409] send_request: job_info
178
+ 2024-08-04 14:40:15,735 INFO MainThread:11999 [wandb_run.py:_footer_history_summary_info():3866] rendering history
179
+ 2024-08-04 14:40:15,735 INFO MainThread:11999 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
180
+ 2024-08-04 14:40:15,735 INFO MainThread:11999 [wandb_run.py:_footer_sync_info():3825] logging synced files
181
+ 2024-08-04 14:40:15,735 DEBUG HandlerThread:11999 [handler.py:handle_request():146] handle_request: shutdown
182
+ 2024-08-04 14:40:15,735 INFO HandlerThread:11999 [handler.py:finish():869] shutting down handler
183
+ 2024-08-04 14:40:16,578 INFO WriterThread:11999 [datastore.py:close():296] close: /project/wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb
184
+ 2024-08-04 14:40:16,735 INFO SenderThread:11999 [sender.py:finish():1572] shutting down sender
185
+ 2024-08-04 14:40:16,735 INFO SenderThread:11999 [file_pusher.py:finish():172] shutting down file pusher
186
+ 2024-08-04 14:40:16,735 INFO SenderThread:11999 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240804_144007-dds6qqbt/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2024-08-04 14:40:07,600 INFO MainThread:11928 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Configure stats pid to 11928
3
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train tiny llama sample'}
6
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_144007-dds6qqbt/logs/debug.log
9
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_144007-dds6qqbt/logs/debug-internal.log
10
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 14:40:07,601 INFO MainThread:11928 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'valid_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'test_data_path': ['4013541', '/work/llm_recipes/datasets/bin/common_crawl_and_extended_common_crawl.doc_extracted.200.sorted.uniq.filtered.shuf.head/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 512, 'num_workers': 2, 'tokenizer_type': 'Llama2Tokenizer', 'tokenizer_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1/tokenizer.model', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'tiny-llama_train_2024-08-04-14:39:57', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/tiny-llama', 'save': '/work/llm_recipes/models/tiny-llama', 'base_model': '/share/pretrained_lm/meta-llama/TinyLlama_v1.1', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 2000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 2000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 8, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/tiny-llama', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 32000, 'gradient_accumulation_steps': 40}
13
+ 2024-08-04 14:40:07,602 INFO MainThread:11928 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 14:40:07,602 INFO MainThread:11928 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 14:40:07,606 INFO MainThread:11928 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 14:40:07,607 INFO MainThread:11928 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 14:40:07,612 INFO MainThread:11928 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 14:40:07,623 INFO MainThread:11928 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 14:40:08,115 INFO MainThread:11928 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 14:40:08,197 INFO MainThread:11928 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 14:40:08,197 INFO MainThread:11928 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 14:40:08,254 INFO MainThread:11928 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 14:40:08,254 INFO MainThread:11928 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 14:40:08,254 INFO MainThread:11928 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 14:40:08,255 INFO MainThread:11928 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 14:40:08,255 INFO MainThread:11928 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 14:40:10,755 INFO MainThread:11928 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'silu', 'hidden_size': 2048, 'model_type': 'llama', 'max_position_embeddings': 2048, 'num_attention_heads': 32, 'num_hidden_layers': 22, 'model_architecture': 'LlamaForCausalLM'}
29
+ 2024-08-04 14:40:10,755 INFO MainThread:11928 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-04 14:40:16,736 WARNING MsgRouterThr:11928 [router.py:message_loop():77] message_loop has been closed
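The debug.log above records wandb.init being called with the run's full training config (entity iwakawa-koichi-q5-tohoku-nlp6723, project llm_tutorial, run name tiny-llama_train_2024-08-04-14:39:57). As a rough illustration only, not the training script itself, here is a minimal Python sketch of starting an equivalent run with the wandb API, using a small subset of the config values shown in the log; everything beyond those logged values is illustrative.

import wandb

# Values copied from the config recorded in debug.log above; the snippet
# itself is a sketch, not code taken from this repository.
config = {
    "seq_length": 512,
    "global_batch_size": 320,
    "micro_batch_size": 8,
    "gradient_accumulation_steps": 40,
    "lr": 2e-05,
    "train_iters": 2000,
    "base_model": "/share/pretrained_lm/meta-llama/TinyLlama_v1.1",
}

run = wandb.init(
    entity="iwakawa-koichi-q5-tohoku-nlp6723",
    project="llm_tutorial",
    name="tiny-llama_train_2024-08-04-14:39:57",
    config=config,
)
run.finish()

entity, project, name, and config are standard wandb.init keyword arguments; the log's "Loading settings from environment variables" line shows the API key and run notes being supplied through the environment rather than in code.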
wandb/run-20240804_144007-dds6qqbt/run-dds6qqbt.wandb ADDED
Binary file (20.5 kB).
 
wandb/run-20240804_222226-kh5katc1/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '235289369'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '235289369'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '235289369'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/google/gemma-2-2b
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-sample-gemma-2-2b_train_2024-08-04-22:22:15
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/google/gemma-2-2b
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 200
138
+ save_interval:
139
+ desc: null
140
+ value: 200
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: anyprecision
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-sample-gemma-2-2b
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 256000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1722777746.267116
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ activation_function:
316
+ desc: null
317
+ value: gelu_pytorch_tanh
318
+ hidden_size:
319
+ desc: null
320
+ value: 2304
321
+ model_type:
322
+ desc: null
323
+ value: gemma2
324
+ max_position_embeddings:
325
+ desc: null
326
+ value: 4096
327
+ num_attention_heads:
328
+ desc: null
329
+ value: 8
330
+ num_hidden_layers:
331
+ desc: null
332
+ value: 26
333
+ model_architecture:
334
+ desc: null
335
+ value: Gemma2ForCausalLM
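This config.yaml records global_batch_size 320, micro_batch_size 1, world_size 1, and gradient_accumulation_steps 320. As a quick consistency check, here is a sketch of the usual batch-size bookkeeping with the values copied from the config above; the formula is the standard relation, not code from this repository.

# Standard relation between global batch size, micro batch size,
# data-parallel world size, and gradient accumulation steps.
global_batch_size = 320
micro_batch_size = 1
world_size = 1  # data-parallel ranks

gradient_accumulation_steps = global_batch_size // (micro_batch_size * world_size)
assert gradient_accumulation_steps == 320  # matches the value recorded above

The earlier tiny-llama run (micro_batch_size 8) logs gradient_accumulation_steps 40 for the same global batch size of 320, which follows from the same relation.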
wandb/run-20240804_222226-kh5katc1/files/output.log ADDED
@@ -0,0 +1,468 @@
1
+ Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
9
+
10
+ Loading checkpoint shards: 67%|██████▋ | 2/3 [00:03<00:01, 1.62s/it]
11
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
12
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
13
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
14
+ --> Model /share/pretrained_lm/google/gemma-2-2b
15
+ --> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
16
+ BFloat16 enabled for mixed precision - using bfSixteen policy
17
+ --> applying fsdp activation checkpointing...
18
+ > datasets target sizes (minimum size):
19
+ train: 6400000
20
+ validation: 323200
21
+ test: 3200
22
+ > building train, validation, and test datasets for GPT ...
23
+ > finished creating GPT datasets ...
24
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
25
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
26
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
27
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
28
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
29
+ model info: FullyShardedDataParallel(
30
+ (_fsdp_wrapped_module): Gemma2ForCausalLM(
31
+ (model): Gemma2Model(
32
+ (embed_tokens): Embedding(256000, 2304, padding_idx=0)
33
+ (layers): ModuleList(
34
+ (0-25): 26 x FullyShardedDataParallel(
35
+ (_fsdp_wrapped_module): CheckpointWrapper(
36
+ (_checkpoint_wrapped_module): Gemma2DecoderLayer(
37
+ (self_attn): Gemma2FlashAttention2(
38
+ (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
39
+ (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
40
+ (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
41
+ (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
42
+ (rotary_emb): Gemma2RotaryEmbedding()
43
+ )
44
+ (mlp): Gemma2MLP(
45
+ (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
46
+ (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
47
+ (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
48
+ (act_fn): PytorchGELUTanh()
49
+ )
50
+ (input_layernorm): Gemma2RMSNorm()
51
+ (post_attention_layernorm): Gemma2RMSNorm()
52
+ (pre_feedforward_layernorm): Gemma2RMSNorm()
53
+ (post_feedforward_layernorm): Gemma2RMSNorm()
54
+ )
55
+ )
56
+ )
57
+ )
58
+ (norm): Gemma2RMSNorm()
59
+ )
60
+ (lm_head): Linear(in_features=2304, out_features=256000, bias=False)
61
+ )
62
+ )
63
+ model config: Gemma2Config {
64
+ "_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
65
+ "architectures": [
66
+ "Gemma2ForCausalLM"
67
+ ],
68
+ "attention_bias": false,
69
+ "attention_dropout": 0.0,
70
+ "attn_logit_softcapping": 50.0,
71
+ "bos_token_id": 2,
72
+ "cache_implementation": "hybrid",
73
+ "eos_token_id": 1,
74
+ "final_logit_softcapping": 30.0,
75
+ "head_dim": 256,
76
+ "hidden_act": "gelu_pytorch_tanh",
77
+ "hidden_activation": "gelu_pytorch_tanh",
78
+ "hidden_size": 2304,
79
+ "initializer_range": 0.02,
80
+ "intermediate_size": 9216,
81
+ "label_smoothing": 0.0,
82
+ "max_position_embeddings": 4096,
83
+ "model_type": "gemma2",
84
+ "num_attention_heads": 8,
85
+ "num_hidden_layers": 26,
86
+ "num_key_value_heads": 4,
87
+ "pad_token_id": 0,
88
+ "query_pre_attn_scalar": 256,
89
+ "rms_norm_eps": 1e-06,
90
+ "rope_theta": 10000.0,
91
+ "sliding_window": 4096,
92
+ "torch_dtype": "float32",
93
+ "transformers_version": "4.43.3",
94
+ "use_cache": false,
95
+ "vocab_size": 256000
96
+ Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00, 1.16s/it]
97
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
98
+ warnings.warn(
99
+ Let split = None
100
+ Building a BlendedDataset for a single MegatronDataset
101
+ Unable to save the indexes because path_to_cache is None
102
+ Building a BlendedDataset for a single MegatronDataset
103
+ Unable to save the indexes because path_to_cache is None
104
+ Building a BlendedDataset for a single MegatronDataset
105
+ Unable to save the indexes because path_to_cache is None
106
+ It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
107
+ ------------------------------------------------------------------
108
+ iteration: 1 , TFLOPS: 86.75197547568487, Tokens per sec: 5563.411303067021, Loss: 4.171908378601074
109
+ ------------------------------------------------------------------
110
+ ------------------------------------------------------------------
111
+ iteration: 2 , TFLOPS: 66.89933870537911, Tokens per sec: 4290.26007857923, Loss: 4.01677942276001
112
+ ------------------------------------------------------------------
113
+ ------------------------------------------------------------------
114
+ iteration: 3 , TFLOPS: 67.16726893555325, Tokens per sec: 4307.442466217215, Loss: 3.9401252269744873
115
+ ------------------------------------------------------------------
116
+ ------------------------------------------------------------------
117
+ iteration: 4 , TFLOPS: 67.25290490347041, Tokens per sec: 4312.934307864013, Loss: 3.754024028778076
118
+ ------------------------------------------------------------------
119
+ ------------------------------------------------------------------
120
+ iteration: 5 , TFLOPS: 67.291445985822, Tokens per sec: 4315.405950636545, Loss: 3.8183631896972656
121
+ ------------------------------------------------------------------
122
+ ------------------------------------------------------------------
123
+ iteration: 6 , TFLOPS: 67.19993814817916, Tokens per sec: 4309.537545502599, Loss: 3.913503408432007
124
+ ------------------------------------------------------------------
125
+ ------------------------------------------------------------------
126
+ iteration: 7 , TFLOPS: 67.30122810400093, Tokens per sec: 4316.033278677735, Loss: 3.851064682006836
127
+ ------------------------------------------------------------------
128
+ ------------------------------------------------------------------
129
+ iteration: 8 , TFLOPS: 67.16795653479824, Tokens per sec: 4307.486562013197, Loss: 3.6646127700805664
130
+ ------------------------------------------------------------------
131
+ ------------------------------------------------------------------
132
+ iteration: 9 , TFLOPS: 67.23016958415664, Tokens per sec: 4311.4762899715615, Loss: 3.7966654300689697
133
+ ------------------------------------------------------------------
134
+ ------------------------------------------------------------------
135
+ iteration: 10 , TFLOPS: 67.23271391538408, Tokens per sec: 4311.639458141876, Loss: 3.5526936054229736
136
+ ------------------------------------------------------------------
137
+ ------------------------------------------------------------------
138
+ iteration: 11 , TFLOPS: 67.17798338980677, Tokens per sec: 4308.129585047344, Loss: 3.6002132892608643
139
+ ------------------------------------------------------------------
140
+ ------------------------------------------------------------------
141
+ iteration: 12 , TFLOPS: 67.30360350705875, Tokens per sec: 4316.185613470676, Loss: 3.5705204010009766
142
+ ------------------------------------------------------------------
143
+ ------------------------------------------------------------------
144
+ iteration: 13 , TFLOPS: 67.13811947997524, Tokens per sec: 4305.573109240019, Loss: 3.5447990894317627
145
+ ------------------------------------------------------------------
146
+ ------------------------------------------------------------------
147
+ iteration: 14 , TFLOPS: 67.15854019228757, Tokens per sec: 4306.882691195614, Loss: 3.450416088104248
148
+ ------------------------------------------------------------------
149
+ ------------------------------------------------------------------
150
+ iteration: 15 , TFLOPS: 67.19845754951105, Tokens per sec: 4309.442594588568, Loss: 3.443570613861084
151
+ ------------------------------------------------------------------
152
+ ------------------------------------------------------------------
153
+ iteration: 16 , TFLOPS: 67.23455541812397, Tokens per sec: 4311.757553863634, Loss: 3.3366641998291016
154
+ ------------------------------------------------------------------
155
+ ------------------------------------------------------------------
156
+ iteration: 17 , TFLOPS: 67.30688001895524, Tokens per sec: 4316.395736447352, Loss: 3.332282066345215
157
+ ------------------------------------------------------------------
158
+ ------------------------------------------------------------------
159
+ iteration: 18 , TFLOPS: 67.36120902746241, Tokens per sec: 4319.879860219242, Loss: 3.34403395652771
160
+ ------------------------------------------------------------------
161
+ ------------------------------------------------------------------
162
+ iteration: 19 , TFLOPS: 67.26840440584516, Tokens per sec: 4313.928292222649, Loss: 3.256293773651123
163
+ ------------------------------------------------------------------
164
+ ------------------------------------------------------------------
165
+ iteration: 20 , TFLOPS: 67.17348341042366, Tokens per sec: 4307.8410010003945, Loss: 3.3122451305389404
166
+ ------------------------------------------------------------------
167
+ ------------------------------------------------------------------
168
+ iteration: 21 , TFLOPS: 67.2001168793811, Tokens per sec: 4309.54900754924, Loss: 3.2204227447509766
169
+ ------------------------------------------------------------------
170
+ ------------------------------------------------------------------
171
+ iteration: 22 , TFLOPS: 67.23699865533753, Tokens per sec: 4311.914238866545, Loss: 3.2488620281219482
172
+ ------------------------------------------------------------------
173
+ ------------------------------------------------------------------
174
+ iteration: 23 , TFLOPS: 67.2425865851171, Tokens per sec: 4312.272593261658, Loss: 3.163287401199341
175
+ ------------------------------------------------------------------
176
+ ------------------------------------------------------------------
177
+ iteration: 24 , TFLOPS: 67.21941753377986, Tokens per sec: 4310.786760098965, Loss: 3.2160401344299316
178
+ ------------------------------------------------------------------
179
+ ------------------------------------------------------------------
180
+ iteration: 25 , TFLOPS: 67.09871135713247, Tokens per sec: 4303.04586308967, Loss: 3.0935139656066895
181
+ ------------------------------------------------------------------
182
+ ------------------------------------------------------------------
183
+ iteration: 26 , TFLOPS: 67.20080576079224, Tokens per sec: 4309.593185570642, Loss: 3.047175168991089
184
+ ------------------------------------------------------------------
185
+ ------------------------------------------------------------------
186
+ iteration: 27 , TFLOPS: 67.27441115034365, Tokens per sec: 4314.313505240039, Loss: 3.0304696559906006
187
+ ------------------------------------------------------------------
188
+ ------------------------------------------------------------------
189
+ iteration: 28 , TFLOPS: 67.26365793583362, Tokens per sec: 4313.623900711482, Loss: 3.0319135189056396
190
+ ------------------------------------------------------------------
191
+ ------------------------------------------------------------------
192
+ iteration: 29 , TFLOPS: 67.16464708688589, Tokens per sec: 4307.27432684712, Loss: 2.959254264831543
193
+ ------------------------------------------------------------------
194
+ ------------------------------------------------------------------
195
+ iteration: 30 , TFLOPS: 67.3000542568793, Tokens per sec: 4315.957999765541, Loss: 2.913499116897583
196
+ ------------------------------------------------------------------
197
+ ------------------------------------------------------------------
198
+ iteration: 31 , TFLOPS: 67.18211917043104, Tokens per sec: 4308.3948129980145, Loss: 2.940014362335205
199
+ ------------------------------------------------------------------
200
+ ------------------------------------------------------------------
201
+ iteration: 32 , TFLOPS: 67.25841762372463, Tokens per sec: 4313.287839066096, Loss: 2.8469998836517334
202
+ ------------------------------------------------------------------
203
+ ------------------------------------------------------------------
204
+ iteration: 33 , TFLOPS: 67.33731321073192, Tokens per sec: 4318.347419532266, Loss: 2.829812526702881
205
+ ------------------------------------------------------------------
206
+ ------------------------------------------------------------------
207
+ iteration: 34 , TFLOPS: 67.24161982046462, Tokens per sec: 4312.210594565195, Loss: 2.8521993160247803
208
+ ------------------------------------------------------------------
209
+ ------------------------------------------------------------------
210
+ iteration: 35 , TFLOPS: 67.24248740627992, Tokens per sec: 4312.266232914695, Loss: 2.8338708877563477
211
+ ------------------------------------------------------------------
212
+ ------------------------------------------------------------------
213
+ iteration: 36 , TFLOPS: 67.24777489174788, Tokens per sec: 4312.60531979146, Loss: 2.787545680999756
214
+ ------------------------------------------------------------------
215
+ ------------------------------------------------------------------
216
+ iteration: 37 , TFLOPS: 67.30205154448893, Tokens per sec: 4316.086085983773, Loss: 2.81471848487854
217
+ ------------------------------------------------------------------
218
+ ------------------------------------------------------------------
219
+ iteration: 38 , TFLOPS: 67.13737290861587, Tokens per sec: 4305.525231557506, Loss: 2.7764387130737305
220
+ ------------------------------------------------------------------
221
+ ------------------------------------------------------------------
222
+ iteration: 39 , TFLOPS: 67.22735358248879, Tokens per sec: 4311.295699553621, Loss: 2.7642412185668945
223
+ ------------------------------------------------------------------
224
+ ------------------------------------------------------------------
225
+ iteration: 40 , TFLOPS: 67.26715109677696, Tokens per sec: 4313.847917409303, Loss: 2.7132599353790283
226
+ ------------------------------------------------------------------
227
+ ------------------------------------------------------------------
228
+ iteration: 41 , TFLOPS: 67.23918606123682, Tokens per sec: 4312.054517386288, Loss: 2.668989896774292
229
+ ------------------------------------------------------------------
230
+ ------------------------------------------------------------------
231
+ iteration: 42 , TFLOPS: 67.13128246048267, Tokens per sec: 4305.134650619155, Loss: 2.6973328590393066
232
+ ------------------------------------------------------------------
233
+ ------------------------------------------------------------------
234
+ iteration: 43 , TFLOPS: 67.23091373690416, Tokens per sec: 4311.524012548299, Loss: 2.685912609100342
235
+ ------------------------------------------------------------------
236
+ ------------------------------------------------------------------
237
+ iteration: 44 , TFLOPS: 67.27693115124784, Tokens per sec: 4314.475113104727, Loss: 2.662001371383667
238
+ ------------------------------------------------------------------
239
+ ------------------------------------------------------------------
240
+ iteration: 45 , TFLOPS: 67.27965002709941, Tokens per sec: 4314.649474836105, Loss: 2.6665873527526855
241
+ ------------------------------------------------------------------
242
+ ------------------------------------------------------------------
243
+ iteration: 46 , TFLOPS: 67.15514015419501, Tokens per sec: 4306.664646473851, Loss: 2.6501307487487793
244
+ ------------------------------------------------------------------
245
+ ------------------------------------------------------------------
246
+ iteration: 47 , TFLOPS: 67.2760527329066, Tokens per sec: 4314.418780064453, Loss: 2.6316823959350586
247
+ ------------------------------------------------------------------
248
+ ------------------------------------------------------------------
249
+ iteration: 48 , TFLOPS: 67.25548187637087, Tokens per sec: 4313.099569347494, Loss: 2.6278648376464844
250
+ ------------------------------------------------------------------
251
+ ------------------------------------------------------------------
252
+ iteration: 49 , TFLOPS: 67.35263957774154, Tokens per sec: 4319.330300705736, Loss: 2.6157166957855225
253
+ ------------------------------------------------------------------
254
+ ------------------------------------------------------------------
255
+ iteration: 50 , TFLOPS: 67.32408825677271, Tokens per sec: 4317.499302150089, Loss: 2.5965774059295654
256
+ ------------------------------------------------------------------
257
+ ------------------------------------------------------------------
258
+ iteration: 51 , TFLOPS: 67.1953666892378, Tokens per sec: 4309.244377465717, Loss: 2.578054904937744
259
+ ------------------------------------------------------------------
260
+ ------------------------------------------------------------------
261
+ iteration: 52 , TFLOPS: 67.25156682148656, Tokens per sec: 4312.848496556634, Loss: 2.5468966960906982
262
+ ------------------------------------------------------------------
263
+ ------------------------------------------------------------------
264
+ iteration: 53 , TFLOPS: 67.32404734871982, Tokens per sec: 4317.496678713301, Loss: 2.53428316116333
265
+ ------------------------------------------------------------------
266
+ ------------------------------------------------------------------
267
+ iteration: 54 , TFLOPS: 67.15867426285547, Tokens per sec: 4306.89128915213, Loss: 2.545722246170044
268
+ ------------------------------------------------------------------
269
+ ------------------------------------------------------------------
270
+ iteration: 55 , TFLOPS: 67.27601676163123, Tokens per sec: 4314.416473223611, Loss: 2.5279200077056885
271
+ ------------------------------------------------------------------
272
+ ------------------------------------------------------------------
273
+ iteration: 56 , TFLOPS: 67.19740155918589, Tokens per sec: 4309.374873842397, Loss: 2.534917116165161
274
+ ------------------------------------------------------------------
275
+ ------------------------------------------------------------------
276
+ iteration: 57 , TFLOPS: 67.2461120484207, Tokens per sec: 4312.498681512492, Loss: 2.5658233165740967
277
+ ------------------------------------------------------------------
278
+ ------------------------------------------------------------------
279
+ iteration: 58 , TFLOPS: 67.2920938769174, Tokens per sec: 4315.447499945635, Loss: 2.5472288131713867
280
+ ------------------------------------------------------------------
281
+ ------------------------------------------------------------------
282
+ iteration: 59 , TFLOPS: 67.27804058384706, Tokens per sec: 4314.546261108317, Loss: 2.4994900226593018
283
+ ------------------------------------------------------------------
284
+ ------------------------------------------------------------------
285
+ iteration: 60 , TFLOPS: 67.28150855801171, Tokens per sec: 4314.768662575956, Loss: 2.502976417541504
286
+ ------------------------------------------------------------------
287
+ ------------------------------------------------------------------
288
+ iteration: 61 , TFLOPS: 67.3506410671317, Tokens per sec: 4319.2021360563995, Loss: 2.5281176567077637
289
+ ------------------------------------------------------------------
290
+ ------------------------------------------------------------------
291
+ iteration: 62 , TFLOPS: 67.23894764547772, Tokens per sec: 4312.039227764101, Loss: 2.514285087585449
292
+ ------------------------------------------------------------------
293
+ ------------------------------------------------------------------
294
+ iteration: 63 , TFLOPS: 67.26110814707724, Tokens per sec: 4313.460382549388, Loss: 2.482907772064209
295
+ ------------------------------------------------------------------
296
+ ------------------------------------------------------------------
297
+ iteration: 64 , TFLOPS: 67.16648997644158, Tokens per sec: 4307.39251150549, Loss: 2.4810938835144043
298
+ ------------------------------------------------------------------
299
+ ------------------------------------------------------------------
300
+ iteration: 65 , TFLOPS: 67.13380749324574, Tokens per sec: 4305.2965811773665, Loss: 2.4889049530029297
301
+ ------------------------------------------------------------------
302
+ ------------------------------------------------------------------
303
+ iteration: 66 , TFLOPS: 67.29568135916668, Tokens per sec: 4315.677565476544, Loss: 2.4739832878112793
304
+ ------------------------------------------------------------------
305
+ ------------------------------------------------------------------
306
+ iteration: 67 , TFLOPS: 67.2353824902874, Tokens per sec: 4311.810594069316, Loss: 2.4979248046875
307
+ ------------------------------------------------------------------
308
+ ------------------------------------------------------------------
309
+ iteration: 68 , TFLOPS: 67.16737608801321, Tokens per sec: 4307.449337913261, Loss: 2.4705636501312256
310
+ ------------------------------------------------------------------
311
+ ------------------------------------------------------------------
312
+ iteration: 69 , TFLOPS: 67.17368447741053, Tokens per sec: 4307.853895442756, Loss: 2.431494951248169
313
+ ------------------------------------------------------------------
314
+ ------------------------------------------------------------------
315
+ iteration: 70 , TFLOPS: 67.27513003078525, Tokens per sec: 4314.3596071017255, Loss: 2.4638864994049072
316
+ ------------------------------------------------------------------
317
+ ------------------------------------------------------------------
318
+ iteration: 71 , TFLOPS: 67.13314091760232, Tokens per sec: 4305.253833626679, Loss: 2.4194881916046143
319
+ ------------------------------------------------------------------
320
+ ------------------------------------------------------------------
321
+ iteration: 72 , TFLOPS: 67.35945536468331, Tokens per sec: 4319.767397681375, Loss: 2.4741766452789307
322
+ ------------------------------------------------------------------
323
+ ------------------------------------------------------------------
324
+ iteration: 73 , TFLOPS: 67.22132247798172, Tokens per sec: 4310.908924326882, Loss: 2.438474416732788
325
+ ------------------------------------------------------------------
326
+ ------------------------------------------------------------------
327
+ iteration: 74 , TFLOPS: 67.20619442505729, Tokens per sec: 4309.9387610519625, Loss: 2.466714859008789
328
+ ------------------------------------------------------------------
329
+ ------------------------------------------------------------------
330
+ iteration: 75 , TFLOPS: 67.2254479385552, Tokens per sec: 4311.17349045185, Loss: 2.4174747467041016
331
+ ------------------------------------------------------------------
332
+ ------------------------------------------------------------------
333
+ iteration: 76 , TFLOPS: 67.24521841222351, Tokens per sec: 4312.441372549867, Loss: 2.424267053604126
334
+ ------------------------------------------------------------------
335
+ ------------------------------------------------------------------
336
+ iteration: 77 , TFLOPS: 67.22922395995721, Tokens per sec: 4311.415647014088, Loss: 2.404212474822998
337
+ ------------------------------------------------------------------
338
+ ------------------------------------------------------------------
339
+ iteration: 78 , TFLOPS: 67.23452652330809, Tokens per sec: 4311.755700836721, Loss: 2.450658082962036
340
+ ------------------------------------------------------------------
341
+ ------------------------------------------------------------------
342
+ iteration: 79 , TFLOPS: 67.0846114872016, Tokens per sec: 4302.141637274464, Loss: 2.4231417179107666
343
+ ------------------------------------------------------------------
344
+ ------------------------------------------------------------------
345
+ iteration: 80 , TFLOPS: 67.17704276320255, Tokens per sec: 4308.069262586061, Loss: 2.413994312286377
346
+ ------------------------------------------------------------------
347
+ ------------------------------------------------------------------
348
+ iteration: 81 , TFLOPS: 67.2345689529718, Tokens per sec: 4311.758421854535, Loss: 2.4133667945861816
349
+ ------------------------------------------------------------------
350
+ ------------------------------------------------------------------
351
+ iteration: 82 , TFLOPS: 67.18505033340458, Tokens per sec: 4308.582788719936, Loss: 2.389362335205078
352
+ ------------------------------------------------------------------
353
+ ------------------------------------------------------------------
354
+ iteration: 83 , TFLOPS: 67.28162310992364, Tokens per sec: 4314.776008799464, Loss: 2.4374401569366455
355
+ ------------------------------------------------------------------
356
+ ------------------------------------------------------------------
357
+ iteration: 84 , TFLOPS: 67.2334157092426, Tokens per sec: 4311.684464239587, Loss: 2.3909661769866943
358
+ ------------------------------------------------------------------
359
+ ------------------------------------------------------------------
360
+ iteration: 85 , TFLOPS: 67.31368056601009, Tokens per sec: 4316.831856087792, Loss: 2.411787748336792
361
+ ------------------------------------------------------------------
362
+ ------------------------------------------------------------------
363
+ iteration: 86 , TFLOPS: 67.11865914241415, Tokens per sec: 4304.325116195997, Loss: 2.4398515224456787
364
+ ------------------------------------------------------------------
365
+ ------------------------------------------------------------------
366
+ iteration: 87 , TFLOPS: 67.24083693352927, Tokens per sec: 4312.160387961816, Loss: 2.3902275562286377
367
+ ------------------------------------------------------------------
368
+ ------------------------------------------------------------------
369
+ iteration: 88 , TFLOPS: 67.3222851144248, Tokens per sec: 4317.383666483415, Loss: 2.3877973556518555
370
+ ------------------------------------------------------------------
371
+ ------------------------------------------------------------------
372
+ iteration: 89 , TFLOPS: 67.14511488288893, Tokens per sec: 4306.021725002672, Loss: 2.376176357269287
373
+ ------------------------------------------------------------------
374
+ ------------------------------------------------------------------
375
+ iteration: 90 , TFLOPS: 67.29125521000229, Tokens per sec: 4315.3937161675785, Loss: 2.3973848819732666
376
+ ------------------------------------------------------------------
377
+ ------------------------------------------------------------------
378
+ iteration: 91 , TFLOPS: 67.1356528047859, Tokens per sec: 4305.414921157799, Loss: 2.388991355895996
379
+ ------------------------------------------------------------------
380
+ ------------------------------------------------------------------
381
+ iteration: 92 , TFLOPS: 67.25754211457983, Tokens per sec: 4313.231692592827, Loss: 2.383312463760376
382
+ ------------------------------------------------------------------
383
+ ------------------------------------------------------------------
384
+ iteration: 93 , TFLOPS: 67.15498729921683, Tokens per sec: 4306.654843871562, Loss: 2.3923604488372803
385
+ ------------------------------------------------------------------
386
+ ------------------------------------------------------------------
387
+ iteration: 94 , TFLOPS: 67.32478814446085, Tokens per sec: 4317.544186004938, Loss: 2.3716728687286377
388
+ ------------------------------------------------------------------
389
+ ------------------------------------------------------------------
390
+ iteration: 95 , TFLOPS: 67.3161465459375, Tokens per sec: 4316.989999582809, Loss: 2.405150890350342
391
+ ------------------------------------------------------------------
392
+ ------------------------------------------------------------------
393
+ iteration: 96 , TFLOPS: 67.20162737067454, Tokens per sec: 4309.645875479786, Loss: 2.365361213684082
394
+ ------------------------------------------------------------------
395
+ ------------------------------------------------------------------
396
+ iteration: 97 , TFLOPS: 67.17173577081181, Tokens per sec: 4307.728924728738, Loss: 2.3839645385742188
397
+ ------------------------------------------------------------------
398
+ ------------------------------------------------------------------
399
+ iteration: 98 , TFLOPS: 67.20004987934048, Tokens per sec: 4309.544710831139, Loss: 2.3723373413085938
400
+ ------------------------------------------------------------------
401
+ ------------------------------------------------------------------
402
+ iteration: 99 , TFLOPS: 67.30991336059388, Tokens per sec: 4316.590264895447, Loss: 2.3913819789886475
403
+ ------------------------------------------------------------------
404
+ ------------------------------------------------------------------
405
+ iteration: 100 , TFLOPS: 67.23987549288418, Tokens per sec: 4312.098730694383, Loss: 2.3768458366394043
406
+ ------------------------------------------------------------------
407
+ ------------------------------------------------------------------
408
+ iteration: 101 , TFLOPS: 67.33907694033823, Tokens per sec: 4318.460527656589, Loss: 2.3836305141448975
409
+ ------------------------------------------------------------------
410
+ ------------------------------------------------------------------
411
+ iteration: 102 , TFLOPS: 67.30975607840512, Tokens per sec: 4316.580178375781, Loss: 2.3950178623199463
412
+ ------------------------------------------------------------------
413
+ ------------------------------------------------------------------
414
+ iteration: 103 , TFLOPS: 67.1982354002556, Tokens per sec: 4309.428348138593, Loss: 2.361278772354126
415
+ ------------------------------------------------------------------
416
+ ------------------------------------------------------------------
417
+ iteration: 104 , TFLOPS: 67.20376894334782, Tokens per sec: 4309.783214710986, Loss: 2.3559556007385254
418
+ ------------------------------------------------------------------
419
+ ------------------------------------------------------------------
420
+ iteration: 105 , TFLOPS: 67.23013357946196, Tokens per sec: 4311.47398098754, Loss: 2.349632740020752
421
+ ------------------------------------------------------------------
422
+ ------------------------------------------------------------------
423
+ iteration: 106 , TFLOPS: 67.23129147862021, Tokens per sec: 4311.548237155534, Loss: 2.379448652267456
424
+ ------------------------------------------------------------------
425
+ ------------------------------------------------------------------
426
+ iteration: 107 , TFLOPS: 67.16429762559119, Tokens per sec: 4307.251915865627, Loss: 2.4072415828704834
427
+ ------------------------------------------------------------------
428
+ ------------------------------------------------------------------
429
+ iteration: 108 , TFLOPS: 67.26025670890765, Tokens per sec: 4313.405779749734, Loss: 2.3945987224578857
430
+ ------------------------------------------------------------------
431
+ ------------------------------------------------------------------
432
+ iteration: 109 , TFLOPS: 67.13558642664209, Tokens per sec: 4305.410664321992, Loss: 2.3535115718841553
433
+ ------------------------------------------------------------------
434
+ ------------------------------------------------------------------
435
+ iteration: 110 , TFLOPS: 67.27982379366702, Tokens per sec: 4314.660618500338, Loss: 2.3627665042877197
436
+ ------------------------------------------------------------------
437
+ ------------------------------------------------------------------
438
+ iteration: 111 , TFLOPS: 67.26391532288811, Tokens per sec: 4313.640406964398, Loss: 2.3859591484069824
439
+ ------------------------------------------------------------------
440
+ ------------------------------------------------------------------
441
+ iteration: 112 , TFLOPS: 67.27053505647855, Tokens per sec: 4314.064931022535, Loss: 2.3465442657470703
442
+ ------------------------------------------------------------------
443
+ ------------------------------------------------------------------
444
+ iteration: 113 , TFLOPS: 67.22654753561278, Tokens per sec: 4311.244007701346, Loss: 2.396284818649292
445
+ ------------------------------------------------------------------
446
+ ------------------------------------------------------------------
447
+ iteration: 114 , TFLOPS: 67.12289176484347, Tokens per sec: 4304.596554619569, Loss: 2.3716585636138916
448
+ ------------------------------------------------------------------
449
+ ------------------------------------------------------------------
450
+ iteration: 115 , TFLOPS: 67.13262769476694, Tokens per sec: 4305.220920604149, Loss: 2.3369154930114746
451
+ ------------------------------------------------------------------
452
+ ------------------------------------------------------------------
453
+ iteration: 116 , TFLOPS: 67.17146478693049, Tokens per sec: 4307.711546510201, Loss: 2.302396535873413
454
+ ------------------------------------------------------------------
455
+ Traceback (most recent call last):
456
+ File "/project/examples/finetuning.py", line 13, in <module>
457
+ main()
458
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
459
+ train(
460
+ File "/project/src/llama_recipes/utils/train_utils.py", line 118, in train
461
+ loss.backward()
462
+ File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 522, in backward
463
+ torch.autograd.backward(
464
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 267, in backward
465
+ _engine_run_backward(
466
+ File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 681, in _engine_run_backward
467
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
468
+ KeyboardInterrupt
wandb/run-20240804_222226-kh5katc1/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240804_222226-kh5katc1/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-04T13:22:26.872566",
5
+ "startedAt": "2024-08-04T13:22:26.250232",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/google/gemma-2-2b",
23
+ "--train-data-path",
24
+ "235289369",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "235289369",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "235289369",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "anyprecision",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "200",
56
+ "--eval-interval",
57
+ "200",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/google/gemma-2-2b",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-sample-gemma-2-2b",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-sample-gemma-2-2b_train_2024-08-04-22:22:15"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "0336bd6c20fe25d78eda1d14afa66c1ae2e6d687"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.044999999999,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.045,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.045,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.045,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.045,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.045,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.045,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.045,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.045,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.045,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.045,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.045,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.045,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.045,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.045,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.045,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.045,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.045,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.045,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240804_222226-kh5katc1/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"training/loss": 2.302396535873413, "training/perplexity": 9.99811460655144, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 116, "optimizer/lr": 5.4080000000000006e-06, "optimizer/variance_l2": 0.0030219239359304895, "optimizer/variance_sqrt_l2": 0.8405880490942215, "optimizer/momentum_l2": 0.36270596473675665, "optimizer/weight_l2": 1167.8420269882395, "optimizer/variance_l1": 0.70648193359375, "optimizer/variance_sqrt_l1": 19948.0, "optimizer/momentum_l1": 5862.0, "optimizer/weight_l1": 29775872.0, "optimizer/variance_abs_max": 0.001068115234375, "optimizer/variance_sqrt_abs_max": 0.03271484375, "optimizer/momentum_abs_max": 0.0250244140625, "optimizer/weight_abs_max": 12.9375, "stats/1_iteration_time": 304.34721216700564, "stats/tokens_per_sec": 4307.711546510201, "stats/tokens_per_sec_per_gpu": 4307.711546510201, "stats/tflops": 67.17146478693049, "_timestamp": 1722812960.6351748, "_runtime": 35214.36805868149, "_step": 116, "_wandb": {"runtime": 35371}}
wandb/run-20240804_222226-kh5katc1/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240804_222226-kh5katc1/logs/debug.log ADDED
@@ -0,0 +1,30 @@
1
+ 2024-08-04 22:22:26,260 INFO MainThread:12896 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Configure stats pid to 12896
3
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
6
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240804_222226-kh5katc1/logs/debug.log
9
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240804_222226-kh5katc1/logs/debug-internal.log
10
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-04-22:22:15', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 200, 'save_interval': 200, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 320}
13
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():616] starting backend
14
+ 2024-08-04 22:22:26,261 INFO MainThread:12896 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-04 22:22:26,266 INFO MainThread:12896 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-04 22:22:26,266 INFO MainThread:12896 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-04 22:22:26,271 INFO MainThread:12896 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-04 22:22:26,282 INFO MainThread:12896 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-04 22:22:26,766 INFO MainThread:12896 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-04 22:22:26,847 INFO MainThread:12896 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-04 22:22:26,847 INFO MainThread:12896 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-04 22:22:26,902 INFO MainThread:12896 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-04 22:22:26,902 INFO MainThread:12896 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-04 22:22:26,903 INFO MainThread:12896 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-04 22:22:26,903 INFO MainThread:12896 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-04 22:22:26,903 INFO MainThread:12896 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-04 22:22:32,202 INFO MainThread:12896 [wandb_run.py:_config_callback():1343] config_cb None None {'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 4096, 'num_attention_heads': 8, 'num_hidden_layers': 26, 'model_architecture': 'Gemma2ForCausalLM'}
29
+ 2024-08-04 22:22:32,203 INFO MainThread:12896 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
30
+ 2024-08-05 08:12:06,481 WARNING MsgRouterThr:12896 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240812_063447-whqmtxyq/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '235289369'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '235289369'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '235289369'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 1021
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/google/gemma-2-2b
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-sample-gemma-2-2b_train_2024-08-12-06:34:36
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-sample-gemma-2-2b
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/google/gemma-2-2b
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 3
138
+ save_interval:
139
+ desc: null
140
+ value: 3
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: anyprecision
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-sample-gemma-2-2b
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 256000
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723412087.358797
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Gemma2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: gelu_pytorch_tanh
321
+ hidden_size:
322
+ desc: null
323
+ value: 2304
324
+ model_type:
325
+ desc: null
326
+ value: gemma2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 1021
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 8
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 26
wandb/run-20240812_063447-whqmtxyq/files/output.log ADDED
@@ -0,0 +1,144 @@
1
+ Created Hugging Face repository with ID koichi12/yans-sample-gemma-2-2b.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
5
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
6
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
7
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
8
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
9
+
10
+ Loading checkpoint shards: 33%|███▎ | 1/3 [00:01<00:03, 1.92s/it]
11
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
12
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
13
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping model loading
14
+ --> Model /share/pretrained_lm/google/gemma-2-2b
15
+ --> /share/pretrained_lm/google/gemma-2-2b has 2614.341888 Million params
16
+ Loading checkpoint shards: 100%|██████████| 3/3 [01:18<00:00, 26.21s/it]
17
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
18
+ warnings.warn(
19
+ Let split = None
20
+ --> applying fsdp activation checkpointing...
21
+ > datasets target sizes (minimum size):
22
+ train: 6400000
23
+ validation: 21334400
24
+ test: 3200
25
+ > building train, validation, and test datasets for GPT ...
26
+ > finished creating GPT datasets ...
27
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
28
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
29
+ No checkpoint found in /work/llm_recipes/models/yans-sample-gemma-2-2b, skipping optimizer loading
30
+ File not found: /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
31
+ Unable to read latest iteration from /work/llm_recipes/models/yans-sample-gemma-2-2b/latest_iteration.txt
32
+ model info: FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): Gemma2ForCausalLM(
34
+ (model): Gemma2Model(
35
+ (embed_tokens): Embedding(256000, 2304, padding_idx=0)
36
+ (layers): ModuleList(
37
+ (0-25): 26 x FullyShardedDataParallel(
38
+ (_fsdp_wrapped_module): CheckpointWrapper(
39
+ (_checkpoint_wrapped_module): Gemma2DecoderLayer(
40
+ (self_attn): Gemma2FlashAttention2(
41
+ (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
42
+ (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
43
+ (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
44
+ (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
45
+ (rotary_emb): Gemma2RotaryEmbedding()
46
+ )
47
+ (mlp): Gemma2MLP(
48
+ (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
49
+ (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
50
+ (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
51
+ (act_fn): PytorchGELUTanh()
52
+ )
53
+ (input_layernorm): Gemma2RMSNorm()
54
+ (post_attention_layernorm): Gemma2RMSNorm()
55
+ (pre_feedforward_layernorm): Gemma2RMSNorm()
56
+ (post_feedforward_layernorm): Gemma2RMSNorm()
57
+ )
58
+ )
59
+ )
60
+ )
61
+ (norm): Gemma2RMSNorm()
62
+ )
63
+ (lm_head): Linear(in_features=2304, out_features=256000, bias=False)
64
+ )
65
+ )
66
+ model config: Gemma2Config {
67
+ "_name_or_path": "/share/pretrained_lm/google/gemma-2-2b",
68
+ "architectures": [
69
+ "Gemma2ForCausalLM"
70
+ ],
71
+ "attention_bias": false,
72
+ "attention_dropout": 0.0,
73
+ "attn_logit_softcapping": 50.0,
74
+ "bos_token_id": 2,
75
+ "cache_implementation": "hybrid",
76
+ "eos_token_id": 1,
77
+ "final_logit_softcapping": 30.0,
78
+ "head_dim": 256,
79
+ "hidden_act": "gelu_pytorch_tanh",
80
+ "hidden_activation": "gelu_pytorch_tanh",
81
+ "hidden_size": 2304,
82
+ "initializer_range": 0.02,
83
+ "intermediate_size": 9216,
84
+ "label_smoothing": 0.0,
85
+ "max_position_embeddings": 1021,
86
+ "model_type": "gemma2",
87
+ Building a BlendedDataset for a single MegatronDataset
88
+ Unable to save the indexes because path_to_cache is None
89
+ Building a BlendedDataset for a single MegatronDataset
90
+ Unable to save the indexes because path_to_cache is None
91
+ Building a BlendedDataset for a single MegatronDataset
92
+ Unable to save the indexes because path_to_cache is None
93
+ It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
94
+ "num_attention_heads": 8,
95
+ "num_hidden_layers": 26,
96
+ "num_key_value_heads": 4,
97
+ "pad_token_id": 0,
98
+ "query_pre_attn_scalar": 256,
99
+ "rms_norm_eps": 1e-06,
100
+ "rope_theta": 10000.0,
101
+ "sliding_window": 4096,
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.43.3",
104
+ "use_cache": false,
105
+ "vocab_size": 256000
106
+ }
107
+ ------------------------------------------------------------------
108
+ iteration: 1 , TFLOPS: 52.56331460229552, Tokens per sec: 3927.6626762354495, Loss: 16.080825805664062
109
+ ------------------------------------------------------------------
110
+ ------------------------------------------------------------------
111
+ iteration: 2 , TFLOPS: 52.356892101499724, Tokens per sec: 3912.238269345489, Loss: 15.729490280151367
112
+ ------------------------------------------------------------------
113
+ ------------------------------------------------------------------
114
+ iteration: 3 , TFLOPS: 52.39645244456057, Tokens per sec: 3915.194317381553, Loss: 15.54540729522705
115
+ ------------------------------------------------------------------
116
+ eval ppl=4948606.5, eval loss=15.414616584777832
117
+ Saving checkpoint to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003
118
+ Saving model state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/model.pt
119
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:773: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
120
+ warnings.warn(
121
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py:716: UserWarning: When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict willbe returned.
122
+ warnings.warn(
123
+ Saved model state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/model.pt
124
+ Saving optimizer state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/optimizer.pt
125
+ [rank0]:[2024-08-12 06:40:35,335] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(<class 'float'>, {'preprocessing': 0.008401250000133587, 'preprocessing_with_comm': 0.0009138020004684222, 'state_converting': 5.079375774000255, <Type.ALL: 'all'>: 5.090390497000044})
126
+ Saved optimizer state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/optimizer.pt
127
+ Traceback (most recent call last):
128
+ File "/project/examples/finetuning.py", line 13, in <module>
129
+ main()
130
+ File "/project/src/llama_recipes/finetuning.py", line 281, in main
131
+ train(
132
+ File "/project/src/llama_recipes/utils/train_utils.py", line 175, in train
133
+ save_checkpoint(
134
+ File "/project/src/llama_recipes/utils/checkpoint.py", line 168, in save_checkpoint
135
+ tokenizer.tokenizer.save_pretrained(tokenizer_path)
136
+ File "/project/lib/transformers/src/transformers/tokenization_utils_base.py", line 2622, in save_pretrained
137
+ if os.path.isfile(save_directory):
138
+ File "/usr/lib/python3.10/genericpath.py", line 30, in isfile
139
+ st = os.stat(path)
140
+ TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType
141
+ Saving scheduler state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/scheduler.pt
142
+ Saved scheduler state dict to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/scheduler.pt
143
+ Saving RNG states to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/rng.pt
144
+ Saved RNG states to /work/llm_recipes/models/yans-sample-gemma-2-2b/iter_0000003/rng.pt
wandb/run-20240812_063447-whqmtxyq/files/requirements.txt ADDED
@@ -0,0 +1,271 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ charset-normalizer==3.3.2
23
+ click==8.1.7
24
+ cloudpathlib==0.16.0
25
+ cloudpickle==3.0.0
26
+ cmake==3.28.1
27
+ colorama==0.4.6
28
+ comm==0.2.1
29
+ confection==0.1.4
30
+ contourpy==1.2.0
31
+ cubinlinker==0.3.0+2.g405ac64
32
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
33
+ cudf==23.12.0
34
+ cugraph-dgl==23.12.0
35
+ cugraph-service-client==23.12.0
36
+ cugraph-service-server==23.12.0
37
+ cugraph==23.12.0
38
+ cuml==23.12.0
39
+ cupy-cuda12x==12.3.0
40
+ cycler==0.12.1
41
+ cymem==2.0.8
42
+ cython==3.0.8
43
+ dask-cuda==23.12.0
44
+ dask-cudf==23.12.0
45
+ dask==2023.11.0
46
+ debugpy==1.8.1
47
+ decorator==5.1.1
48
+ defusedxml==0.7.1
49
+ distributed==2023.11.0
50
+ dm-tree==0.1.8
51
+ docker-pycreds==0.4.0
52
+ einops==0.7.0
53
+ exceptiongroup==1.2.0
54
+ execnet==2.0.2
55
+ executing==2.0.1
56
+ expecttest==0.1.3
57
+ fastjsonschema==2.19.1
58
+ fastrlock==0.8.2
59
+ filelock==3.13.1
60
+ flash-attn==2.4.2
61
+ fonttools==4.48.1
62
+ frozenlist==1.4.1
63
+ fsspec==2023.12.2
64
+ gast==0.5.4
65
+ gitdb==4.0.11
66
+ gitpython==3.1.43
67
+ google-auth-oauthlib==0.4.6
68
+ google-auth==2.27.0
69
+ graphsurgeon==0.4.6
70
+ grpcio==1.60.1
71
+ huggingface-hub==0.24.5
72
+ hypothesis==5.35.1
73
+ idna==3.6
74
+ importlib-metadata==7.0.1
75
+ iniconfig==2.0.0
76
+ intel-openmp==2021.4.0
77
+ ipadic==1.0.0
78
+ ipykernel==6.29.2
79
+ ipython-genutils==0.2.0
80
+ ipython==8.21.0
81
+ jedi==0.19.1
82
+ jinja2==3.1.3
83
+ joblib==1.3.2
84
+ json5==0.9.14
85
+ jsonnet==0.19.1
86
+ jsonschema-specifications==2023.12.1
87
+ jsonschema==4.21.1
88
+ jupyter-client==8.6.0
89
+ jupyter-core==5.7.1
90
+ jupyter-tensorboard==0.2.0
91
+ jupyterlab-pygments==0.3.0
92
+ jupyterlab-server==1.2.0
93
+ jupyterlab==2.3.2
94
+ jupytext==1.16.1
95
+ kiwisolver==1.4.5
96
+ langcodes==3.3.0
97
+ lazy-loader==0.3
98
+ librosa==0.10.1
99
+ llvmlite==0.40.1
100
+ locket==1.0.0
101
+ logzero==1.7.0
102
+ lxml==5.2.2
103
+ markdown-it-py==3.0.0
104
+ markdown==3.5.2
105
+ markupsafe==2.1.4
106
+ matplotlib-inline==0.1.6
107
+ matplotlib==3.8.2
108
+ mdit-py-plugins==0.4.0
109
+ mdurl==0.1.2
110
+ mecab-python3==1.0.6
111
+ mistune==3.0.2
112
+ mkl-devel==2021.1.1
113
+ mkl-include==2021.1.1
114
+ mkl==2021.1.1
115
+ mock==5.1.0
116
+ more-itertools==9.1.0
117
+ mpmath==1.3.0
118
+ msgpack==1.0.7
119
+ multidict==6.0.4
120
+ murmurhash==1.0.10
121
+ nbclient==0.9.0
122
+ nbconvert==7.16.0
123
+ nbformat==5.9.2
124
+ nest-asyncio==1.6.0
125
+ networkx==2.6.3
126
+ ninja==1.11.1.1
127
+ nltk==3.8.1
128
+ notebook==6.4.10
129
+ numba==0.57.1+1.g1ff679645
130
+ numpy==1.24.4
131
+ nvfuser==0.1.4a0+d0bb811
132
+ nvidia-dali-cuda120==1.34.0
133
+ nvidia-pyindex==1.0.9
134
+ nvtx==0.2.5
135
+ oauthlib==3.2.2
136
+ onnx==1.15.0rc2
137
+ opencv==4.7.0
138
+ optree==0.10.0
139
+ packaging==23.2
140
+ pandas==1.5.3
141
+ pandocfilters==1.5.1
142
+ parso==0.8.3
143
+ partd==1.4.1
144
+ peft==0.11.1
145
+ pexpect==4.9.0
146
+ pillow==10.2.0
147
+ pip==24.0
148
+ platformdirs==4.2.0
149
+ pluggy==1.4.0
150
+ ply==3.11
151
+ polygraphy==0.49.4
152
+ pooch==1.8.0
153
+ portalocker==2.10.1
154
+ preshed==3.0.9
155
+ prettytable==3.9.0
156
+ prometheus-client==0.19.0
157
+ prompt-toolkit==3.0.43
158
+ protobuf==4.24.4
159
+ psutil==5.9.4
160
+ ptxcompiler==0.8.1+2.g0d406d6
161
+ ptyprocess==0.7.0
162
+ pure-eval==0.2.2
163
+ pyarrow==14.0.1.dev0+gba5374836.d20240125
164
+ pyasn1-modules==0.3.0
165
+ pyasn1==0.5.1
166
+ pybind11-global==2.11.1
167
+ pybind11==2.11.1
168
+ pycocotools==2.0+nv0.8.0
169
+ pycparser==2.21
170
+ pydantic-core==2.16.2
171
+ pydantic==2.6.1
172
+ pygments==2.17.2
173
+ pylibcugraph==23.12.0
174
+ pylibcugraphops==23.12.0
175
+ pylibraft==23.12.0
176
+ pynvml==11.4.1
177
+ pyparsing==3.1.1
178
+ pytest-flakefinder==1.1.0
179
+ pytest-rerunfailures==13.0
180
+ pytest-shard==0.1.2
181
+ pytest-xdist==3.5.0
182
+ pytest==8.0.0
183
+ python-dateutil==2.8.2
184
+ python-dotenv==1.0.0
185
+ python-hostlist==1.23.0
186
+ pytorch-quantization==2.1.2
187
+ pytz==2023.3.post1
188
+ pyyaml==6.0.1
189
+ pyzmq==25.1.2
190
+ raft-dask==23.12.0
191
+ rapids-dask-dependency==23.12.1
192
+ referencing==0.33.0
193
+ regex==2023.12.25
194
+ requests-oauthlib==1.3.1
195
+ requests==2.31.0
196
+ rich==13.7.0
197
+ rmm==23.12.0
198
+ rpds-py==0.17.1
199
+ rsa==4.9
200
+ sacrebleu==2.4.0
201
+ safetensors==0.4.3
202
+ scikit-learn==1.2.0
203
+ scipy==1.12.0
204
+ send2trash==1.8.2
205
+ sentencepiece==0.1.99
206
+ sentry-sdk==2.12.0
207
+ setproctitle==1.3.3
208
+ setuptools==68.2.2
209
+ six==1.16.0
210
+ smart-open==6.4.0
211
+ smmap==5.0.1
212
+ sortedcontainers==2.4.0
213
+ soundfile==0.12.1
214
+ soupsieve==2.5
215
+ soxr==0.3.7
216
+ spacy-legacy==3.0.12
217
+ spacy-loggers==1.0.5
218
+ spacy==3.7.2
219
+ sphinx-glpi-theme==0.6
220
+ srsly==2.4.8
221
+ stack-data==0.6.3
222
+ sympy==1.12
223
+ tabulate==0.9.0
224
+ tbb==2021.11.0
225
+ tblib==3.0.0
226
+ tensorboard-data-server==0.6.1
227
+ tensorboard-plugin-wit==1.8.1
228
+ tensorboard==2.9.0
229
+ tensorrt==8.6.3
230
+ terminado==0.18.0
231
+ termplotlib==0.3.9
232
+ thinc==8.2.3
233
+ threadpoolctl==3.2.0
234
+ thriftpy2==0.4.17
235
+ tinycss2==1.2.1
236
+ tokenizers==0.19.1
237
+ toml==0.10.2
238
+ tomli==2.0.1
239
+ toolz==0.12.1
240
+ torch-tensorrt==2.3.0a0
241
+ torch==2.3.0a0+ebedce2
242
+ torchdata==0.7.1a0
243
+ torchtext==0.17.0a0
244
+ torchvision==0.18.0a0
245
+ tornado==6.4
246
+ tqdm==4.66.1
247
+ traitlets==5.9.0
248
+ transformer-engine==1.3.0+5b90b7f
249
+ transformers==4.43.3
250
+ treelite-runtime==3.9.1
251
+ treelite==3.9.1
252
+ triton==2.2.0+e28a256
253
+ typer==0.9.0
254
+ types-dataclasses==0.6.6
255
+ typing-extensions==4.9.0
256
+ ucx-py==0.35.0
257
+ uff==0.6.9
258
+ ujson==5.8.0
259
+ urllib3==1.26.18
260
+ wandb==0.16.3
261
+ wasabi==1.1.2
262
+ wcwidth==0.2.13
263
+ weasel==0.3.4
264
+ webencodings==0.5.1
265
+ werkzeug==3.0.1
266
+ wheel==0.42.0
267
+ xdoctest==1.0.2
268
+ xgboost==1.7.6
269
+ yarl==1.9.4
270
+ zict==3.0.0
271
+ zipp==3.17.0
wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-11T21:34:47.942238",
5
+ "startedAt": "2024-08-11T21:34:47.345817",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "1021",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/google/gemma-2-2b",
23
+ "--train-data-path",
24
+ "235289369",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "235289369",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "235289369",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "anyprecision",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "3",
56
+ "--eval-interval",
57
+ "3",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/google/gemma-2-2b",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-sample-gemma-2-2b",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-sample-gemma-2-2b",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-sample-gemma-2-2b_train_2024-08-12-06:34:36"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0429999999997,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.043,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.043,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.043,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.043,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.043,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.043,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.043,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.043,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.043,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.043,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.043,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.043,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.043,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.043,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.043,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.043,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.043,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.043,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.487823486328125
214
+ }
215
+ }
wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"training/loss": 15.54540729522705, "training/perplexity": 5640071.469138662, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 1022, "utils/gradient_accumulation_steps": 320, "utils/iteration": 3, "optimizer/lr": 1.114e-06, "optimizer/variance_l2": 0.0003583679885385243, "optimizer/variance_sqrt_l2": 0.3777214531330342, "optimizer/momentum_l2": 0.26258589724268894, "optimizer/weight_l2": 1167.8420269882395, "optimizer/variance_l1": 0.14256858825683594, "optimizer/variance_sqrt_l1": 5085.8125, "optimizer/momentum_l1": 3147.65625, "optimizer/weight_l1": 29773824.0, "optimizer/variance_abs_max": 7.009506225585938e-05, "optimizer/variance_sqrt_abs_max": 0.00836181640625, "optimizer/momentum_abs_max": 0.005950927734375, "optimizer/weight_abs_max": 12.9375, "stats/1_iteration_time": 83.53097534600056, "stats/tokens_per_sec": 3915.194317381553, "stats/tokens_per_sec_per_gpu": 3915.194317381553, "stats/tflops": 52.39645244456057, "_timestamp": 1723412421.3049276, "_runtime": 333.9461305141449, "_step": 3, "_wandb": {"runtime": 356}, "evaluation/val_loss": 15.414616584777832, "evaluation/val_ppl": 4948606.5}
wandb/run-20240812_063447-whqmtxyq/logs/debug-internal.log ADDED
@@ -0,0 +1,359 @@
1
+ 2024-08-12 06:34:47,360 INFO StreamThr :13101 [internal.py:wandb_internal():86] W&B internal server running at pid: 13101, started at: 2024-08-12 06:34:47.359620
2
+ 2024-08-12 06:34:47,362 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-12 06:34:47,363 INFO WriterThread:13101 [datastore.py:open_for_write():87] open: /project/wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb
4
+ 2024-08-12 06:34:47,364 DEBUG SenderThread:13101 [sender.py:send():382] send: header
5
+ 2024-08-12 06:34:47,378 DEBUG SenderThread:13101 [sender.py:send():382] send: run
6
+ 2024-08-12 06:34:47,829 INFO SenderThread:13101 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240812_063447-whqmtxyq/files
7
+ 2024-08-12 06:34:47,829 INFO SenderThread:13101 [sender.py:_start_run_threads():1136] run started: whqmtxyq with start time 1723412087.358797
8
+ 2024-08-12 06:34:47,835 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-12 06:34:47,835 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-12 06:34:47,922 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-12 06:34:47,929 DEBUG HandlerThread:13101 [system_info.py:__init__():27] System info init
12
+ 2024-08-12 06:34:47,929 DEBUG HandlerThread:13101 [system_info.py:__init__():42] System info init done
13
+ 2024-08-12 06:34:47,929 INFO HandlerThread:13101 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-12 06:34:47,929 INFO SystemMonitor:13101 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-12 06:34:47,929 INFO HandlerThread:13101 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-12 06:34:47,930 INFO SystemMonitor:13101 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-12 06:34:47,930 INFO SystemMonitor:13101 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-12 06:34:47,930 INFO SystemMonitor:13101 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-12 06:34:47,931 INFO SystemMonitor:13101 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-12 06:34:47,932 INFO SystemMonitor:13101 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-12 06:34:47,942 DEBUG HandlerThread:13101 [system_info.py:probe():151] Probing system
22
+ 2024-08-12 06:34:47,944 DEBUG HandlerThread:13101 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-12 06:34:47,957 DEBUG HandlerThread:13101 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-12 06:34:47,957 DEBUG HandlerThread:13101 [system_info.py:probe():199] Probing system done
25
+ 2024-08-12 06:34:47,957 DEBUG HandlerThread:13101 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-11T21:34:47.942238', 'startedAt': '2024-08-11T21:34:47.345817', 'docker': None, 'cuda': None, 'args': ('--seq-length', '1021', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/google/gemma-2-2b', '--train-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--valid-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--test-data-path', '235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '3', '--eval-interval', '3', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/google/gemma-2-2b', '--save', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--load', '/work/llm_recipes/models/yans-sample-gemma-2-2b', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-sample-gemma-2-2b', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-sample-gemma-2-2b_train_2024-08-12-06:34:36'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0429999999997, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}, {'current': 2400.043, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487823486328125}}
26
+ 2024-08-12 06:34:47,957 INFO HandlerThread:13101 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-12 06:34:47,957 INFO HandlerThread:13101 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-12 06:34:47,958 INFO HandlerThread:13101 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-12 06:34:47,964 DEBUG SenderThread:13101 [sender.py:send():382] send: files
30
+ 2024-08-12 06:34:47,964 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-12 06:34:47,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-12 06:34:47,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: python_packages
33
+ 2024-08-12 06:34:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
34
+ 2024-08-12 06:34:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
35
+ 2024-08-12 06:34:47,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-12 06:34:48,281 DEBUG SenderThread:13101 [sender.py:send():382] send: telemetry
37
+ 2024-08-12 06:34:48,615 INFO wandb-upload_0:13101 [upload_job.py:push():131] Uploaded file /tmp/tmpxyme_qqmwandb/cck49p4b-wandb-metadata.json
38
+ 2024-08-12 06:34:48,831 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
39
+ 2024-08-12 06:34:48,831 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/requirements.txt
40
+ 2024-08-12 06:34:48,832 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json
41
+ 2024-08-12 06:34:50,832 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
42
+ 2024-08-12 06:34:52,543 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-12 06:34:52,833 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
44
+ 2024-08-12 06:34:57,543 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
45
+ 2024-08-12 06:35:02,544 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-12 06:35:02,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
47
+ 2024-08-12 06:35:02,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
48
+ 2024-08-12 06:35:02,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
49
+ 2024-08-12 06:35:08,234 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
50
+ 2024-08-12 06:35:13,235 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-12 06:35:17,973 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
52
+ 2024-08-12 06:35:17,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
53
+ 2024-08-12 06:35:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
54
+ 2024-08-12 06:35:18,247 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
55
+ 2024-08-12 06:35:18,849 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml
56
+ 2024-08-12 06:35:23,452 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
57
+ 2024-08-12 06:35:28,453 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
58
+ 2024-08-12 06:35:32,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
59
+ 2024-08-12 06:35:32,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
60
+ 2024-08-12 06:35:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
61
+ 2024-08-12 06:35:34,202 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
62
+ 2024-08-12 06:35:39,202 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
63
+ 2024-08-12 06:35:44,203 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-12 06:35:47,932 DEBUG SystemMonitor:13101 [system_monitor.py:_start():172] Starting system metrics aggregation loop
65
+ 2024-08-12 06:35:47,934 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
66
+ 2024-08-12 06:35:47,973 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
67
+ 2024-08-12 06:35:47,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
68
+ 2024-08-12 06:35:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
69
+ 2024-08-12 06:35:49,237 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
70
+ 2024-08-12 06:35:54,238 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
71
+ 2024-08-12 06:35:59,239 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-08-12 06:36:02,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
73
+ 2024-08-12 06:36:02,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
74
+ 2024-08-12 06:36:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
75
+ 2024-08-12 06:36:05,234 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
76
+ 2024-08-12 06:36:08,884 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
77
+ 2024-08-12 06:36:09,915 DEBUG SenderThread:13101 [sender.py:send():382] send: config
78
+ 2024-08-12 06:36:09,915 DEBUG SenderThread:13101 [sender.py:send():382] send: config
79
+ 2024-08-12 06:36:10,885 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
80
+ 2024-08-12 06:36:11,119 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
81
+ 2024-08-12 06:36:16,120 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
82
+ 2024-08-12 06:36:17,935 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
83
+ 2024-08-12 06:36:17,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
84
+ 2024-08-12 06:36:17,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
85
+ 2024-08-12 06:36:17,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
86
+ 2024-08-12 06:36:21,237 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
87
+ 2024-08-12 06:36:21,893 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml
88
+ 2024-08-12 06:36:26,451 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
89
+ 2024-08-12 06:36:31,452 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
90
+ 2024-08-12 06:36:32,974 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
91
+ 2024-08-12 06:36:32,974 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
92
+ 2024-08-12 06:36:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
93
+ 2024-08-12 06:36:37,174 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-12 06:36:42,174 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
95
+ 2024-08-12 06:36:47,175 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
96
+ 2024-08-12 06:36:47,936 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
97
+ 2024-08-12 06:36:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-12 06:36:47,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-12 06:36:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
100
+ 2024-08-12 06:36:52,199 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-12 06:36:57,199 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-12 06:37:02,200 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
103
+ 2024-08-12 06:37:02,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
104
+ 2024-08-12 06:37:02,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
105
+ 2024-08-12 06:37:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
106
+ 2024-08-12 06:37:08,177 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-08-12 06:37:13,178 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-12 06:37:17,937 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
109
+ 2024-08-12 06:37:17,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
110
+ 2024-08-12 06:37:17,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
111
+ 2024-08-12 06:37:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
112
+ 2024-08-12 06:37:18,239 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
113
+ 2024-08-12 06:37:23,240 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-12 06:37:28,240 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
115
+ 2024-08-12 06:37:32,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
116
+ 2024-08-12 06:37:32,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
117
+ 2024-08-12 06:37:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
118
+ 2024-08-12 06:37:33,471 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
119
+ 2024-08-12 06:37:33,513 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
120
+ 2024-08-12 06:37:34,938 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
121
+ 2024-08-12 06:37:38,514 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
122
+ 2024-08-12 06:37:43,515 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
123
+ 2024-08-12 06:37:47,938 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
124
+ 2024-08-12 06:37:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
125
+ 2024-08-12 06:37:47,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
126
+ 2024-08-12 06:37:47,977 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
127
+ 2024-08-12 06:37:49,236 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
128
+ 2024-08-12 06:37:54,236 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
129
+ 2024-08-12 06:37:59,237 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
130
+ 2024-08-12 06:38:02,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
131
+ 2024-08-12 06:38:02,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
132
+ 2024-08-12 06:38:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
133
+ 2024-08-12 06:38:05,173 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
134
+ 2024-08-12 06:38:10,174 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
135
+ 2024-08-12 06:38:15,175 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
136
+ 2024-08-12 06:38:17,940 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
137
+ 2024-08-12 06:38:17,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
138
+ 2024-08-12 06:38:17,975 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
139
+ 2024-08-12 06:38:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
140
+ 2024-08-12 06:38:20,189 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
141
+ 2024-08-12 06:38:25,189 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
142
+ 2024-08-12 06:38:30,190 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
143
+ 2024-08-12 06:38:32,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
144
+ 2024-08-12 06:38:32,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
145
+ 2024-08-12 06:38:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
146
+ 2024-08-12 06:38:36,181 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
147
+ 2024-08-12 06:38:41,181 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
148
+ 2024-08-12 06:38:46,182 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
149
+ 2024-08-12 06:38:47,941 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
150
+ 2024-08-12 06:38:47,975 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
151
+ 2024-08-12 06:38:47,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
152
+ 2024-08-12 06:38:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
153
+ 2024-08-12 06:38:52,158 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
154
+ 2024-08-12 06:38:57,068 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
155
+ 2024-08-12 06:38:57,070 DEBUG SenderThread:13101 [sender.py:send():382] send: history
156
+ 2024-08-12 06:38:57,071 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: summary_record
157
+ 2024-08-12 06:38:57,072 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
158
+ 2024-08-12 06:38:57,991 INFO Thread-12 :13101 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
159
+ 2024-08-12 06:38:58,109 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
160
+ 2024-08-12 06:38:58,991 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
161
+ 2024-08-12 06:39:02,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
162
+ 2024-08-12 06:39:02,977 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
163
+ 2024-08-12 06:39:02,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
164
+ 2024-08-12 06:39:03,220 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
165
+ 2024-08-12 06:39:08,220 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
166
+ 2024-08-12 06:39:13,221 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
167
+ 2024-08-12 06:39:17,942 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
168
+ 2024-08-12 06:39:17,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
169
+ 2024-08-12 06:39:17,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
170
+ 2024-08-12 06:39:18,020 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
171
+ 2024-08-12 06:39:19,166 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
172
+ 2024-08-12 06:39:24,167 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
173
+ 2024-08-12 06:39:29,167 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
174
+ 2024-08-12 06:39:32,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
175
+ 2024-08-12 06:39:32,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
176
+ 2024-08-12 06:39:33,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
177
+ 2024-08-12 06:39:34,262 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
178
+ 2024-08-12 06:39:39,263 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
179
+ 2024-08-12 06:39:44,264 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
180
+ 2024-08-12 06:39:47,943 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
181
+ 2024-08-12 06:39:47,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
182
+ 2024-08-12 06:39:47,976 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
183
+ 2024-08-12 06:39:48,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
184
+ 2024-08-12 06:39:50,213 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
185
+ 2024-08-12 06:39:55,214 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
186
+ 2024-08-12 06:40:00,215 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
187
+ 2024-08-12 06:40:02,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
188
+ 2024-08-12 06:40:02,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
189
+ 2024-08-12 06:40:03,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
190
+ 2024-08-12 06:40:05,253 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
191
+ 2024-08-12 06:40:10,254 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
192
+ 2024-08-12 06:40:15,254 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
193
+ 2024-08-12 06:40:17,944 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
194
+ 2024-08-12 06:40:17,976 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
195
+ 2024-08-12 06:40:17,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
196
+ 2024-08-12 06:40:18,016 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
197
+ 2024-08-12 06:40:20,601 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
198
+ 2024-08-12 06:40:20,603 DEBUG SenderThread:13101 [sender.py:send():382] send: history
199
+ 2024-08-12 06:40:20,604 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: summary_record
200
+ 2024-08-12 06:40:20,604 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
201
+ 2024-08-12 06:40:20,605 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
202
+ 2024-08-12 06:40:21,044 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
203
+ 2024-08-12 06:40:21,045 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
204
+ 2024-08-12 06:40:21,305 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: partial_history
205
+ 2024-08-12 06:40:23,046 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
206
+ 2024-08-12 06:40:26,337 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
207
+ 2024-08-12 06:40:31,051 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
208
+ 2024-08-12 06:40:32,226 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
209
+ 2024-08-12 06:40:32,977 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: stop_status
210
+ 2024-08-12 06:40:32,977 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: stop_status
211
+ 2024-08-12 06:40:32,979 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
212
+ 2024-08-12 06:40:37,055 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
213
+ 2024-08-12 06:40:37,381 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
214
+ 2024-08-12 06:40:42,382 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
215
+ 2024-08-12 06:40:44,855 DEBUG SenderThread:13101 [sender.py:send():382] send: exit
216
+ 2024-08-12 06:40:44,856 INFO SenderThread:13101 [sender.py:send_exit():589] handling exit code: 1
217
+ 2024-08-12 06:40:44,856 INFO SenderThread:13101 [sender.py:send_exit():591] handling runtime: 356
218
+ 2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
219
+ 2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:send_exit():597] send defer
220
+ 2024-08-12 06:40:44,857 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
221
+ 2024-08-12 06:40:44,857 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 0
222
+ 2024-08-12 06:40:44,857 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
223
+ 2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 0
224
+ 2024-08-12 06:40:44,857 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 1
225
+ 2024-08-12 06:40:44,858 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
226
+ 2024-08-12 06:40:44,858 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 1
227
+ 2024-08-12 06:40:44,858 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
228
+ 2024-08-12 06:40:44,858 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 1
229
+ 2024-08-12 06:40:44,858 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 2
230
+ 2024-08-12 06:40:44,858 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
231
+ 2024-08-12 06:40:44,858 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 2
232
+ 2024-08-12 06:40:44,858 INFO HandlerThread:13101 [system_monitor.py:finish():203] Stopping system monitor
233
+ 2024-08-12 06:40:44,858 DEBUG SystemMonitor:13101 [system_monitor.py:_start():179] Finished system metrics aggregation loop
234
+ 2024-08-12 06:40:44,858 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined cpu monitor
235
+ 2024-08-12 06:40:44,859 DEBUG SystemMonitor:13101 [system_monitor.py:_start():183] Publishing last batch of metrics
236
+ 2024-08-12 06:40:44,859 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined disk monitor
237
+ 2024-08-12 06:40:44,893 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined gpu monitor
238
+ 2024-08-12 06:40:44,893 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined memory monitor
239
+ 2024-08-12 06:40:44,893 INFO HandlerThread:13101 [interfaces.py:finish():202] Joined network monitor
240
+ 2024-08-12 06:40:44,894 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
241
+ 2024-08-12 06:40:44,894 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 2
242
+ 2024-08-12 06:40:44,894 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 3
243
+ 2024-08-12 06:40:44,894 DEBUG SenderThread:13101 [sender.py:send():382] send: stats
244
+ 2024-08-12 06:40:44,894 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
245
+ 2024-08-12 06:40:44,894 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 3
246
+ 2024-08-12 06:40:44,896 DEBUG SenderThread:13101 [sender.py:send():382] send: history
247
+ 2024-08-12 06:40:44,896 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: summary_record
248
+ 2024-08-12 06:40:44,897 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
249
+ 2024-08-12 06:40:44,898 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
250
+ 2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 3
251
+ 2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 4
252
+ 2024-08-12 06:40:44,898 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
253
+ 2024-08-12 06:40:44,898 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 4
254
+ 2024-08-12 06:40:44,898 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
255
+ 2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 4
256
+ 2024-08-12 06:40:44,898 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 5
257
+ 2024-08-12 06:40:44,898 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
258
+ 2024-08-12 06:40:44,898 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 5
259
+ 2024-08-12 06:40:44,899 DEBUG SenderThread:13101 [sender.py:send():382] send: summary
260
+ 2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
261
+ 2024-08-12 06:40:44,900 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
262
+ 2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 5
263
+ 2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 6
264
+ 2024-08-12 06:40:44,900 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
265
+ 2024-08-12 06:40:44,900 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 6
266
+ 2024-08-12 06:40:44,900 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
267
+ 2024-08-12 06:40:44,900 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 6
268
+ 2024-08-12 06:40:44,901 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 7
269
+ 2024-08-12 06:40:44,901 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: status_report
270
+ 2024-08-12 06:40:44,901 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
271
+ 2024-08-12 06:40:44,901 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 7
272
+ 2024-08-12 06:40:44,901 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
273
+ 2024-08-12 06:40:44,901 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 7
274
+ 2024-08-12 06:40:45,060 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
275
+ 2024-08-12 06:40:45,061 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
276
+ 2024-08-12 06:40:45,855 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
277
+ 2024-08-12 06:40:47,007 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 8
278
+ 2024-08-12 06:40:47,007 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
279
+ 2024-08-12 06:40:47,007 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
280
+ 2024-08-12 06:40:47,008 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 8
281
+ 2024-08-12 06:40:47,008 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
282
+ 2024-08-12 06:40:47,008 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 8
283
+ 2024-08-12 06:40:47,008 INFO SenderThread:13101 [job_builder.py:build():296] Attempting to build job artifact
284
+ 2024-08-12 06:40:47,009 INFO SenderThread:13101 [job_builder.py:_get_source_type():426] is repo sourced job
285
+ 2024-08-12 06:40:47,023 INFO SenderThread:13101 [job_builder.py:build():402] adding wandb-job metadata file
286
+ 2024-08-12 06:40:47,031 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 9
287
+ 2024-08-12 06:40:47,032 DEBUG SenderThread:13101 [sender.py:send():382] send: artifact
288
+ 2024-08-12 06:40:47,032 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
289
+ 2024-08-12 06:40:47,033 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 9
290
+ 2024-08-12 06:40:47,062 INFO Thread-12 :13101 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
291
+ 2024-08-12 06:40:47,856 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
292
+ 2024-08-12 06:40:47,912 INFO SenderThread:13101 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
293
+ 2024-08-12 06:40:47,912 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
294
+ 2024-08-12 06:40:47,912 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 9
295
+ 2024-08-12 06:40:47,913 INFO SenderThread:13101 [dir_watcher.py:finish():358] shutting down directory watcher
296
+ 2024-08-12 06:40:48,063 INFO SenderThread:13101 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240812_063447-whqmtxyq/files
297
+ 2024-08-12 06:40:48,063 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/requirements.txt requirements.txt
298
+ 2024-08-12 06:40:48,063 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml config.yaml
299
+ 2024-08-12 06:40:48,065 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-metadata.json wandb-metadata.json
300
+ 2024-08-12 06:40:48,065 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json wandb-summary.json
301
+ 2024-08-12 06:40:48,067 INFO SenderThread:13101 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240812_063447-whqmtxyq/files/output.log output.log
302
+ 2024-08-12 06:40:48,067 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 10
303
+ 2024-08-12 06:40:48,068 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
304
+ 2024-08-12 06:40:48,069 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
305
+ 2024-08-12 06:40:48,069 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 10
306
+ 2024-08-12 06:40:48,070 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
307
+ 2024-08-12 06:40:48,071 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 10
308
+ 2024-08-12 06:40:48,071 INFO SenderThread:13101 [file_pusher.py:finish():172] shutting down file pusher
309
+ 2024-08-12 06:40:48,555 INFO wandb-upload_1:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/config.yaml
310
+ 2024-08-12 06:40:48,607 INFO wandb-upload_0:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/requirements.txt
311
+ 2024-08-12 06:40:48,857 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
312
+ 2024-08-12 06:40:48,857 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
313
+ 2024-08-12 06:40:49,047 INFO wandb-upload_2:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/wandb-summary.json
314
+ 2024-08-12 06:40:49,065 INFO wandb-upload_3:13101 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240812_063447-whqmtxyq/files/output.log
315
+ 2024-08-12 06:40:49,265 INFO Thread-11 (_thread_body):13101 [sender.py:transition_state():617] send defer: 11
316
+ 2024-08-12 06:40:49,266 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
317
+ 2024-08-12 06:40:49,266 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 11
318
+ 2024-08-12 06:40:49,266 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
319
+ 2024-08-12 06:40:49,266 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 11
320
+ 2024-08-12 06:40:49,266 INFO SenderThread:13101 [file_pusher.py:join():178] waiting for file pusher
321
+ 2024-08-12 06:40:49,266 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 12
322
+ 2024-08-12 06:40:49,267 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
323
+ 2024-08-12 06:40:49,267 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 12
324
+ 2024-08-12 06:40:49,267 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
325
+ 2024-08-12 06:40:49,267 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 12
326
+ 2024-08-12 06:40:49,267 INFO SenderThread:13101 [file_stream.py:finish():595] file stream finish called
327
+ 2024-08-12 06:40:49,435 INFO SenderThread:13101 [file_stream.py:finish():599] file stream finish is done
328
+ 2024-08-12 06:40:49,435 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 13
329
+ 2024-08-12 06:40:49,436 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
330
+ 2024-08-12 06:40:49,436 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 13
331
+ 2024-08-12 06:40:49,436 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
332
+ 2024-08-12 06:40:49,436 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 13
333
+ 2024-08-12 06:40:49,436 INFO SenderThread:13101 [sender.py:transition_state():617] send defer: 14
334
+ 2024-08-12 06:40:49,436 DEBUG SenderThread:13101 [sender.py:send():382] send: final
335
+ 2024-08-12 06:40:49,436 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: defer
336
+ 2024-08-12 06:40:49,436 DEBUG SenderThread:13101 [sender.py:send():382] send: footer
337
+ 2024-08-12 06:40:49,436 INFO HandlerThread:13101 [handler.py:handle_request_defer():172] handle defer: 14
338
+ 2024-08-12 06:40:49,437 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: defer
339
+ 2024-08-12 06:40:49,437 INFO SenderThread:13101 [sender.py:send_request_defer():613] handle sender defer: 14
340
+ 2024-08-12 06:40:49,437 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
341
+ 2024-08-12 06:40:49,437 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
342
+ 2024-08-12 06:40:49,438 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: poll_exit
343
+ 2024-08-12 06:40:49,438 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: poll_exit
344
+ 2024-08-12 06:40:49,438 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: server_info
345
+ 2024-08-12 06:40:49,438 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: server_info
346
+ 2024-08-12 06:40:49,439 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: get_summary
347
+ 2024-08-12 06:40:49,440 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: sampled_history
348
+ 2024-08-12 06:40:49,442 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: internal_messages
349
+ 2024-08-12 06:40:49,442 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: job_info
350
+ 2024-08-12 06:40:49,609 DEBUG SenderThread:13101 [sender.py:send_request():409] send_request: job_info
351
+ 2024-08-12 06:40:49,610 INFO MainThread:13101 [wandb_run.py:_footer_history_summary_info():3866] rendering history
352
+ 2024-08-12 06:40:49,610 INFO MainThread:13101 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
353
+ 2024-08-12 06:40:49,611 INFO MainThread:13101 [wandb_run.py:_footer_sync_info():3825] logging synced files
354
+ 2024-08-12 06:40:49,611 DEBUG HandlerThread:13101 [handler.py:handle_request():146] handle_request: shutdown
355
+ 2024-08-12 06:40:49,611 INFO HandlerThread:13101 [handler.py:finish():869] shutting down handler
356
+ 2024-08-12 06:40:50,442 INFO WriterThread:13101 [datastore.py:close():296] close: /project/wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb
357
+ 2024-08-12 06:40:50,610 INFO SenderThread:13101 [sender.py:finish():1572] shutting down sender
358
+ 2024-08-12 06:40:50,610 INFO SenderThread:13101 [file_pusher.py:finish():172] shutting down file pusher
359
+ 2024-08-12 06:40:50,610 INFO SenderThread:13101 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240812_063447-whqmtxyq/logs/debug.log ADDED
@@ -0,0 +1,30 @@
+ 2024-08-12 06:34:47,351 INFO MainThread:13030 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Configure stats pid to 13030
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240812_063447-whqmtxyq/logs/debug.log
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240812_063447-whqmtxyq/logs/debug-internal.log
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:init():566] calling init triggers
+ 2024-08-12 06:34:47,352 INFO MainThread:13030 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'test_data_path': ['235289369', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 1021, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/google/gemma-2-2b', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-sample-gemma-2-2b_train_2024-08-12-06:34:36', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'save': '/work/llm_recipes/models/yans-sample-gemma-2-2b', 'base_model': '/share/pretrained_lm/google/gemma-2-2b', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 3, 'save_interval': 3, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-sample-gemma-2-2b', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 256000, 'gradient_accumulation_steps': 320}
+ 2024-08-12 06:34:47,353 INFO MainThread:13030 [wandb_init.py:init():616] starting backend
+ 2024-08-12 06:34:47,353 INFO MainThread:13030 [wandb_init.py:init():620] setting up manager
+ 2024-08-12 06:34:47,357 INFO MainThread:13030 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-12 06:34:47,358 INFO MainThread:13030 [wandb_init.py:init():628] backend started and connected
+ 2024-08-12 06:34:47,363 INFO MainThread:13030 [wandb_init.py:init():720] updated telemetry
+ 2024-08-12 06:34:47,374 INFO MainThread:13030 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-12 06:34:47,834 INFO MainThread:13030 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-12 06:34:47,915 INFO MainThread:13030 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-12 06:34:47,915 INFO MainThread:13030 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-12 06:34:47,973 INFO MainThread:13030 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-12 06:34:47,973 INFO MainThread:13030 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-12 06:34:47,973 INFO MainThread:13030 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-12 06:34:47,974 INFO MainThread:13030 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-12 06:34:47,975 INFO MainThread:13030 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-12 06:36:09,914 INFO MainThread:13030 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Gemma2ForCausalLM', 'activation_function': 'gelu_pytorch_tanh', 'hidden_size': 2304, 'model_type': 'gemma2', 'max_position_embeddings': 1021, 'num_attention_heads': 8, 'num_hidden_layers': 26}
+ 2024-08-12 06:36:09,915 INFO MainThread:13030 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+ 2024-08-12 06:40:50,612 WARNING MsgRouterThr:13030 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240812_063447-whqmtxyq/run-whqmtxyq.wandb ADDED
Binary file (42.3 kB). View file
wandb/run-20240815_031216-0szn78ph/files/config.yaml ADDED
@@ -0,0 +1,335 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '304771887'
31
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
32
+ valid_data_path:
33
+ desc: null
34
+ value:
35
+ - '304771887'
36
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
37
+ test_data_path:
38
+ desc: null
39
+ value:
40
+ - '304771887'
41
+ - /work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document
42
+ data_cache_path:
43
+ desc: null
44
+ value: null
45
+ vocab_size:
46
+ desc: null
47
+ value: null
48
+ vocab_file:
49
+ desc: null
50
+ value: null
51
+ merge_file:
52
+ desc: null
53
+ value: null
54
+ seq_length:
55
+ desc: null
56
+ value: 4096
57
+ num_workers:
58
+ desc: null
59
+ value: 2
60
+ tokenizer_type:
61
+ desc: null
62
+ value: HFPreTrainedTokenizer
63
+ tokenizer_model:
64
+ desc: null
65
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
66
+ reset_position_ids:
67
+ desc: null
68
+ value: false
69
+ reset_attention_mask:
70
+ desc: null
71
+ value: false
72
+ eod_mask_loss:
73
+ desc: null
74
+ value: false
75
+ retro_return_doc_ids:
76
+ desc: null
77
+ value: false
78
+ short_seq_prob:
79
+ desc: null
80
+ value: 0.1
81
+ vocab_extra_ids:
82
+ desc: null
83
+ value: 0
84
+ seed:
85
+ desc: null
86
+ value: 1234
87
+ use_mpi:
88
+ desc: null
89
+ value: false
90
+ wandb_entity:
91
+ desc: null
92
+ value: iwakawa-koichi-q5-tohoku-nlp6723
93
+ wandb_name:
94
+ desc: null
95
+ value: yans-qwen2-0.5B_train_2024-08-15-03:11:59
96
+ wandb_project:
97
+ desc: null
98
+ value: llm_tutorial
99
+ quantization:
100
+ desc: null
101
+ value: false
102
+ use_freeze_layers:
103
+ desc: null
104
+ value: false
105
+ freeze_layers:
106
+ desc: null
107
+ value: null
108
+ bf16:
109
+ desc: null
110
+ value: true
111
+ fp16:
112
+ desc: null
113
+ value: false
114
+ mixed_precision:
115
+ desc: null
116
+ value: true
117
+ param_dtype:
118
+ desc: null
119
+ value: null
120
+ load:
121
+ desc: null
122
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
123
+ save:
124
+ desc: null
125
+ value: /work/llm_recipes/models/yans-qwen2-0.5B
126
+ base_model:
127
+ desc: null
128
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
129
+ use_better_transformer:
130
+ desc: null
131
+ value: false
132
+ grad_clip_norm:
133
+ desc: null
134
+ value: 1.0
135
+ eval_interval:
136
+ desc: null
137
+ value: 10
138
+ save_interval:
139
+ desc: null
140
+ value: 10
141
+ eval_iters:
142
+ desc: null
143
+ value: 10
144
+ optimizer:
145
+ desc: null
146
+ value: adam
147
+ lr:
148
+ desc: null
149
+ value: 2.0e-05
150
+ lr_decay_style:
151
+ desc: null
152
+ value: cosine
153
+ lr_decay_iters:
154
+ desc: null
155
+ value: 20000
156
+ lr_warmup_iters:
157
+ desc: null
158
+ value: 500
159
+ min_lr:
160
+ desc: null
161
+ value: 1.0e-06
162
+ train_iters:
163
+ desc: null
164
+ value: 20000
165
+ train_samples:
166
+ desc: null
167
+ value: null
168
+ global_batch_size:
169
+ desc: null
170
+ value: 320
171
+ micro_batch_size:
172
+ desc: null
173
+ value: 1
174
+ make_vocab_size_divisible_by:
175
+ desc: null
176
+ value: 128
177
+ sliding_window_size:
178
+ desc: null
179
+ value: 4096
180
+ skip_batch:
181
+ desc: null
182
+ value: null
183
+ no_save_optimizer_state:
184
+ desc: null
185
+ value: false
186
+ continual_pretraining:
187
+ desc: null
188
+ value: false
189
+ instruction_tuning:
190
+ desc: null
191
+ value: false
192
+ direct_preference_optimization:
193
+ desc: null
194
+ value: false
195
+ attention_dropout:
196
+ desc: null
197
+ value: 0.1
198
+ hidden_dropout:
199
+ desc: null
200
+ value: 0.1
201
+ weight_decay:
202
+ desc: null
203
+ value: 0.1
204
+ adam_beta1:
205
+ desc: null
206
+ value: 0.9
207
+ adam_beta2:
208
+ desc: null
209
+ value: 0.95
210
+ adam_eps:
211
+ desc: null
212
+ value: 1.0e-06
213
+ hf_transformer_model_dir:
214
+ desc: null
215
+ value: null
216
+ instruction_train_data_path:
217
+ desc: null
218
+ value: null
219
+ instruction_valid_data_path:
220
+ desc: null
221
+ value: null
222
+ epoch:
223
+ desc: null
224
+ value: null
225
+ instruction_dataset_size:
226
+ desc: null
227
+ value: null
228
+ save_sampler_state:
229
+ desc: null
230
+ value: false
231
+ label_smoothing:
232
+ desc: null
233
+ value: 0.0
234
+ save_n_checkpoints:
235
+ desc: null
236
+ value: 10
237
+ hf_repo_id:
238
+ desc: null
239
+ value: koichi12/yans-qwen2-0.5B
240
+ create_public_hf_repo:
241
+ desc: null
242
+ value: false
243
+ upload_all_checkpoints_to_hf:
244
+ desc: null
245
+ value: false
246
+ hf_upload_retry_limit:
247
+ desc: null
248
+ value: 2
249
+ exit_duration_in_mins:
250
+ desc: null
251
+ value: null
252
+ source_key:
253
+ desc: null
254
+ value: null
255
+ target_key:
256
+ desc: null
257
+ value: null
258
+ attn_implementation:
259
+ desc: null
260
+ value: flash_attention_2
261
+ efficient_instruction_tuning:
262
+ desc: null
263
+ value: false
264
+ remove_padding_masking:
265
+ desc: null
266
+ value: false
267
+ save_start_iter:
268
+ desc: null
269
+ value: null
270
+ rank:
271
+ desc: null
272
+ value: 0
273
+ world_size:
274
+ desc: null
275
+ value: 1
276
+ padded_vocab_size:
277
+ desc: null
278
+ value: 151680
279
+ gradient_accumulation_steps:
280
+ desc: null
281
+ value: 320
282
+ _wandb:
283
+ desc: null
284
+ value:
285
+ python_version: 3.10.12
286
+ cli_version: 0.16.3
287
+ framework: huggingface
288
+ huggingface_version: 4.43.3
289
+ is_jupyter_run: false
290
+ is_kaggle_kernel: false
291
+ start_time: 1723659136.24386
292
+ t:
293
+ 1:
294
+ - 1
295
+ - 11
296
+ - 49
297
+ - 55
298
+ - 71
299
+ 2:
300
+ - 1
301
+ - 11
302
+ - 49
303
+ - 55
304
+ - 71
305
+ 3:
306
+ - 13
307
+ - 16
308
+ - 23
309
+ 4: 3.10.12
310
+ 5: 0.16.3
311
+ 6: 4.43.3
312
+ 8:
313
+ - 5
314
+ 13: linux-x86_64
315
+ model_architecture:
316
+ desc: null
317
+ value: Qwen2ForCausalLM
318
+ activation_function:
319
+ desc: null
320
+ value: silu
321
+ hidden_size:
322
+ desc: null
323
+ value: 896
324
+ model_type:
325
+ desc: null
326
+ value: qwen2
327
+ max_position_embeddings:
328
+ desc: null
329
+ value: 4096
330
+ num_attention_heads:
331
+ desc: null
332
+ value: 14
333
+ num_hidden_layers:
334
+ desc: null
335
+ value: 24
wandb/run-20240815_031216-0szn78ph/files/output.log ADDED
@@ -0,0 +1,92 @@
1
+ Created Hugging Face repository with ID koichi12/yans-qwen2-0.5B.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
5
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
6
+ Loaded model state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/model.pt
7
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
8
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
9
+ BFloat16 enabled for mixed precision - using bfSixteen policy
10
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
11
+ warnings.warn(
12
+ --> applying fsdp activation checkpointing...
13
+ > datasets target sizes (minimum size):
14
+ train: 6400000
15
+ validation: 6403200
16
+ test: 3200
17
+ > building train, validation, and test datasets for GPT ...
18
+ > finished creating GPT datasets ...
19
+ Loading optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
20
+ Let split = None
21
+ Building a BlendedDataset for a single MegatronDataset
22
+ Unable to save the indexes because path_to_cache is None
23
+ Building a BlendedDataset for a single MegatronDataset
24
+ Unable to save the indexes because path_to_cache is None
25
+ Building a BlendedDataset for a single MegatronDataset
26
+ Unable to save the indexes because path_to_cache is None
27
+ Loaded optimizer state dict from /work/llm_recipes/models/yans-qwen2-0.5B/iter_0001160/optimizer.pt
28
+ model info: FullyShardedDataParallel(
29
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
30
+ (model): Qwen2Model(
31
+ (embed_tokens): Embedding(151936, 896)
32
+ (layers): ModuleList(
33
+ (0-23): 24 x FullyShardedDataParallel(
34
+ (_fsdp_wrapped_module): CheckpointWrapper(
35
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
36
+ (self_attn): Qwen2FlashAttention2(
37
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
38
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
39
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
40
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
41
+ (rotary_emb): Qwen2RotaryEmbedding()
42
+ )
43
+ (mlp): Qwen2MLP(
44
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
45
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
46
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
47
+ (act_fn): SiLU()
48
+ )
49
+ (input_layernorm): Qwen2RMSNorm()
50
+ (post_attention_layernorm): Qwen2RMSNorm()
51
+ )
52
+ )
53
+ )
54
+ )
55
+ (norm): Qwen2RMSNorm()
56
+ )
57
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
58
+ )
59
+ )
60
+ model config: Qwen2Config {
61
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
62
+ "architectures": [
63
+ "Qwen2ForCausalLM"
64
+ ],
65
+ "attention_dropout": 0.0,
66
+ "bos_token_id": 151643,
67
+ "eos_token_id": 151643,
68
+ "hidden_act": "silu",
69
+ "hidden_size": 896,
70
+ "initializer_range": 0.02,
71
+ "intermediate_size": 4864,
72
+ "label_smoothing": 0.0,
73
+ "max_position_embeddings": 4096,
74
+ "max_window_layers": 24,
75
+ "model_type": "qwen2",
76
+ "num_attention_heads": 14,
77
+ "num_hidden_layers": 24,
78
+ "num_key_value_heads": 2,
79
+ "rms_norm_eps": 1e-06,
80
+ "rope_theta": 1000000.0,
81
+ "sliding_window": null,
82
+ "tie_word_embeddings": true,
83
+ "torch_dtype": "bfloat16",
84
+ "transformers_version": "4.43.3",
85
+ "use_cache": false,
86
+ "use_sliding_window": false,
87
+ "vocab_size": 151936
88
+ }
89
+ [rank0]:[2024-08-15 03:12:42,940] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
90
+ ------------------------------------------------------------------
91
+ iteration: 1161 , TFLOPS: 67.46644597716896, Tokens per sec: 16778.56616965974, Loss: 2.442603349685669
92
+ ------------------------------------------------------------------
wandb/run-20240815_031216-0szn78ph/files/requirements.txt ADDED
@@ -0,0 +1,293 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.33.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ apex==0.1
7
+ appdirs==1.4.4
8
+ argon2-cffi-bindings==21.2.0
9
+ argon2-cffi==23.1.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ audioread==3.0.1
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ blis==0.7.11
18
+ cachetools==5.3.2
19
+ catalogue==2.0.10
20
+ certifi==2024.2.2
21
+ cffi==1.16.0
22
+ chardet==5.2.0
23
+ charset-normalizer==3.3.2
24
+ click==8.1.7
25
+ cloudpathlib==0.16.0
26
+ cloudpickle==3.0.0
27
+ cmake==3.28.1
28
+ colorama==0.4.6
29
+ comm==0.2.1
30
+ confection==0.1.4
31
+ contourpy==1.2.0
32
+ cubinlinker==0.3.0+2.g405ac64
33
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
34
+ cudf==23.12.0
35
+ cugraph-dgl==23.12.0
36
+ cugraph-service-client==23.12.0
37
+ cugraph-service-server==23.12.0
38
+ cugraph==23.12.0
39
+ cuml==23.12.0
40
+ cupy-cuda12x==12.3.0
41
+ cycler==0.12.1
42
+ cymem==2.0.8
43
+ cython==3.0.8
44
+ dask-cuda==23.12.0
45
+ dask-cudf==23.12.0
46
+ dask==2023.11.0
47
+ dataproperty==1.0.1
48
+ datasets==2.20.0
49
+ debugpy==1.8.1
50
+ decorator==5.1.1
51
+ defusedxml==0.7.1
52
+ dill==0.3.8
53
+ distributed==2023.11.0
54
+ dm-tree==0.1.8
55
+ docker-pycreds==0.4.0
56
+ einops==0.7.0
57
+ evaluate==0.4.2
58
+ exceptiongroup==1.2.0
59
+ execnet==2.0.2
60
+ executing==2.0.1
61
+ expecttest==0.1.3
62
+ fastjsonschema==2.19.1
63
+ fastrlock==0.8.2
64
+ filelock==3.13.1
65
+ flash-attn==2.4.2
66
+ fonttools==4.48.1
67
+ frozenlist==1.4.1
68
+ fsspec==2023.12.2
69
+ gast==0.5.4
70
+ gitdb==4.0.11
71
+ gitpython==3.1.43
72
+ google-auth-oauthlib==0.4.6
73
+ google-auth==2.27.0
74
+ graphsurgeon==0.4.6
75
+ grpcio==1.60.1
76
+ huggingface-hub==0.24.5
77
+ hypothesis==5.35.1
78
+ idna==3.6
79
+ importlib-metadata==7.0.1
80
+ iniconfig==2.0.0
81
+ intel-openmp==2021.4.0
82
+ ipadic==1.0.0
83
+ ipykernel==6.29.2
84
+ ipython-genutils==0.2.0
85
+ ipython==8.21.0
86
+ jedi==0.19.1
87
+ jinja2==3.1.3
88
+ joblib==1.3.2
89
+ json5==0.9.14
90
+ jsonlines==4.0.0
91
+ jsonnet==0.19.1
92
+ jsonschema-specifications==2023.12.1
93
+ jsonschema==4.21.1
94
+ jupyter-client==8.6.0
95
+ jupyter-core==5.7.1
96
+ jupyter-tensorboard==0.2.0
97
+ jupyterlab-pygments==0.3.0
98
+ jupyterlab-server==1.2.0
99
+ jupyterlab==2.3.2
100
+ jupytext==1.16.1
101
+ kiwisolver==1.4.5
102
+ langcodes==3.3.0
103
+ lazy-loader==0.3
104
+ librosa==0.10.1
105
+ llvmlite==0.40.1
106
+ lm-eval==0.4.3
107
+ locket==1.0.0
108
+ logzero==1.7.0
109
+ lxml==5.2.2
110
+ markdown-it-py==3.0.0
111
+ markdown==3.5.2
112
+ markupsafe==2.1.4
113
+ matplotlib-inline==0.1.6
114
+ matplotlib==3.8.2
115
+ mbstrdecoder==1.1.3
116
+ mdit-py-plugins==0.4.0
117
+ mdurl==0.1.2
118
+ mecab-python3==1.0.6
119
+ mistune==3.0.2
120
+ mkl-devel==2021.1.1
121
+ mkl-include==2021.1.1
122
+ mkl==2021.1.1
123
+ mock==5.1.0
124
+ more-itertools==9.1.0
125
+ mpmath==1.3.0
126
+ msgpack==1.0.7
127
+ multidict==6.0.4
128
+ multiprocess==0.70.16
129
+ murmurhash==1.0.10
130
+ nbclient==0.9.0
131
+ nbconvert==7.16.0
132
+ nbformat==5.9.2
133
+ nest-asyncio==1.6.0
134
+ networkx==2.6.3
135
+ ninja==1.11.1.1
136
+ nltk==3.8.1
137
+ notebook==6.4.10
138
+ numba==0.57.1+1.g1ff679645
139
+ numexpr==2.10.1
140
+ numpy==1.24.4
141
+ nvfuser==0.1.4a0+d0bb811
142
+ nvidia-dali-cuda120==1.34.0
143
+ nvidia-pyindex==1.0.9
144
+ nvtx==0.2.5
145
+ oauthlib==3.2.2
146
+ onnx==1.15.0rc2
147
+ opencv==4.7.0
148
+ optree==0.10.0
149
+ packaging==23.2
150
+ pandas==1.5.3
151
+ pandocfilters==1.5.1
152
+ parso==0.8.3
153
+ partd==1.4.1
154
+ pathvalidate==3.2.0
155
+ peft==0.11.1
156
+ pexpect==4.9.0
157
+ pillow==10.2.0
158
+ pip==24.0
159
+ platformdirs==4.2.0
160
+ pluggy==1.4.0
161
+ ply==3.11
162
+ polygraphy==0.49.4
163
+ pooch==1.8.0
164
+ portalocker==2.10.1
165
+ preshed==3.0.9
166
+ prettytable==3.9.0
167
+ prometheus-client==0.19.0
168
+ prompt-toolkit==3.0.43
169
+ protobuf==4.24.4
170
+ psutil==5.9.4
171
+ ptxcompiler==0.8.1+2.g0d406d6
172
+ ptyprocess==0.7.0
173
+ pure-eval==0.2.2
174
+ pyarrow-hotfix==0.6
175
+ pyarrow==17.0.0
176
+ pyasn1-modules==0.3.0
177
+ pyasn1==0.5.1
178
+ pybind11-global==2.11.1
179
+ pybind11==2.11.1
180
+ pycocotools==2.0+nv0.8.0
181
+ pycparser==2.21
182
+ pydantic-core==2.16.2
183
+ pydantic==2.6.1
184
+ pygments==2.17.2
185
+ pylibcugraph==23.12.0
186
+ pylibcugraphops==23.12.0
187
+ pylibraft==23.12.0
188
+ pynvml==11.4.1
189
+ pyparsing==3.1.1
190
+ pytablewriter==1.2.0
191
+ pytest-flakefinder==1.1.0
192
+ pytest-rerunfailures==13.0
193
+ pytest-shard==0.1.2
194
+ pytest-xdist==3.5.0
195
+ pytest==8.0.0
196
+ python-dateutil==2.8.2
197
+ python-dotenv==1.0.0
198
+ python-hostlist==1.23.0
199
+ pytorch-quantization==2.1.2
200
+ pytz==2023.3.post1
201
+ pyyaml==6.0.1
202
+ pyzmq==25.1.2
203
+ raft-dask==23.12.0
204
+ rapids-dask-dependency==23.12.1
205
+ referencing==0.33.0
206
+ regex==2023.12.25
207
+ requests-oauthlib==1.3.1
208
+ requests==2.32.3
209
+ rich==13.7.0
210
+ rmm==23.12.0
211
+ rouge-score==0.1.2
212
+ rpds-py==0.17.1
213
+ rsa==4.9
214
+ sacrebleu==2.4.0
215
+ safetensors==0.4.3
216
+ scikit-learn==1.2.0
217
+ scipy==1.12.0
218
+ send2trash==1.8.2
219
+ sentencepiece==0.1.99
220
+ sentry-sdk==2.12.0
221
+ setproctitle==1.3.3
222
+ setuptools==68.2.2
223
+ six==1.16.0
224
+ smart-open==6.4.0
225
+ smmap==5.0.1
226
+ sortedcontainers==2.4.0
227
+ soundfile==0.12.1
228
+ soupsieve==2.5
229
+ soxr==0.3.7
230
+ spacy-legacy==3.0.12
231
+ spacy-loggers==1.0.5
232
+ spacy==3.7.2
233
+ sphinx-glpi-theme==0.6
234
+ sqlitedict==2.1.0
235
+ srsly==2.4.8
236
+ stack-data==0.6.3
237
+ sympy==1.12
238
+ tabledata==1.3.3
239
+ tabulate==0.9.0
240
+ tbb==2021.11.0
241
+ tblib==3.0.0
242
+ tcolorpy==0.1.6
243
+ tensorboard-data-server==0.6.1
244
+ tensorboard-plugin-wit==1.8.1
245
+ tensorboard==2.9.0
246
+ tensorrt==8.6.3
247
+ terminado==0.18.0
248
+ termplotlib==0.3.9
249
+ thinc==8.2.3
250
+ threadpoolctl==3.2.0
251
+ thriftpy2==0.4.17
252
+ tinycss2==1.2.1
253
+ tokenizers==0.19.1
254
+ toml==0.10.2
255
+ tomli==2.0.1
256
+ toolz==0.12.1
257
+ torch-tensorrt==2.3.0a0
258
+ torch==2.3.0a0+ebedce2
259
+ torchdata==0.7.1a0
260
+ torchtext==0.17.0a0
261
+ torchvision==0.18.0a0
262
+ tornado==6.4
263
+ tqdm-multiprocess==0.0.11
264
+ tqdm==4.66.5
265
+ traitlets==5.9.0
266
+ transformer-engine==1.3.0+5b90b7f
267
+ transformers==4.43.3
268
+ treelite-runtime==3.9.1
269
+ treelite==3.9.1
270
+ triton==2.2.0+e28a256
271
+ typepy==1.3.2
272
+ typer==0.9.0
273
+ types-dataclasses==0.6.6
274
+ typing-extensions==4.9.0
275
+ ucx-py==0.35.0
276
+ uff==0.6.9
277
+ ujson==5.8.0
278
+ urllib3==1.26.18
279
+ wandb==0.16.3
280
+ wasabi==1.1.2
281
+ wcwidth==0.2.13
282
+ weasel==0.3.4
283
+ webencodings==0.5.1
284
+ werkzeug==3.0.1
285
+ wheel==0.42.0
286
+ word2number==1.1
287
+ xdoctest==1.0.2
288
+ xgboost==1.7.6
289
+ xxhash==3.4.1
290
+ yarl==1.9.4
291
+ zict==3.0.0
292
+ zipp==3.17.0
293
+ zstandard==0.23.0
wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json ADDED
@@ -0,0 +1,215 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-14T18:12:16.980997",
5
+ "startedAt": "2024-08-14T18:12:16.230100",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "4096",
13
+ "--micro-batch-size",
14
+ "1",
15
+ "--global-batch-size",
16
+ "320",
17
+ "--train-iters",
18
+ "20000",
19
+ "--tokenizer-type",
20
+ "HFPreTrainedTokenizer",
21
+ "--tokenizer-model",
22
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
23
+ "--train-data-path",
24
+ "304771887",
25
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
26
+ "--valid-data-path",
27
+ "304771887",
28
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
29
+ "--test-data-path",
30
+ "304771887",
31
+ "/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document",
32
+ "--lr",
33
+ "2e-5",
34
+ "--min-lr",
35
+ "1e-6",
36
+ "--lr-decay-style",
37
+ "cosine",
38
+ "--lr-warmup-iters",
39
+ "500",
40
+ "--lr-decay-iters",
41
+ "20000",
42
+ "--weight-decay",
43
+ "0.1",
44
+ "--grad-clip-norm",
45
+ "1.0",
46
+ "--optimizer",
47
+ "adam",
48
+ "--adam-beta1",
49
+ "0.9",
50
+ "--adam-beta2",
51
+ "0.95",
52
+ "--adam-eps",
53
+ "1e-6",
54
+ "--save-interval",
55
+ "10",
56
+ "--eval-interval",
57
+ "10",
58
+ "--eval-iters",
59
+ "10",
60
+ "--bf16",
61
+ "--mixed-precision",
62
+ "--base-model",
63
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
64
+ "--save",
65
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
66
+ "--load",
67
+ "/work/llm_recipes/models/yans-qwen2-0.5B",
68
+ "--fsdp-activation-checkpointing",
69
+ "--sharding-strategy",
70
+ "FULL_SHARD",
71
+ "--checkpoint-type",
72
+ "LOCAL_STATE_DICT",
73
+ "--save-n-checkpoints",
74
+ "10",
75
+ "--hf-upload-retry-limit",
76
+ "2",
77
+ "--hf-repo-id",
78
+ "koichi12/yans-qwen2-0.5B",
79
+ "--wandb-entity",
80
+ "iwakawa-koichi-q5-tohoku-nlp6723",
81
+ "--wandb-project",
82
+ "llm_tutorial",
83
+ "--wandb-name",
84
+ "yans-qwen2-0.5B_train_2024-08-15-03:11:59"
85
+ ],
86
+ "state": "running",
87
+ "program": "/project/examples/finetuning.py",
88
+ "codePathLocal": "examples/finetuning.py",
89
+ "codePath": "examples/finetuning.py",
90
+ "git": {
91
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
92
+ "commit": "6da01327e78c302bc0cfdb335f3ca297e2a19c8c"
93
+ },
94
+ "email": null,
95
+ "root": "/project",
96
+ "host": "gpu-koiwa-00",
97
+ "username": "koiwa",
98
+ "executable": "/usr/bin/python",
99
+ "cpu_count": 18,
100
+ "cpu_count_logical": 18,
101
+ "cpu_freq": {
102
+ "current": 2400.0389999999993,
103
+ "min": 0.0,
104
+ "max": 0.0
105
+ },
106
+ "cpu_freq_per_core": [
107
+ {
108
+ "current": 2400.039,
109
+ "min": 0.0,
110
+ "max": 0.0
111
+ },
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ }
197
+ ],
198
+ "disk": {
199
+ "/": {
200
+ "total": 0.0625,
201
+ "used": 1.1444091796875e-05
202
+ }
203
+ },
204
+ "gpu": "NVIDIA A100-SXM4-40GB",
205
+ "gpu_count": 1,
206
+ "gpu_devices": [
207
+ {
208
+ "name": "NVIDIA A100-SXM4-40GB",
209
+ "memory_total": 42949672960
210
+ }
211
+ ],
212
+ "memory": {
213
+ "total": 56.48782730102539
214
+ }
215
+ }
wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 168}, "training/loss": 2.442603349685669, "training/perplexity": 11.502947992429535, "utils/batch_size": 1, "utils/global_batch_size": 320, "utils/seq_len": 4097, "utils/gradient_accumulation_steps": 320, "utils/iteration": 1161, "optimizer/lr": 1.9946184158325198e-05, "optimizer/variance_l2": 0.004682497095771901, "optimizer/variance_sqrt_l2": 0.5343142380105511, "optimizer/momentum_l2": 0.12459250428605805, "optimizer/weight_l2": 825.0639369164065, "optimizer/variance_l1": 0.2849578857421875, "optimizer/variance_sqrt_l1": 4625.0, "optimizer/momentum_l1": 977.875, "optimizer/weight_l1": 6918144.0, "optimizer/variance_abs_max": 0.0030059814453125, "optimizer/variance_sqrt_abs_max": 0.054931640625, "optimizer/momentum_abs_max": 0.0108642578125, "optimizer/weight_abs_max": 175.0, "stats/1_iteration_time": 78.13778523999986, "stats/tokens_per_sec": 16778.56616965974, "stats/tokens_per_sec_per_gpu": 16778.56616965974, "stats/tflops": 67.46644597716896, "_timestamp": 1723659241.8232834, "_runtime": 105.57942342758179, "_step": 1161}
wandb/run-20240815_031216-0szn78ph/logs/debug-internal.log ADDED
@@ -0,0 +1,260 @@
1
+ 2024-08-15 03:12:16,244 INFO StreamThr :10026 [internal.py:wandb_internal():86] W&B internal server running at pid: 10026, started at: 2024-08-15 03:12:16.243481
2
+ 2024-08-15 03:12:16,245 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-15 03:12:16,248 INFO WriterThread:10026 [datastore.py:open_for_write():87] open: /project/wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb
4
+ 2024-08-15 03:12:16,249 DEBUG SenderThread:10026 [sender.py:send():382] send: header
5
+ 2024-08-15 03:12:16,409 DEBUG SenderThread:10026 [sender.py:send():382] send: run
6
+ 2024-08-15 03:12:16,887 INFO SenderThread:10026 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240815_031216-0szn78ph/files
7
+ 2024-08-15 03:12:16,887 INFO SenderThread:10026 [sender.py:_start_run_threads():1136] run started: 0szn78ph with start time 1723659136.24386
8
+ 2024-08-15 03:12:16,892 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-15 03:12:16,892 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-15 03:12:16,962 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-15 03:12:16,969 DEBUG HandlerThread:10026 [system_info.py:__init__():27] System info init
12
+ 2024-08-15 03:12:16,969 DEBUG HandlerThread:10026 [system_info.py:__init__():42] System info init done
13
+ 2024-08-15 03:12:16,969 INFO HandlerThread:10026 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-15 03:12:16,969 INFO SystemMonitor:10026 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-15 03:12:16,969 INFO HandlerThread:10026 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-15 03:12:16,969 INFO SystemMonitor:10026 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-15 03:12:16,970 INFO SystemMonitor:10026 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-15 03:12:16,971 INFO SystemMonitor:10026 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-15 03:12:16,972 INFO SystemMonitor:10026 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-15 03:12:16,972 INFO SystemMonitor:10026 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-15 03:12:16,980 DEBUG HandlerThread:10026 [system_info.py:probe():151] Probing system
22
+ 2024-08-15 03:12:16,983 DEBUG HandlerThread:10026 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-15 03:12:16,995 DEBUG HandlerThread:10026 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-15 03:12:16,995 DEBUG HandlerThread:10026 [system_info.py:probe():199] Probing system done
25
+ 2024-08-15 03:12:16,995 DEBUG HandlerThread:10026 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-14T18:12:16.980997', 'startedAt': '2024-08-14T18:12:16.230100', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '4096', '--micro-batch-size', '1', '--global-batch-size', '320', '--train-iters', '20000', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--valid-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--test-data-path', '304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '20000', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'adam', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/yans-qwen2-0.5B', '--load', '/work/llm_recipes/models/yans-qwen2-0.5B', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/yans-qwen2-0.5B', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial', '--wandb-name', 'yans-qwen2-0.5B_train_2024-08-15-03:11:59'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '6da01327e78c302bc0cfdb335f3ca297e2a19c8c'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.48782730102539}}
26
+ 2024-08-15 03:12:16,995 INFO HandlerThread:10026 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-15 03:12:16,995 INFO HandlerThread:10026 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-15 03:12:16,997 INFO HandlerThread:10026 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-15 03:12:17,023 DEBUG SenderThread:10026 [sender.py:send():382] send: files
30
+ 2024-08-15 03:12:17,024 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-15 03:12:17,033 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-15 03:12:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
33
+ 2024-08-15 03:12:17,034 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: python_packages
34
+ 2024-08-15 03:12:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
35
+ 2024-08-15 03:12:17,036 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-15 03:12:17,320 DEBUG SenderThread:10026 [sender.py:send():382] send: telemetry
37
+ 2024-08-15 03:12:17,786 INFO wandb-upload_0:10026 [upload_job.py:push():131] Uploaded file /tmp/tmp2lpzau9swandb/2fbn8bzg-wandb-metadata.json
38
+ 2024-08-15 03:12:17,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
39
+ 2024-08-15 03:12:17,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json
40
+ 2024-08-15 03:12:17,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/requirements.txt
41
+ 2024-08-15 03:12:19,889 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
42
+ 2024-08-15 03:12:21,867 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
43
+ 2024-08-15 03:12:21,890 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
44
+ 2024-08-15 03:12:22,891 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
45
+ 2024-08-15 03:12:26,867 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
46
+ 2024-08-15 03:12:31,868 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
47
+ 2024-08-15 03:12:32,032 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
48
+ 2024-08-15 03:12:32,033 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
49
+ 2024-08-15 03:12:32,033 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
50
+ 2024-08-15 03:12:37,282 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
51
+ 2024-08-15 03:12:37,900 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
52
+ 2024-08-15 03:12:38,901 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
53
+ 2024-08-15 03:12:39,901 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
54
+ 2024-08-15 03:12:40,902 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
55
+ 2024-08-15 03:12:42,647 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
56
+ 2024-08-15 03:12:43,260 DEBUG SenderThread:10026 [sender.py:send():382] send: config
57
+ 2024-08-15 03:12:43,261 DEBUG SenderThread:10026 [sender.py:send():382] send: config
58
+ 2024-08-15 03:12:43,904 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
59
+ 2024-08-15 03:12:44,904 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
60
+ 2024-08-15 03:12:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
61
+ 2024-08-15 03:12:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
62
+ 2024-08-15 03:12:47,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
63
+ 2024-08-15 03:12:48,218 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
64
+ 2024-08-15 03:12:48,907 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/config.yaml
65
+ 2024-08-15 03:12:53,411 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
66
+ 2024-08-15 03:12:58,411 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
67
+ 2024-08-15 03:13:02,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
68
+ 2024-08-15 03:13:02,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
69
+ 2024-08-15 03:13:02,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
70
+ 2024-08-15 03:13:04,284 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
71
+ 2024-08-15 03:13:09,285 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
72
+ 2024-08-15 03:13:14,285 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
73
+ 2024-08-15 03:13:16,973 DEBUG SystemMonitor:10026 [system_monitor.py:_start():172] Starting system metrics aggregation loop
74
+ 2024-08-15 03:13:16,974 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
75
+ 2024-08-15 03:13:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
76
+ 2024-08-15 03:13:17,034 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
77
+ 2024-08-15 03:13:17,078 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
78
+ 2024-08-15 03:13:19,286 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
79
+ 2024-08-15 03:13:24,287 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
80
+ 2024-08-15 03:13:29,288 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
81
+ 2024-08-15 03:13:32,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
82
+ 2024-08-15 03:13:32,034 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
83
+ 2024-08-15 03:13:32,078 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
84
+ 2024-08-15 03:13:35,214 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
85
+ 2024-08-15 03:13:40,215 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
86
+ 2024-08-15 03:13:45,216 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
87
+ 2024-08-15 03:13:46,975 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
88
+ 2024-08-15 03:13:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
89
+ 2024-08-15 03:13:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
90
+ 2024-08-15 03:13:47,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
91
+ 2024-08-15 03:13:50,291 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
92
+ 2024-08-15 03:13:55,292 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
93
+ 2024-08-15 03:14:00,292 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
94
+ 2024-08-15 03:14:01,824 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: partial_history
95
+ 2024-08-15 03:14:01,949 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
96
+ 2024-08-15 03:14:02,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
97
+ 2024-08-15 03:14:02,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
98
+ 2024-08-15 03:14:02,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
99
+ 2024-08-15 03:14:06,235 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
100
+ 2024-08-15 03:14:11,236 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
101
+ 2024-08-15 03:14:16,236 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
102
+ 2024-08-15 03:14:16,976 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
103
+ 2024-08-15 03:14:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
104
+ 2024-08-15 03:14:17,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
105
+ 2024-08-15 03:14:17,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
106
+ 2024-08-15 03:14:21,267 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
107
+ 2024-08-15 03:14:26,267 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
108
+ 2024-08-15 03:14:31,268 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
109
+ 2024-08-15 03:14:32,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
110
+ 2024-08-15 03:14:32,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
111
+ 2024-08-15 03:14:32,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
112
+ 2024-08-15 03:14:37,220 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
113
+ 2024-08-15 03:14:42,221 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
114
+ 2024-08-15 03:14:46,977 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
115
+ 2024-08-15 03:14:47,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
116
+ 2024-08-15 03:14:47,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
117
+ 2024-08-15 03:14:47,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
118
+ 2024-08-15 03:14:48,220 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
119
+ 2024-08-15 03:14:53,220 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
120
+ 2024-08-15 03:14:58,221 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
121
+ 2024-08-15 03:15:02,034 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: internal_messages
122
+ 2024-08-15 03:15:02,035 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: stop_status
123
+ 2024-08-15 03:15:02,035 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: stop_status
124
+ 2024-08-15 03:15:03,261 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
125
+ 2024-08-15 03:15:05,634 DEBUG SenderThread:10026 [sender.py:send():382] send: exit
126
+ 2024-08-15 03:15:05,634 INFO SenderThread:10026 [sender.py:send_exit():589] handling exit code: 255
127
+ 2024-08-15 03:15:05,634 INFO SenderThread:10026 [sender.py:send_exit():591] handling runtime: 168
128
+ 2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
129
+ 2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:send_exit():597] send defer
130
+ 2024-08-15 03:15:05,636 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
131
+ 2024-08-15 03:15:05,636 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 0
132
+ 2024-08-15 03:15:05,636 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
133
+ 2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 0
134
+ 2024-08-15 03:15:05,636 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 1
135
+ 2024-08-15 03:15:05,636 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
136
+ 2024-08-15 03:15:05,637 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 1
137
+ 2024-08-15 03:15:05,637 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
138
+ 2024-08-15 03:15:05,637 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 1
139
+ 2024-08-15 03:15:05,637 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 2
140
+ 2024-08-15 03:15:05,637 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
141
+ 2024-08-15 03:15:05,637 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 2
142
+ 2024-08-15 03:15:05,637 INFO HandlerThread:10026 [system_monitor.py:finish():203] Stopping system monitor
143
+ 2024-08-15 03:15:05,637 DEBUG SystemMonitor:10026 [system_monitor.py:_start():179] Finished system metrics aggregation loop
144
+ 2024-08-15 03:15:05,637 DEBUG SystemMonitor:10026 [system_monitor.py:_start():183] Publishing last batch of metrics
145
+ 2024-08-15 03:15:05,637 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined cpu monitor
146
+ 2024-08-15 03:15:05,639 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined disk monitor
147
+ 2024-08-15 03:15:05,671 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined gpu monitor
148
+ 2024-08-15 03:15:05,672 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined memory monitor
149
+ 2024-08-15 03:15:05,672 INFO HandlerThread:10026 [interfaces.py:finish():202] Joined network monitor
150
+ 2024-08-15 03:15:05,672 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
151
+ 2024-08-15 03:15:05,672 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 2
152
+ 2024-08-15 03:15:05,672 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 3
153
+ 2024-08-15 03:15:05,672 DEBUG SenderThread:10026 [sender.py:send():382] send: stats
154
+ 2024-08-15 03:15:05,673 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
155
+ 2024-08-15 03:15:05,673 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 3
156
+ 2024-08-15 03:15:05,676 DEBUG SenderThread:10026 [sender.py:send():382] send: history
157
+ 2024-08-15 03:15:05,676 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: summary_record
158
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
159
+ 2024-08-15 03:15:05,677 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
160
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 3
161
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 4
162
+ 2024-08-15 03:15:05,677 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
163
+ 2024-08-15 03:15:05,677 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 4
164
+ 2024-08-15 03:15:05,677 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
165
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 4
166
+ 2024-08-15 03:15:05,677 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 5
167
+ 2024-08-15 03:15:05,677 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
168
+ 2024-08-15 03:15:05,677 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 5
169
+ 2024-08-15 03:15:05,678 DEBUG SenderThread:10026 [sender.py:send():382] send: summary
170
+ 2024-08-15 03:15:05,679 INFO SenderThread:10026 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
171
+ 2024-08-15 03:15:05,679 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
172
+ 2024-08-15 03:15:05,679 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 5
173
+ 2024-08-15 03:15:05,679 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 6
174
+ 2024-08-15 03:15:05,679 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
175
+ 2024-08-15 03:15:05,679 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 6
176
+ 2024-08-15 03:15:05,680 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
177
+ 2024-08-15 03:15:05,680 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 6
178
+ 2024-08-15 03:15:05,680 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 7
179
+ 2024-08-15 03:15:05,680 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
180
+ 2024-08-15 03:15:05,680 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
181
+ 2024-08-15 03:15:05,680 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 7
182
+ 2024-08-15 03:15:05,680 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
183
+ 2024-08-15 03:15:05,680 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 7
184
+ 2024-08-15 03:15:05,984 INFO Thread-12 :10026 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json
185
+ 2024-08-15 03:15:06,481 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 8
186
+ 2024-08-15 03:15:06,481 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
187
+ 2024-08-15 03:15:06,481 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 8
188
+ 2024-08-15 03:15:06,481 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
189
+ 2024-08-15 03:15:06,481 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 8
190
+ 2024-08-15 03:15:06,481 INFO SenderThread:10026 [job_builder.py:build():296] Attempting to build job artifact
191
+ 2024-08-15 03:15:06,482 INFO SenderThread:10026 [job_builder.py:_get_source_type():426] is repo sourced job
192
+ 2024-08-15 03:15:06,507 INFO SenderThread:10026 [job_builder.py:build():402] adding wandb-job metadata file
193
+ 2024-08-15 03:15:06,516 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 9
194
+ 2024-08-15 03:15:06,517 DEBUG SenderThread:10026 [sender.py:send():382] send: artifact
195
+ 2024-08-15 03:15:06,517 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
196
+ 2024-08-15 03:15:06,518 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 9
197
+ 2024-08-15 03:15:06,633 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: poll_exit
198
+ 2024-08-15 03:15:06,985 INFO Thread-12 :10026 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240815_031216-0szn78ph/files/output.log
199
+ 2024-08-15 03:15:08,040 INFO wandb-upload_0:10026 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpb932s___
200
+ 2024-08-15 03:15:08,047 INFO wandb-upload_1:10026 [upload_job.py:push():89] Uploaded file /singularity_home/.local/share/wandb/artifacts/staging/tmpl85vnluw
201
+ 2024-08-15 03:15:09,160 INFO SenderThread:10026 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE1MDEyMDEwMQ==', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjM2MjY3MjMzNA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE0MDA5NDY1MQ==', 'versionIndex': 9}}}
202
+ 2024-08-15 03:15:09,160 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
203
+ 2024-08-15 03:15:09,160 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 9
204
+ 2024-08-15 03:15:09,160 INFO SenderThread:10026 [dir_watcher.py:finish():358] shutting down directory watcher
205
+ 2024-08-15 03:15:09,986 INFO SenderThread:10026 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240815_031216-0szn78ph/files
206
+ 2024-08-15 03:15:09,987 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/requirements.txt requirements.txt
207
+ 2024-08-15 03:15:09,987 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/config.yaml config.yaml
208
+ 2024-08-15 03:15:09,988 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-metadata.json wandb-metadata.json
209
+ 2024-08-15 03:15:09,989 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json wandb-summary.json
210
+ 2024-08-15 03:15:09,990 INFO SenderThread:10026 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240815_031216-0szn78ph/files/output.log output.log
211
+ 2024-08-15 03:15:09,992 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 10
212
+ 2024-08-15 03:15:09,992 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: poll_exit
213
+ 2024-08-15 03:15:09,992 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
214
+ 2024-08-15 03:15:09,993 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 10
215
+ 2024-08-15 03:15:09,994 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
216
+ 2024-08-15 03:15:09,994 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 10
217
+ 2024-08-15 03:15:09,994 INFO SenderThread:10026 [file_pusher.py:finish():172] shutting down file pusher
218
+ 2024-08-15 03:15:10,399 INFO wandb-upload_1:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/config.yaml
219
+ 2024-08-15 03:15:10,439 INFO wandb-upload_0:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/requirements.txt
220
+ 2024-08-15 03:15:10,453 INFO wandb-upload_2:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/wandb-summary.json
221
+ 2024-08-15 03:15:10,537 INFO wandb-upload_3:10026 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240815_031216-0szn78ph/files/output.log
222
+ 2024-08-15 03:15:10,635 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: poll_exit
223
+ 2024-08-15 03:15:10,635 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: poll_exit
224
+ 2024-08-15 03:15:10,737 INFO Thread-11 (_thread_body):10026 [sender.py:transition_state():617] send defer: 11
225
+ 2024-08-15 03:15:10,738 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
226
+ 2024-08-15 03:15:10,738 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 11
227
+ 2024-08-15 03:15:10,738 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
228
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 11
229
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [file_pusher.py:join():178] waiting for file pusher
230
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 12
231
+ 2024-08-15 03:15:10,738 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
232
+ 2024-08-15 03:15:10,738 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 12
233
+ 2024-08-15 03:15:10,738 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
234
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 12
235
+ 2024-08-15 03:15:10,738 INFO SenderThread:10026 [file_stream.py:finish():595] file stream finish called
236
+ 2024-08-15 03:15:11,367 INFO SenderThread:10026 [file_stream.py:finish():599] file stream finish is done
237
+ 2024-08-15 03:15:11,368 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 13
238
+ 2024-08-15 03:15:11,368 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
239
+ 2024-08-15 03:15:11,368 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 13
240
+ 2024-08-15 03:15:11,368 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
241
+ 2024-08-15 03:15:11,368 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 13
242
+ 2024-08-15 03:15:11,368 INFO SenderThread:10026 [sender.py:transition_state():617] send defer: 14
243
+ 2024-08-15 03:15:11,369 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: defer
244
+ 2024-08-15 03:15:11,369 DEBUG SenderThread:10026 [sender.py:send():382] send: final
245
+ 2024-08-15 03:15:11,369 INFO HandlerThread:10026 [handler.py:handle_request_defer():172] handle defer: 14
246
+ 2024-08-15 03:15:11,369 DEBUG SenderThread:10026 [sender.py:send():382] send: footer
247
+ 2024-08-15 03:15:11,369 DEBUG SenderThread:10026 [sender.py:send_request():409] send_request: defer
248
+ 2024-08-15 03:15:11,369 INFO SenderThread:10026 [sender.py:send_request_defer():613] handle sender defer: 14
249
+ 2024-08-15 03:15:14,370 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
250
+ 2024-08-15 03:15:19,370 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
251
+ 2024-08-15 03:15:24,371 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
252
+ 2024-08-15 03:15:29,371 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
253
+ 2024-08-15 03:15:34,372 DEBUG HandlerThread:10026 [handler.py:handle_request():146] handle_request: status_report
254
+ 2024-08-15 03:15:37,452 WARNING StreamThr :10026 [internal.py:is_dead():414] Internal process exiting, parent pid 9957 disappeared
255
+ 2024-08-15 03:15:37,452 ERROR StreamThr :10026 [internal.py:wandb_internal():152] Internal process shutdown.
256
+ 2024-08-15 03:15:38,372 INFO SenderThread:10026 [sender.py:finish():1572] shutting down sender
257
+ 2024-08-15 03:15:38,372 INFO SenderThread:10026 [file_pusher.py:finish():172] shutting down file pusher
258
+ 2024-08-15 03:15:38,372 INFO SenderThread:10026 [file_pusher.py:join():178] waiting for file pusher
259
+ 2024-08-15 03:15:38,372 INFO WriterThread:10026 [datastore.py:close():296] close: /project/wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb
260
+ 2024-08-15 03:15:38,373 INFO HandlerThread:10026 [handler.py:finish():869] shutting down handler
wandb/run-20240815_031216-0szn78ph/logs/debug.log ADDED
@@ -0,0 +1,29 @@
1
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
2
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Configure stats pid to 9957
3
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
4
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
5
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train Qwen2'}
6
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
8
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240815_031216-0szn78ph/logs/debug.log
9
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240815_031216-0szn78ph/logs/debug-internal.log
10
+ 2024-08-15 03:12:16,236 INFO MainThread:9957 [wandb_init.py:init():566] calling init triggers
11
+ 2024-08-15 03:12:16,237 INFO MainThread:9957 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'valid_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'test_data_path': ['304771887', '/work/llm_recipes/datasets/bin/sample/llm_jp_corpus_v2_ja_wiki_train_0/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'yans-qwen2-0.5B_train_2024-08-15-03:11:59', 'wandb_project': 'llm_tutorial', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/yans-qwen2-0.5B', 'save': '/work/llm_recipes/models/yans-qwen2-0.5B', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'adam', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 20000, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 20000, 'train_samples': None, 'global_batch_size': 320, 'micro_batch_size': 1, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 4096, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/yans-qwen2-0.5B', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': False, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 320}
13
+ 2024-08-15 03:12:16,237 INFO MainThread:9957 [wandb_init.py:init():616] starting backend
14
+ 2024-08-15 03:12:16,237 INFO MainThread:9957 [wandb_init.py:init():620] setting up manager
15
+ 2024-08-15 03:12:16,241 INFO MainThread:9957 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-08-15 03:12:16,243 INFO MainThread:9957 [wandb_init.py:init():628] backend started and connected
17
+ 2024-08-15 03:12:16,248 INFO MainThread:9957 [wandb_init.py:init():720] updated telemetry
18
+ 2024-08-15 03:12:16,405 INFO MainThread:9957 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-08-15 03:12:16,892 INFO MainThread:9957 [wandb_run.py:_on_init():2262] communicating current version
20
+ 2024-08-15 03:12:16,915 INFO MainThread:9957 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.6 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-08-15 03:12:16,915 INFO MainThread:9957 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-08-15 03:12:17,032 INFO MainThread:9957 [wandb_run.py:_console_start():2241] atexit reg
24
+ 2024-08-15 03:12:17,033 INFO MainThread:9957 [wandb_run.py:_redirect():2096] redirect: wrap_raw
25
+ 2024-08-15 03:12:17,033 INFO MainThread:9957 [wandb_run.py:_redirect():2161] Wrapping output streams.
26
+ 2024-08-15 03:12:17,033 INFO MainThread:9957 [wandb_run.py:_redirect():2186] Redirects installed.
27
+ 2024-08-15 03:12:17,034 INFO MainThread:9957 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-08-15 03:12:43,259 INFO MainThread:9957 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
29
+ 2024-08-15 03:12:43,260 INFO MainThread:9957 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
wandb/run-20240815_031216-0szn78ph/run-0szn78ph.wandb ADDED
Binary file (21.9 kB). View file
 
wandb/run-20240823_162543-eroprw00/files/config.yaml ADDED
@@ -0,0 +1,342 @@
1
+ wandb_version: 1
2
+
3
+ sharding_strategy:
4
+ desc: null
5
+ value: FULL_SHARD
6
+ checkpoint_type:
7
+ desc: null
8
+ value: LOCAL_STATE_DICT
9
+ fsdp_activation_checkpointing:
10
+ desc: null
11
+ value: true
12
+ fsdp_cpu_offload:
13
+ desc: null
14
+ value: false
15
+ low_cpu_fsdp:
16
+ desc: null
17
+ value: false
18
+ no_meta_device:
19
+ desc: null
20
+ value: false
21
+ data_path:
22
+ desc: null
23
+ value: null
24
+ split:
25
+ desc: null
26
+ value: 969, 30, 1
27
+ train_data_path:
28
+ desc: null
29
+ value:
30
+ - '1754785366'
31
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
32
+ - '28623823675'
33
+ - /project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document
34
+ valid_data_path:
35
+ desc: null
36
+ value:
37
+ - '1754785366'
38
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
39
+ test_data_path:
40
+ desc: null
41
+ value:
42
+ - '1754785366'
43
+ - /project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document
44
+ data_cache_path:
45
+ desc: null
46
+ value: null
47
+ vocab_size:
48
+ desc: null
49
+ value: null
50
+ vocab_file:
51
+ desc: null
52
+ value: null
53
+ merge_file:
54
+ desc: null
55
+ value: null
56
+ seq_length:
57
+ desc: null
58
+ value: 4096
59
+ num_workers:
60
+ desc: null
61
+ value: 2
62
+ tokenizer_type:
63
+ desc: null
64
+ value: HFPreTrainedTokenizer
65
+ tokenizer_model:
66
+ desc: null
67
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
68
+ reset_position_ids:
69
+ desc: null
70
+ value: false
71
+ reset_attention_mask:
72
+ desc: null
73
+ value: false
74
+ eod_mask_loss:
75
+ desc: null
76
+ value: false
77
+ retro_return_doc_ids:
78
+ desc: null
79
+ value: false
80
+ short_seq_prob:
81
+ desc: null
82
+ value: 0.1
83
+ vocab_extra_ids:
84
+ desc: null
85
+ value: 0
86
+ seed:
87
+ desc: null
88
+ value: 1234
89
+ use_mpi:
90
+ desc: null
91
+ value: false
92
+ wandb_entity:
93
+ desc: null
94
+ value: iwakawa-koichi-q5-tohoku-nlp6723
95
+ wandb_name:
96
+ desc: null
97
+ value: Qwen2-0.5b-0.2_train_2024-08-23-16:25:30
98
+ wandb_project:
99
+ desc: null
100
+ value: llm_tutorial-0.2
101
+ quantization:
102
+ desc: null
103
+ value: false
104
+ use_freeze_layers:
105
+ desc: null
106
+ value: false
107
+ freeze_layers:
108
+ desc: null
109
+ value: null
110
+ bf16:
111
+ desc: null
112
+ value: true
113
+ fp16:
114
+ desc: null
115
+ value: false
116
+ mixed_precision:
117
+ desc: null
118
+ value: true
119
+ param_dtype:
120
+ desc: null
121
+ value: null
122
+ load:
123
+ desc: null
124
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
125
+ save:
126
+ desc: null
127
+ value: /work/llm_recipes/models/Qwen2-0.5b-0.2
128
+ base_model:
129
+ desc: null
130
+ value: /share/pretrained_lm/Qwen/Qwen2-0.5B
131
+ use_better_transformer:
132
+ desc: null
133
+ value: false
134
+ grad_clip_norm:
135
+ desc: null
136
+ value: 1.0
137
+ eval_interval:
138
+ desc: null
139
+ value: 10
140
+ save_interval:
141
+ desc: null
142
+ value: 10
143
+ eval_iters:
144
+ desc: null
145
+ value: 10
146
+ optimizer:
147
+ desc: null
148
+ value: anyprecision
149
+ lr:
150
+ desc: null
151
+ value: 2.0e-05
152
+ lr_decay_style:
153
+ desc: null
154
+ value: cosine
155
+ lr_decay_iters:
156
+ desc: null
157
+ value: 7500
158
+ lr_warmup_iters:
159
+ desc: null
160
+ value: 500
161
+ min_lr:
162
+ desc: null
163
+ value: 1.0e-06
164
+ train_iters:
165
+ desc: null
166
+ value: 7500
167
+ train_samples:
168
+ desc: null
169
+ value: null
170
+ global_batch_size:
171
+ desc: null
172
+ value: 640
173
+ micro_batch_size:
174
+ desc: null
175
+ value: 5
176
+ make_vocab_size_divisible_by:
177
+ desc: null
178
+ value: 128
179
+ sliding_window_size:
180
+ desc: null
181
+ value: 131072
182
+ skip_batch:
183
+ desc: null
184
+ value: null
185
+ no_save_optimizer_state:
186
+ desc: null
187
+ value: false
188
+ continual_pretraining:
189
+ desc: null
190
+ value: false
191
+ instruction_tuning:
192
+ desc: null
193
+ value: false
194
+ direct_preference_optimization:
195
+ desc: null
196
+ value: false
197
+ attention_dropout:
198
+ desc: null
199
+ value: 0.1
200
+ hidden_dropout:
201
+ desc: null
202
+ value: 0.1
203
+ weight_decay:
204
+ desc: null
205
+ value: 0.1
206
+ adam_beta1:
207
+ desc: null
208
+ value: 0.9
209
+ adam_beta2:
210
+ desc: null
211
+ value: 0.95
212
+ adam_eps:
213
+ desc: null
214
+ value: 1.0e-06
215
+ hf_transformer_model_dir:
216
+ desc: null
217
+ value: null
218
+ instruction_train_data_path:
219
+ desc: null
220
+ value: null
221
+ instruction_valid_data_path:
222
+ desc: null
223
+ value: null
224
+ epoch:
225
+ desc: null
226
+ value: null
227
+ instruction_dataset_size:
228
+ desc: null
229
+ value: null
230
+ save_sampler_state:
231
+ desc: null
232
+ value: false
233
+ label_smoothing:
234
+ desc: null
235
+ value: 0.0
236
+ save_n_checkpoints:
237
+ desc: null
238
+ value: 10
239
+ hf_repo_id:
240
+ desc: null
241
+ value: koichi12/Qwen2-0.5b-0.2
242
+ create_public_hf_repo:
243
+ desc: null
244
+ value: false
245
+ upload_all_checkpoints_to_hf:
246
+ desc: null
247
+ value: true
248
+ hf_upload_retry_limit:
249
+ desc: null
250
+ value: 2
251
+ exit_duration_in_mins:
252
+ desc: null
253
+ value: null
254
+ source_key:
255
+ desc: null
256
+ value: null
257
+ target_key:
258
+ desc: null
259
+ value: null
260
+ attn_implementation:
261
+ desc: null
262
+ value: flash_attention_2
263
+ efficient_instruction_tuning:
264
+ desc: null
265
+ value: false
266
+ remove_padding_masking:
267
+ desc: null
268
+ value: false
269
+ save_start_iter:
270
+ desc: null
271
+ value: null
272
+ valid_micro_batch_size:
273
+ desc: null
274
+ value: 1
275
+ rank:
276
+ desc: null
277
+ value: 0
278
+ world_size:
279
+ desc: null
280
+ value: 1
281
+ padded_vocab_size:
282
+ desc: null
283
+ value: 151680
284
+ gradient_accumulation_steps:
285
+ desc: null
286
+ value: 128
287
+ _wandb:
288
+ desc: null
289
+ value:
290
+ python_version: 3.10.12
291
+ cli_version: 0.16.3
292
+ framework: huggingface
293
+ huggingface_version: 4.43.3
294
+ is_jupyter_run: false
295
+ is_kaggle_kernel: false
296
+ start_time: 1724397943.202675
297
+ t:
298
+ 1:
299
+ - 1
300
+ - 11
301
+ - 49
302
+ - 55
303
+ - 71
304
+ - 105
305
+ 2:
306
+ - 1
307
+ - 11
308
+ - 49
309
+ - 55
310
+ - 71
311
+ - 105
312
+ 3:
313
+ - 13
314
+ - 16
315
+ - 23
316
+ 4: 3.10.12
317
+ 5: 0.16.3
318
+ 6: 4.43.3
319
+ 8:
320
+ - 5
321
+ 13: linux-x86_64
322
+ model_architecture:
323
+ desc: null
324
+ value: Qwen2ForCausalLM
325
+ activation_function:
326
+ desc: null
327
+ value: silu
328
+ hidden_size:
329
+ desc: null
330
+ value: 896
331
+ model_type:
332
+ desc: null
333
+ value: qwen2
334
+ max_position_embeddings:
335
+ desc: null
336
+ value: 4096
337
+ num_attention_heads:
338
+ desc: null
339
+ value: 14
340
+ num_hidden_layers:
341
+ desc: null
342
+ value: 24
wandb/run-20240823_162543-eroprw00/files/output.log ADDED
@@ -0,0 +1,116 @@
1
+ Created Hugging Face repository with ID koichi12/Qwen2-0.5b-0.2.
2
+ Clearing GPU cache for all ranks
3
+ --> Running with torch torch_distributed debug set to detail
4
+ Loading model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
5
+ Loaded model state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/model.pt
6
+ --> Model /share/pretrained_lm/Qwen/Qwen2-0.5B
7
+ --> /share/pretrained_lm/Qwen/Qwen2-0.5B has 494.032768 Million params
8
+ BFloat16 enabled for mixed precision - using bfSixteen policy
9
+ You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
10
+ /usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_init_utils.py:441: UserWarning: FSDP is switching to use `NO_SHARD` instead of ShardingStrategy.FULL_SHARD since the world size is 1.
11
+ warnings.warn(
12
+ Let split = None
13
+ --> applying fsdp activation checkpointing...
14
+ > datasets target sizes (minimum size):
15
+ train: 4800000
16
+ validation: 4806400
17
+ test: 6400
18
+ > building train, validation, and test datasets for GPT ...
19
+ Unable to save the indexes because path_to_cache is None
20
+ > finished creating GPT datasets ...
21
+ Loading optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
22
+ Building a BlendedDataset for a single MegatronDataset
23
+ Unable to save the indexes because path_to_cache is None
24
+ Building a BlendedDataset for a single MegatronDataset
25
+ Unable to save the indexes because path_to_cache is None
26
+ Loaded optimizer state dict from /work/llm_recipes/models/Qwen2-0.5b-0.2/iter_0000040/optimizer.pt
27
+ model info: FullyShardedDataParallel(
28
+ (_fsdp_wrapped_module): Qwen2ForCausalLM(
29
+ (model): Qwen2Model(
30
+ (embed_tokens): Embedding(151936, 896)
31
+ (layers): ModuleList(
32
+ (0-23): 24 x FullyShardedDataParallel(
33
+ (_fsdp_wrapped_module): CheckpointWrapper(
34
+ (_checkpoint_wrapped_module): Qwen2DecoderLayer(
35
+ (self_attn): Qwen2FlashAttention2(
36
+ (q_proj): Linear(in_features=896, out_features=896, bias=True)
37
+ (k_proj): Linear(in_features=896, out_features=128, bias=True)
38
+ (v_proj): Linear(in_features=896, out_features=128, bias=True)
39
+ (o_proj): Linear(in_features=896, out_features=896, bias=False)
40
+ (rotary_emb): Qwen2RotaryEmbedding()
41
+ )
42
+ (mlp): Qwen2MLP(
43
+ (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
44
+ (up_proj): Linear(in_features=896, out_features=4864, bias=False)
45
+ (down_proj): Linear(in_features=4864, out_features=896, bias=False)
46
+ (act_fn): SiLU()
47
+ )
48
+ (input_layernorm): Qwen2RMSNorm()
49
+ (post_attention_layernorm): Qwen2RMSNorm()
50
+ )
51
+ )
52
+ )
53
+ )
54
+ (norm): Qwen2RMSNorm()
55
+ )
56
+ (lm_head): Linear(in_features=896, out_features=151936, bias=False)
57
+ )
58
+ )
59
+ model config: Qwen2Config {
60
+ "_name_or_path": "/share/pretrained_lm/Qwen/Qwen2-0.5B",
61
+ "architectures": [
62
+ "Qwen2ForCausalLM"
63
+ ],
64
+ "attention_dropout": 0.0,
65
+ "bos_token_id": 151643,
66
+ "eos_token_id": 151643,
67
+ "hidden_act": "silu",
68
+ "hidden_size": 896,
69
+ "initializer_range": 0.02,
70
+ "intermediate_size": 4864,
71
+ "label_smoothing": 0.0,
72
+ "max_position_embeddings": 4096,
73
+ "max_window_layers": 24,
74
+ "model_type": "qwen2",
75
+ "num_attention_heads": 14,
76
+ "num_hidden_layers": 24,
77
+ "num_key_value_heads": 2,
78
+ "rms_norm_eps": 1e-06,
79
+ "rope_theta": 1000000.0,
80
+ "sliding_window": 131072,
81
+ "tie_word_embeddings": true,
82
+ "torch_dtype": "bfloat16",
83
+ "transformers_version": "4.43.3",
84
+ "use_cache": false,
85
+ "use_sliding_window": false,
86
+ "vocab_size": 151936
87
+ }
88
+ [rank0]:[2024-08-23 16:25:50,866] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _flatten_optim_state_dict() profiling: defaultdict(<class 'float'>, {})
89
+ Traceback (most recent call last):
90
+ File "/project/examples/finetuning.py", line 13, in <module>
91
+ main()
92
+ File "/project/src/llama_recipes/finetuning.py", line 282, in main
93
+ train(
94
+ File "/project/src/llama_recipes/utils/train_utils.py", line 110, in train
95
+ loss: torch.Tensor = model(**batch).loss
96
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
97
+ return self._call_impl(*args, **kwargs)
98
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
99
+ return forward_call(*args, **kwargs)
100
+ File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
101
+ output = self._fsdp_wrapped_module(*args, **kwargs)
102
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
103
+ return self._call_impl(*args, **kwargs)
104
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
105
+ return forward_call(*args, **kwargs)
106
+ File "/project/lib/transformers/src/transformers/models/qwen2/modeling_qwen2.py", line 1082, in forward
107
+ loss = loss_fct(shift_logits, shift_labels)
108
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
109
+ return self._call_impl(*args, **kwargs)
110
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
111
+ return forward_call(*args, **kwargs)
112
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/loss.py", line 1179, in forward
113
+ return F.cross_entropy(input, target, weight=self.weight,
114
+ File "/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py", line 3086, in cross_entropy
115
+ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
116
+ torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.59 GiB. GPU 0 has a total capacity of 39.39 GiB of which 11.28 GiB is free. Including non-PyTorch memory, this process has 28.11 GiB memory in use. Of the allocated memory 26.94 GiB is allocated by PyTorch, and 363.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
wandb/run-20240823_162543-eroprw00/files/requirements.txt ADDED
@@ -0,0 +1,375 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.23.0
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ annotated-types==0.6.0
6
+ antlr4-python3-runtime==4.9.3
7
+ anyio==4.4.0
8
+ apex==0.1
9
+ appdirs==1.4.4
10
+ argon2-cffi-bindings==21.2.0
11
+ argon2-cffi==23.1.0
12
+ astroid==3.2.4
13
+ asttokens==2.4.1
14
+ astunparse==1.6.3
15
+ async-timeout==4.0.3
16
+ attrs==23.2.0
17
+ audioread==3.0.1
18
+ beautifulsoup4==4.12.3
19
+ bert-score==0.3.13
20
+ bleach==6.1.0
21
+ blis==0.7.11
22
+ build==1.2.1
23
+ cachecontrol==0.14.0
24
+ cachetools==5.3.2
25
+ catalogue==2.0.10
26
+ certifi==2024.2.2
27
+ cffi==1.16.0
28
+ chardet==5.2.0
29
+ charset-normalizer==3.3.2
30
+ cleo==2.1.0
31
+ click==8.1.7
32
+ cloudpathlib==0.16.0
33
+ cloudpickle==3.0.0
34
+ cmake==3.28.1
35
+ colorama==0.4.6
36
+ comm==0.2.1
37
+ confection==0.1.4
38
+ contourpy==1.2.0
39
+ cramjam==2.8.3
40
+ crashtest==0.4.1
41
+ cryptography==43.0.0
42
+ cubinlinker==0.3.0+2.g405ac64
43
+ cuda-python==12.3.0rc4+9.gdb8c48a.dirty
44
+ cudf==23.12.0
45
+ cugraph-dgl==23.12.0
46
+ cugraph-service-client==23.12.0
47
+ cugraph-service-server==23.12.0
48
+ cugraph==23.12.0
49
+ cuml==23.12.0
50
+ cupy-cuda12x==12.3.0
51
+ cycler==0.12.1
52
+ cymem==2.0.8
53
+ cython==3.0.8
54
+ dask-cuda==23.12.0
55
+ dask-cudf==23.12.0
56
+ dask==2023.11.0
57
+ dataclasses-json==0.6.7
58
+ dataproperty==1.0.1
59
+ datasets==2.20.0
60
+ debugpy==1.8.1
61
+ decorator==5.1.1
62
+ defusedxml==0.7.1
63
+ dill==0.3.8
64
+ distlib==0.3.8
65
+ distributed==2023.11.0
66
+ distro==1.9.0
67
+ dm-tree==0.1.8
68
+ docker-pycreds==0.4.0
69
+ dulwich==0.21.7
70
+ einops==0.7.0
71
+ emoji==2.12.1
72
+ entmax==1.3
73
+ evaluate==0.4.2
74
+ exceptiongroup==1.2.0
75
+ execnet==2.0.2
76
+ executing==2.0.1
77
+ expecttest==0.1.3
78
+ fastjsonschema==2.19.1
79
+ fastparquet==2023.10.1
80
+ fastrlock==0.8.2
81
+ filelock==3.13.1
82
+ flash-attn==2.4.2
83
+ fonttools==4.48.1
84
+ frozenlist==1.4.1
85
+ fsspec==2023.12.2
86
+ fugashi==1.3.2
87
+ fuzzywuzzy==0.18.0
88
+ gast==0.5.4
89
+ gitdb==4.0.11
90
+ gitpython==3.1.43
91
+ google-auth-oauthlib==0.4.6
92
+ google-auth==2.27.0
93
+ graphsurgeon==0.4.6
94
+ greenlet==3.0.3
95
+ grpcio==1.60.1
96
+ h11==0.14.0
97
+ httpcore==1.0.5
98
+ httpx==0.27.0
99
+ huggingface-hub==0.24.5
100
+ hydra-core==1.3.2
101
+ hypothesis==5.35.1
102
+ idna==3.6
103
+ importlib-metadata==7.0.1
104
+ iniconfig==2.0.0
105
+ installer==0.7.0
106
+ intel-openmp==2021.4.0
107
+ ipadic==1.0.0
108
+ ipykernel==6.29.2
109
+ ipython-genutils==0.2.0
110
+ ipython==8.21.0
111
+ isort==5.13.2
112
+ jaraco.classes==3.4.0
113
+ jedi==0.19.1
114
+ jeepney==0.8.0
115
+ jinja2==3.1.3
116
+ jiter==0.5.0
117
+ joblib==1.3.2
118
+ json5==0.9.14
119
+ jsonargparse==3.13.1
120
+ jsonlines==4.0.0
121
+ jsonnet==0.19.1
122
+ jsonpatch==1.33
123
+ jsonpointer==3.0.0
124
+ jsonschema-specifications==2023.12.1
125
+ jsonschema==4.21.1
126
+ jupyter-client==8.6.0
127
+ jupyter-core==5.7.1
128
+ jupyter-tensorboard==0.2.0
129
+ jupyterlab-pygments==0.3.0
130
+ jupyterlab-server==1.2.0
131
+ jupyterlab==2.3.2
132
+ jupytext==1.16.1
133
+ keyring==24.3.1
134
+ kiwisolver==1.4.5
135
+ langchain-community==0.2.12
136
+ langchain-core==0.2.31
137
+ langchain-huggingface==0.0.2
138
+ langchain-openai==0.1.21
139
+ langchain-text-splitters==0.2.2
140
+ langchain==0.2.13
141
+ langcodes==3.3.0
142
+ langsmith==0.1.99
143
+ lazy-loader==0.3
144
+ levenshtein==0.25.1
145
+ librosa==0.10.1
146
+ lightning-utilities==0.11.6
147
+ llm-jp-eval==1.4.0
148
+ llvmlite==0.40.1
149
+ lm-eval==0.3.0
150
+ locket==1.0.0
151
+ logzero==1.7.0
152
+ lxml==5.2.2
153
+ markdown-it-py==3.0.0
154
+ markdown==3.5.2
155
+ markupsafe==2.1.4
156
+ marshmallow==3.21.3
157
+ matplotlib-inline==0.1.6
158
+ matplotlib==3.8.2
159
+ mbstrdecoder==1.1.3
160
+ mccabe==0.7.0
161
+ mdit-py-plugins==0.4.0
162
+ mdurl==0.1.2
163
+ mecab-python3==1.0.6
164
+ mistune==3.0.2
165
+ mkl-devel==2021.1.1
166
+ mkl-include==2021.1.1
167
+ mkl==2021.1.1
168
+ mock==5.1.0
169
+ mojimoji==0.0.13
170
+ more-itertools==9.1.0
171
+ mpmath==1.3.0
172
+ msgpack==1.0.7
173
+ multidict==6.0.4
174
+ multiprocess==0.70.16
175
+ murmurhash==1.0.10
176
+ mypy-extensions==1.0.0
177
+ nbclient==0.9.0
178
+ nbconvert==7.16.0
179
+ nbformat==5.9.2
180
+ neologdn==0.5.3
181
+ nest-asyncio==1.6.0
182
+ networkx==2.6.3
183
+ ninja==1.11.1.1
184
+ nltk==3.8.1
185
+ notebook==6.4.10
186
+ numba==0.57.1+1.g1ff679645
187
+ numexpr==2.10.1
188
+ numpy==1.24.4
189
+ nvfuser==0.1.4a0+d0bb811
190
+ nvidia-dali-cuda120==1.34.0
191
+ nvidia-pyindex==1.0.9
192
+ nvtx==0.2.5
193
+ oauthlib==3.2.2
194
+ omegaconf==2.3.0
195
+ onnx==1.15.0rc2
196
+ openai==1.40.6
197
+ opencv==4.7.0
198
+ optree==0.10.0
199
+ orjson==3.10.7
200
+ packaging==23.2
201
+ pandas==2.2.2
202
+ pandocfilters==1.5.1
203
+ parso==0.8.3
204
+ partd==1.4.1
205
+ pathvalidate==3.2.0
206
+ peft==0.5.0
207
+ pexpect==4.9.0
208
+ pillow==10.2.0
209
+ pip==24.0
210
+ pkginfo==1.11.1
211
+ plac==1.4.3
212
+ platformdirs==4.2.0
213
+ pluggy==1.4.0
214
+ ply==3.11
215
+ poetry-core==1.9.0
216
+ poetry-plugin-export==1.8.0
217
+ poetry==1.8.3
218
+ polygraphy==0.49.4
219
+ pooch==1.8.0
220
+ portalocker==2.10.1
221
+ preshed==3.0.9
222
+ prettytable==3.9.0
223
+ prometheus-client==0.19.0
224
+ prompt-toolkit==3.0.43
225
+ protobuf==4.24.4
226
+ psutil==5.9.4
227
+ ptxcompiler==0.8.1+2.g0d406d6
228
+ ptyprocess==0.7.0
229
+ pure-eval==0.2.2
230
+ pyarrow-hotfix==0.6
231
+ pyarrow==15.0.2
232
+ pyasn1-modules==0.3.0
233
+ pyasn1==0.5.1
234
+ pybind11-global==2.11.1
235
+ pybind11==2.11.1
236
+ pycocotools==2.0+nv0.8.0
237
+ pycountry==24.6.1
238
+ pycparser==2.21
239
+ pydantic-core==2.16.2
240
+ pydantic==2.6.1
241
+ pygments==2.17.2
242
+ pylibcugraph==23.12.0
243
+ pylibcugraphops==23.12.0
244
+ pylibraft==23.12.0
245
+ pylint==3.2.6
246
+ pynvml==11.4.1
247
+ pyparsing==3.1.1
248
+ pyproject-hooks==1.1.0
249
+ pytablewriter==1.2.0
250
+ pytest-flakefinder==1.1.0
251
+ pytest-rerunfailures==13.0
252
+ pytest-shard==0.1.2
253
+ pytest-xdist==3.5.0
254
+ pytest==8.0.0
255
+ python-dateutil==2.8.2
256
+ python-dotenv==1.0.0
257
+ python-hostlist==1.23.0
258
+ python-levenshtein==0.25.1
259
+ pytorch-lightning==2.4.0
260
+ pytorch-quantization==2.1.2
261
+ pytz==2023.3.post1
262
+ pyyaml==6.0.1
263
+ pyzmq==25.1.2
264
+ raft-dask==23.12.0
265
+ rapidfuzz==3.9.6
266
+ rapids-dask-dependency==23.12.1
267
+ referencing==0.33.0
268
+ regex==2023.12.25
269
+ requests-oauthlib==1.3.1
270
+ requests-toolbelt==1.0.0
271
+ requests==2.32.3
272
+ rhoknp==1.7.0
273
+ rich==13.7.0
274
+ rmm==23.12.0
275
+ rouge-score==0.1.2
276
+ rpds-py==0.17.1
277
+ rsa==4.9
278
+ sacrebleu==2.4.2
279
+ safetensors==0.4.3
280
+ scikit-learn==1.5.1
281
+ scipy==1.12.0
282
+ secretstorage==3.3.3
283
+ send2trash==1.8.2
284
+ sentence-transformers==3.0.1
285
+ sentencepiece==0.1.99
286
+ sentry-sdk==2.12.0
287
+ setproctitle==1.3.3
288
+ setuptools==68.2.2
289
+ shellingham==1.5.4
290
+ six==1.16.0
291
+ smart-open==6.4.0
292
+ smmap==5.0.1
293
+ sniffio==1.3.1
294
+ sortedcontainers==2.4.0
295
+ soundfile==0.12.1
296
+ soupsieve==2.5
297
+ soxr==0.3.7
298
+ spacy-legacy==3.0.12
299
+ spacy-loggers==1.0.5
300
+ spacy==3.7.2
301
+ sphinx-glpi-theme==0.6
302
+ sqlalchemy==2.0.32
303
+ sqlitedict==2.1.0
304
+ srsly==2.4.8
305
+ stack-data==0.6.3
306
+ sumeval==0.2.2
307
+ sympy==1.12
308
+ tabledata==1.3.3
309
+ tabulate==0.9.0
310
+ tbb==2021.11.0
311
+ tblib==3.0.0
312
+ tcolorpy==0.1.6
313
+ tenacity==8.5.0
314
+ tensorboard-data-server==0.6.1
315
+ tensorboard-plugin-wit==1.8.1
316
+ tensorboard==2.9.0
317
+ tensorrt==8.6.3
318
+ terminado==0.18.0
319
+ termplotlib==0.3.9
320
+ text-generation==0.7.0
321
+ thinc==8.2.3
322
+ threadpoolctl==3.2.0
323
+ thriftpy2==0.4.17
324
+ tiktoken==0.7.0
325
+ tinycss2==1.2.1
326
+ tokenizers==0.19.1
327
+ toml==0.10.2
328
+ tomli==2.0.1
329
+ tomlkit==0.13.2
330
+ toolz==0.12.1
331
+ torch-tensorrt==2.3.0a0
332
+ torch==2.3.0a0+ebedce2
333
+ torchdata==0.7.1a0
334
+ torchmetrics==0.10.3
335
+ torchtext==0.17.0a0
336
+ torchvision==0.18.0a0
337
+ tornado==6.4
338
+ tqdm-multiprocess==0.0.11
339
+ tqdm==4.66.5
340
+ traitlets==5.9.0
341
+ transformer-engine==1.3.0+5b90b7f
342
+ transformers==4.43.3
343
+ treelite-runtime==3.9.1
344
+ treelite==3.9.1
345
+ triton==2.2.0+e28a256
346
+ trove-classifiers==2024.7.2
347
+ typepy==1.3.2
348
+ typer==0.9.0
349
+ types-dataclasses==0.6.6
350
+ typing-extensions==4.12.2
351
+ typing-inspect==0.9.0
352
+ tzdata==2024.1
353
+ ucx-py==0.35.0
354
+ uff==0.6.9
355
+ ujson==5.8.0
356
+ unbabel-comet==2.2.2
357
+ unidic-lite==1.0.8
358
+ urllib3==1.26.18
359
+ virtualenv==20.26.3
360
+ wandb==0.16.3
361
+ wasabi==1.1.2
362
+ wcwidth==0.2.13
363
+ weasel==0.3.4
364
+ webencodings==0.5.1
365
+ werkzeug==3.0.1
366
+ wheel==0.42.0
367
+ word2number==1.1
368
+ xdoctest==1.0.2
369
+ xgboost==1.7.6
370
+ xmltodict==0.13.0
371
+ xxhash==3.4.1
372
+ yarl==1.9.4
373
+ zict==3.0.0
374
+ zipp==3.17.0
375
+ zstandard==0.23.0
wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json ADDED
@@ -0,0 +1,220 @@
1
+ {
2
+ "os": "Linux-5.15.0-91-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "heartbeatAt": "2024-08-23T07:25:43.758914",
5
+ "startedAt": "2024-08-23T07:25:43.187250",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--seq-length",
10
+ "4096",
11
+ "--sliding-window-size",
12
+ "131072",
13
+ "--micro-batch-size",
14
+ "5",
15
+ "--valid_micro_batch_size",
16
+ "1",
17
+ "--global-batch-size",
18
+ "640",
19
+ "--train-iters",
20
+ "7500",
21
+ "--tokenizer-type",
22
+ "HFPreTrainedTokenizer",
23
+ "--tokenizer-model",
24
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
25
+ "--train-data-path",
26
+ "1754785366",
27
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
28
+ "28623823675",
29
+ "/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document",
30
+ "--valid-data-path",
31
+ "1754785366",
32
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
33
+ "--test-data-path",
34
+ "1754785366",
35
+ "/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document",
36
+ "--lr",
37
+ "2e-5",
38
+ "--min-lr",
39
+ "1e-6",
40
+ "--lr-decay-style",
41
+ "cosine",
42
+ "--lr-warmup-iters",
43
+ "500",
44
+ "--lr-decay-iters",
45
+ "7500",
46
+ "--weight-decay",
47
+ "0.1",
48
+ "--grad-clip-norm",
49
+ "1.0",
50
+ "--optimizer",
51
+ "anyprecision",
52
+ "--adam-beta1",
53
+ "0.9",
54
+ "--adam-beta2",
55
+ "0.95",
56
+ "--adam-eps",
57
+ "1e-6",
58
+ "--save-interval",
59
+ "10",
60
+ "--eval-interval",
61
+ "10",
62
+ "--eval-iters",
63
+ "10",
64
+ "--bf16",
65
+ "--mixed-precision",
66
+ "--base-model",
67
+ "/share/pretrained_lm/Qwen/Qwen2-0.5B",
68
+ "--save",
69
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
70
+ "--load",
71
+ "/work/llm_recipes/models/Qwen2-0.5b-0.2",
72
+ "--fsdp-activation-checkpointing",
73
+ "--sharding-strategy",
74
+ "FULL_SHARD",
75
+ "--checkpoint-type",
76
+ "LOCAL_STATE_DICT",
77
+ "--save-n-checkpoints",
78
+ "10",
79
+ "--upload-all-checkpoints-to-hf",
80
+ "--hf-upload-retry-limit",
81
+ "2",
82
+ "--hf-repo-id",
83
+ "koichi12/Qwen2-0.5b-0.2",
84
+ "--wandb-entity",
85
+ "iwakawa-koichi-q5-tohoku-nlp6723",
86
+ "--wandb-project",
87
+ "llm_tutorial-0.2",
88
+ "--wandb-name",
89
+ "Qwen2-0.5b-0.2_train_2024-08-23-16:25:30"
90
+ ],
91
+ "state": "running",
92
+ "program": "/project/examples/finetuning.py",
93
+ "codePathLocal": "examples/finetuning.py",
94
+ "codePath": "examples/finetuning.py",
95
+ "git": {
96
+ "remote": "https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git",
97
+ "commit": "887a2cc5d104c10264701f95cbbb0a6a116768d6"
98
+ },
99
+ "email": null,
100
+ "root": "/project",
101
+ "host": "gpu-koiwa-00",
102
+ "username": "koiwa",
103
+ "executable": "/usr/bin/python",
104
+ "cpu_count": 18,
105
+ "cpu_count_logical": 18,
106
+ "cpu_freq": {
107
+ "current": 2400.0389999999993,
108
+ "min": 0.0,
109
+ "max": 0.0
110
+ },
111
+ "cpu_freq_per_core": [
112
+ {
113
+ "current": 2400.039,
114
+ "min": 0.0,
115
+ "max": 0.0
116
+ },
117
+ {
118
+ "current": 2400.039,
119
+ "min": 0.0,
120
+ "max": 0.0
121
+ },
122
+ {
123
+ "current": 2400.039,
124
+ "min": 0.0,
125
+ "max": 0.0
126
+ },
127
+ {
128
+ "current": 2400.039,
129
+ "min": 0.0,
130
+ "max": 0.0
131
+ },
132
+ {
133
+ "current": 2400.039,
134
+ "min": 0.0,
135
+ "max": 0.0
136
+ },
137
+ {
138
+ "current": 2400.039,
139
+ "min": 0.0,
140
+ "max": 0.0
141
+ },
142
+ {
143
+ "current": 2400.039,
144
+ "min": 0.0,
145
+ "max": 0.0
146
+ },
147
+ {
148
+ "current": 2400.039,
149
+ "min": 0.0,
150
+ "max": 0.0
151
+ },
152
+ {
153
+ "current": 2400.039,
154
+ "min": 0.0,
155
+ "max": 0.0
156
+ },
157
+ {
158
+ "current": 2400.039,
159
+ "min": 0.0,
160
+ "max": 0.0
161
+ },
162
+ {
163
+ "current": 2400.039,
164
+ "min": 0.0,
165
+ "max": 0.0
166
+ },
167
+ {
168
+ "current": 2400.039,
169
+ "min": 0.0,
170
+ "max": 0.0
171
+ },
172
+ {
173
+ "current": 2400.039,
174
+ "min": 0.0,
175
+ "max": 0.0
176
+ },
177
+ {
178
+ "current": 2400.039,
179
+ "min": 0.0,
180
+ "max": 0.0
181
+ },
182
+ {
183
+ "current": 2400.039,
184
+ "min": 0.0,
185
+ "max": 0.0
186
+ },
187
+ {
188
+ "current": 2400.039,
189
+ "min": 0.0,
190
+ "max": 0.0
191
+ },
192
+ {
193
+ "current": 2400.039,
194
+ "min": 0.0,
195
+ "max": 0.0
196
+ },
197
+ {
198
+ "current": 2400.039,
199
+ "min": 0.0,
200
+ "max": 0.0
201
+ }
202
+ ],
203
+ "disk": {
204
+ "/": {
205
+ "total": 0.0625,
206
+ "used": 1.1444091796875e-05
207
+ }
208
+ },
209
+ "gpu": "NVIDIA A100-SXM4-40GB",
210
+ "gpu_count": 1,
211
+ "gpu_devices": [
212
+ {
213
+ "name": "NVIDIA A100-SXM4-40GB",
214
+ "memory_total": 42949672960
215
+ }
216
+ ],
217
+ "memory": {
218
+ "total": 56.487831115722656
219
+ }
220
+ }
wandb/run-20240823_162543-eroprw00/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_wandb": {"runtime": 8}}
wandb/run-20240823_162543-eroprw00/logs/debug-internal.log ADDED
@@ -0,0 +1,188 @@
1
+ 2024-08-23 16:25:43,204 INFO StreamThr :11284 [internal.py:wandb_internal():86] W&B internal server running at pid: 11284, started at: 2024-08-23 16:25:43.204013
2
+ 2024-08-23 16:25:43,206 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: status
3
+ 2024-08-23 16:25:43,207 INFO WriterThread:11284 [datastore.py:open_for_write():87] open: /project/wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb
4
+ 2024-08-23 16:25:43,208 DEBUG SenderThread:11284 [sender.py:send():382] send: header
5
+ 2024-08-23 16:25:43,222 DEBUG SenderThread:11284 [sender.py:send():382] send: run
6
+ 2024-08-23 16:25:43,662 INFO SenderThread:11284 [dir_watcher.py:__init__():211] watching files in: /project/wandb/run-20240823_162543-eroprw00/files
7
+ 2024-08-23 16:25:43,662 INFO SenderThread:11284 [sender.py:_start_run_threads():1136] run started: eroprw00 with start time 1724397943.202675
8
+ 2024-08-23 16:25:43,667 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: check_version
9
+ 2024-08-23 16:25:43,668 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: check_version
10
+ 2024-08-23 16:25:43,739 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: run_start
11
+ 2024-08-23 16:25:43,746 DEBUG HandlerThread:11284 [system_info.py:__init__():27] System info init
12
+ 2024-08-23 16:25:43,746 DEBUG HandlerThread:11284 [system_info.py:__init__():42] System info init done
13
+ 2024-08-23 16:25:43,746 INFO HandlerThread:11284 [system_monitor.py:start():194] Starting system monitor
14
+ 2024-08-23 16:25:43,746 INFO SystemMonitor:11284 [system_monitor.py:_start():158] Starting system asset monitoring threads
15
+ 2024-08-23 16:25:43,746 INFO HandlerThread:11284 [system_monitor.py:probe():214] Collecting system info
16
+ 2024-08-23 16:25:43,746 INFO SystemMonitor:11284 [interfaces.py:start():190] Started cpu monitoring
17
+ 2024-08-23 16:25:43,747 INFO SystemMonitor:11284 [interfaces.py:start():190] Started disk monitoring
18
+ 2024-08-23 16:25:43,747 INFO SystemMonitor:11284 [interfaces.py:start():190] Started gpu monitoring
19
+ 2024-08-23 16:25:43,748 INFO SystemMonitor:11284 [interfaces.py:start():190] Started memory monitoring
20
+ 2024-08-23 16:25:43,749 INFO SystemMonitor:11284 [interfaces.py:start():190] Started network monitoring
21
+ 2024-08-23 16:25:43,758 DEBUG HandlerThread:11284 [system_info.py:probe():151] Probing system
22
+ 2024-08-23 16:25:43,760 DEBUG HandlerThread:11284 [system_info.py:_probe_git():136] Probing git
23
+ 2024-08-23 16:25:43,773 DEBUG HandlerThread:11284 [system_info.py:_probe_git():144] Probing git done
24
+ 2024-08-23 16:25:43,773 DEBUG HandlerThread:11284 [system_info.py:probe():199] Probing system done
25
+ 2024-08-23 16:25:43,773 DEBUG HandlerThread:11284 [system_monitor.py:probe():223] {'os': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'python': '3.10.12', 'heartbeatAt': '2024-08-23T07:25:43.758914', 'startedAt': '2024-08-23T07:25:43.187250', 'docker': None, 'cuda': None, 'args': ('--seq-length', '4096', '--sliding-window-size', '131072', '--micro-batch-size', '5', '--valid_micro_batch_size', '1', '--global-batch-size', '640', '--train-iters', '7500', '--tokenizer-type', 'HFPreTrainedTokenizer', '--tokenizer-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--train-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document', '--valid-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--test-data-path', '1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '--lr', '2e-5', '--min-lr', '1e-6', '--lr-decay-style', 'cosine', '--lr-warmup-iters', '500', '--lr-decay-iters', '7500', '--weight-decay', '0.1', '--grad-clip-norm', '1.0', '--optimizer', 'anyprecision', '--adam-beta1', '0.9', '--adam-beta2', '0.95', '--adam-eps', '1e-6', '--save-interval', '10', '--eval-interval', '10', '--eval-iters', '10', '--bf16', '--mixed-precision', '--base-model', '/share/pretrained_lm/Qwen/Qwen2-0.5B', '--save', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--load', '/work/llm_recipes/models/Qwen2-0.5b-0.2', '--fsdp-activation-checkpointing', '--sharding-strategy', 'FULL_SHARD', '--checkpoint-type', 'LOCAL_STATE_DICT', '--save-n-checkpoints', '10', '--upload-all-checkpoints-to-hf', '--hf-upload-retry-limit', '2', '--hf-repo-id', 'koichi12/Qwen2-0.5b-0.2', '--wandb-entity', 'iwakawa-koichi-q5-tohoku-nlp6723', '--wandb-project', 'llm_tutorial-0.2', '--wandb-name', 'Qwen2-0.5b-0.2_train_2024-08-23-16:25:30'), 'state': 'running', 'program': '/project/examples/finetuning.py', 'codePathLocal': 'examples/finetuning.py', 'codePath': 'examples/finetuning.py', 'git': {'remote': 'https://github.com/cl-tohoku/llm-recipes-failab-m1-yans.git', 'commit': '887a2cc5d104c10264701f95cbbb0a6a116768d6'}, 'email': None, 'root': '/project', 'host': 'gpu-koiwa-00', 'username': 'koiwa', 'executable': '/usr/bin/python', 'cpu_count': 18, 'cpu_count_logical': 18, 'cpu_freq': {'current': 2400.0389999999993, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}, {'current': 2400.039, 'min': 0.0, 'max': 0.0}], 'disk': {'/': {'total': 0.0625, 'used': 1.1444091796875e-05}}, 'gpu': 'NVIDIA A100-SXM4-40GB', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A100-SXM4-40GB', 'memory_total': 42949672960}], 'memory': {'total': 56.487831115722656}}
26
+ 2024-08-23 16:25:43,773 INFO HandlerThread:11284 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-08-23 16:25:43,773 INFO HandlerThread:11284 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-08-23 16:25:43,774 INFO HandlerThread:11284 [system_monitor.py:probe():229] Finished publishing system info
29
+ 2024-08-23 16:25:43,780 DEBUG SenderThread:11284 [sender.py:send():382] send: files
30
+ 2024-08-23 16:25:43,780 INFO SenderThread:11284 [sender.py:_save_file():1403] saving file wandb-metadata.json with policy now
31
+ 2024-08-23 16:25:43,791 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: python_packages
32
+ 2024-08-23 16:25:43,791 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: internal_messages
33
+ 2024-08-23 16:25:43,792 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: stop_status
34
+ 2024-08-23 16:25:43,792 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: python_packages
35
+ 2024-08-23 16:25:43,794 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: stop_status
36
+ 2024-08-23 16:25:44,074 DEBUG SenderThread:11284 [sender.py:send():382] send: telemetry
37
+ 2024-08-23 16:25:44,478 INFO wandb-upload_0:11284 [upload_job.py:push():131] Uploaded file /tmp/tmpn8dztdufwandb/9bfyl56b-wandb-metadata.json
38
+ 2024-08-23 16:25:44,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json
39
+ 2024-08-23 16:25:44,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/requirements.txt
40
+ 2024-08-23 16:25:44,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/output.log
41
+ 2024-08-23 16:25:46,664 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
42
+ 2024-08-23 16:25:48,665 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
43
+ 2024-08-23 16:25:49,201 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: status_report
44
+ 2024-08-23 16:25:50,667 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
45
+ 2024-08-23 16:25:51,139 DEBUG SenderThread:11284 [sender.py:send():382] send: config
46
+ 2024-08-23 16:25:51,140 DEBUG SenderThread:11284 [sender.py:send():382] send: config
47
+ 2024-08-23 16:25:52,592 DEBUG SenderThread:11284 [sender.py:send():382] send: exit
48
+ 2024-08-23 16:25:52,592 INFO SenderThread:11284 [sender.py:send_exit():589] handling exit code: 1
49
+ 2024-08-23 16:25:52,592 INFO SenderThread:11284 [sender.py:send_exit():591] handling runtime: 8
50
+ 2024-08-23 16:25:52,593 INFO SenderThread:11284 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
51
+ 2024-08-23 16:25:52,594 INFO SenderThread:11284 [sender.py:send_exit():597] send defer
52
+ 2024-08-23 16:25:52,594 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
53
+ 2024-08-23 16:25:52,594 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 0
54
+ 2024-08-23 16:25:52,594 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
55
+ 2024-08-23 16:25:52,594 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 0
56
+ 2024-08-23 16:25:52,594 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 1
57
+ 2024-08-23 16:25:52,594 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
58
+ 2024-08-23 16:25:52,594 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 1
59
+ 2024-08-23 16:25:52,595 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
60
+ 2024-08-23 16:25:52,595 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 1
61
+ 2024-08-23 16:25:52,595 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 2
62
+ 2024-08-23 16:25:52,595 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
63
+ 2024-08-23 16:25:52,595 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 2
64
+ 2024-08-23 16:25:52,595 INFO HandlerThread:11284 [system_monitor.py:finish():203] Stopping system monitor
65
+ 2024-08-23 16:25:52,595 DEBUG SystemMonitor:11284 [system_monitor.py:_start():172] Starting system metrics aggregation loop
66
+ 2024-08-23 16:25:52,595 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined cpu monitor
67
+ 2024-08-23 16:25:52,595 DEBUG SystemMonitor:11284 [system_monitor.py:_start():179] Finished system metrics aggregation loop
68
+ 2024-08-23 16:25:52,595 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined disk monitor
69
+ 2024-08-23 16:25:52,596 DEBUG SystemMonitor:11284 [system_monitor.py:_start():183] Publishing last batch of metrics
70
+ 2024-08-23 16:25:52,629 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined gpu monitor
71
+ 2024-08-23 16:25:52,629 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined memory monitor
72
+ 2024-08-23 16:25:52,629 INFO HandlerThread:11284 [interfaces.py:finish():202] Joined network monitor
73
+ 2024-08-23 16:25:52,629 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
74
+ 2024-08-23 16:25:52,629 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 2
75
+ 2024-08-23 16:25:52,629 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 3
76
+ 2024-08-23 16:25:52,629 DEBUG SenderThread:11284 [sender.py:send():382] send: stats
77
+ 2024-08-23 16:25:52,629 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
78
+ 2024-08-23 16:25:52,630 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 3
79
+ 2024-08-23 16:25:52,630 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
80
+ 2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 3
81
+ 2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 4
82
+ 2024-08-23 16:25:52,630 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
83
+ 2024-08-23 16:25:52,630 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 4
84
+ 2024-08-23 16:25:52,630 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
85
+ 2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 4
86
+ 2024-08-23 16:25:52,630 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 5
87
+ 2024-08-23 16:25:52,630 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
88
+ 2024-08-23 16:25:52,631 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 5
89
+ 2024-08-23 16:25:52,631 DEBUG SenderThread:11284 [sender.py:send():382] send: summary
90
+ 2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end
91
+ 2024-08-23 16:25:52,632 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
92
+ 2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 5
93
+ 2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 6
94
+ 2024-08-23 16:25:52,632 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
95
+ 2024-08-23 16:25:52,632 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 6
96
+ 2024-08-23 16:25:52,632 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
97
+ 2024-08-23 16:25:52,632 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 6
98
+ 2024-08-23 16:25:52,635 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: status_report
99
+ 2024-08-23 16:25:52,668 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
100
+ 2024-08-23 16:25:52,668 INFO Thread-12 :11284 [dir_watcher.py:_on_file_created():271] file/dir created: /project/wandb/run-20240823_162543-eroprw00/files/wandb-summary.json
101
+ 2024-08-23 16:25:52,831 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 7
102
+ 2024-08-23 16:25:52,831 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
103
+ 2024-08-23 16:25:52,831 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 7
104
+ 2024-08-23 16:25:52,831 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
105
+ 2024-08-23 16:25:52,831 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 7
106
+ 2024-08-23 16:25:53,592 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
107
+ 2024-08-23 16:25:53,669 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/config.yaml
108
+ 2024-08-23 16:25:54,373 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 8
109
+ 2024-08-23 16:25:54,374 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
110
+ 2024-08-23 16:25:54,374 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
111
+ 2024-08-23 16:25:54,374 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 8
112
+ 2024-08-23 16:25:54,374 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
113
+ 2024-08-23 16:25:54,374 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 8
114
+ 2024-08-23 16:25:54,374 INFO SenderThread:11284 [job_builder.py:build():296] Attempting to build job artifact
115
+ 2024-08-23 16:25:54,375 INFO SenderThread:11284 [job_builder.py:_get_source_type():426] is repo sourced job
116
+ 2024-08-23 16:25:54,389 INFO SenderThread:11284 [job_builder.py:build():402] adding wandb-job metadata file
117
+ 2024-08-23 16:25:54,398 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 9
118
+ 2024-08-23 16:25:54,398 DEBUG SenderThread:11284 [sender.py:send():382] send: artifact
119
+ 2024-08-23 16:25:54,398 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
120
+ 2024-08-23 16:25:54,399 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 9
121
+ 2024-08-23 16:25:54,593 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
122
+ 2024-08-23 16:25:54,670 INFO Thread-12 :11284 [dir_watcher.py:_on_file_modified():288] file/dir modified: /project/wandb/run-20240823_162543-eroprw00/files/output.log
123
+ 2024-08-23 16:25:55,372 INFO SenderThread:11284 [sender.py:send_artifact():1494] sent artifact job-https___github.com_cl-tohoku_llm-recipes-failab-m1-yans.git_examples_finetuning.py - {'id': 'QXJ0aWZhY3Q6MTE2MTk5MDU4OQ==', 'state': 'COMMITTED', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjQxNjQ1ODQ1MA==', 'latestArtifact': {'id': 'QXJ0aWZhY3Q6MTE2MTk5MDU4OQ==', 'versionIndex': 2}}}
124
+ 2024-08-23 16:25:55,372 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
125
+ 2024-08-23 16:25:55,372 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 9
126
+ 2024-08-23 16:25:55,372 INFO SenderThread:11284 [dir_watcher.py:finish():358] shutting down directory watcher
127
+ 2024-08-23 16:25:55,671 INFO SenderThread:11284 [dir_watcher.py:finish():388] scan: /project/wandb/run-20240823_162543-eroprw00/files
128
+ 2024-08-23 16:25:55,671 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/requirements.txt requirements.txt
129
+ 2024-08-23 16:25:55,671 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/config.yaml config.yaml
130
+ 2024-08-23 16:25:55,673 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/wandb-metadata.json wandb-metadata.json
131
+ 2024-08-23 16:25:55,673 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/wandb-summary.json wandb-summary.json
132
+ 2024-08-23 16:25:55,674 INFO SenderThread:11284 [dir_watcher.py:finish():402] scan save: /project/wandb/run-20240823_162543-eroprw00/files/output.log output.log
133
+ 2024-08-23 16:25:55,676 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 10
134
+ 2024-08-23 16:25:55,676 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
135
+ 2024-08-23 16:25:55,676 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
136
+ 2024-08-23 16:25:55,677 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 10
137
+ 2024-08-23 16:25:55,678 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
138
+ 2024-08-23 16:25:55,678 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 10
139
+ 2024-08-23 16:25:55,678 INFO SenderThread:11284 [file_pusher.py:finish():172] shutting down file pusher
140
+ 2024-08-23 16:25:56,071 INFO wandb-upload_0:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/requirements.txt
141
+ 2024-08-23 16:25:56,117 INFO wandb-upload_1:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/config.yaml
142
+ 2024-08-23 16:25:56,151 INFO wandb-upload_3:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/output.log
+ 2024-08-23 16:25:56,152 INFO wandb-upload_2:11284 [upload_job.py:push():131] Uploaded file /project/wandb/run-20240823_162543-eroprw00/files/wandb-summary.json
+ 2024-08-23 16:25:56,353 INFO Thread-11 (_thread_body):11284 [sender.py:transition_state():617] send defer: 11
+ 2024-08-23 16:25:56,353 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-23 16:25:56,353 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 11
+ 2024-08-23 16:25:56,353 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
+ 2024-08-23 16:25:56,353 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 11
+ 2024-08-23 16:25:56,353 INFO SenderThread:11284 [file_pusher.py:join():178] waiting for file pusher
+ 2024-08-23 16:25:56,353 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 12
+ 2024-08-23 16:25:56,354 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-23 16:25:56,354 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 12
+ 2024-08-23 16:25:56,354 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
+ 2024-08-23 16:25:56,354 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 12
+ 2024-08-23 16:25:56,354 INFO SenderThread:11284 [file_stream.py:finish():595] file stream finish called
+ 2024-08-23 16:25:56,522 INFO SenderThread:11284 [file_stream.py:finish():599] file stream finish is done
+ 2024-08-23 16:25:56,522 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 13
+ 2024-08-23 16:25:56,523 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-23 16:25:56,523 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 13
+ 2024-08-23 16:25:56,523 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
+ 2024-08-23 16:25:56,523 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 13
+ 2024-08-23 16:25:56,523 INFO SenderThread:11284 [sender.py:transition_state():617] send defer: 14
+ 2024-08-23 16:25:56,523 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: defer
+ 2024-08-23 16:25:56,523 DEBUG SenderThread:11284 [sender.py:send():382] send: final
+ 2024-08-23 16:25:56,523 INFO HandlerThread:11284 [handler.py:handle_request_defer():172] handle defer: 14
+ 2024-08-23 16:25:56,523 DEBUG SenderThread:11284 [sender.py:send():382] send: footer
+ 2024-08-23 16:25:56,524 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: defer
+ 2024-08-23 16:25:56,524 INFO SenderThread:11284 [sender.py:send_request_defer():613] handle sender defer: 14
+ 2024-08-23 16:25:56,524 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-23 16:25:56,524 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: poll_exit
+ 2024-08-23 16:25:56,524 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-23 16:25:56,525 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: poll_exit
+ 2024-08-23 16:25:56,525 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: server_info
+ 2024-08-23 16:25:56,525 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: get_summary
+ 2024-08-23 16:25:56,525 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: server_info
+ 2024-08-23 16:25:56,527 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: sampled_history
+ 2024-08-23 16:25:56,527 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: internal_messages
+ 2024-08-23 16:25:56,527 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: job_info
+ 2024-08-23 16:25:56,684 DEBUG SenderThread:11284 [sender.py:send_request():409] send_request: job_info
+ 2024-08-23 16:25:56,684 INFO MainThread:11284 [wandb_run.py:_footer_history_summary_info():3866] rendering history
+ 2024-08-23 16:25:56,685 INFO MainThread:11284 [wandb_run.py:_footer_history_summary_info():3898] rendering summary
+ 2024-08-23 16:25:56,685 INFO MainThread:11284 [wandb_run.py:_footer_sync_info():3825] logging synced files
+ 2024-08-23 16:25:56,685 DEBUG HandlerThread:11284 [handler.py:handle_request():146] handle_request: shutdown
+ 2024-08-23 16:25:56,685 INFO HandlerThread:11284 [handler.py:finish():869] shutting down handler
+ 2024-08-23 16:25:57,528 INFO WriterThread:11284 [datastore.py:close():296] close: /project/wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb
+ 2024-08-23 16:25:57,685 INFO SenderThread:11284 [sender.py:finish():1572] shutting down sender
+ 2024-08-23 16:25:57,685 INFO SenderThread:11284 [file_pusher.py:finish():172] shutting down file pusher
+ 2024-08-23 16:25:57,685 INFO SenderThread:11284 [file_pusher.py:join():178] waiting for file pusher
wandb/run-20240823_162543-eroprw00/logs/debug.log ADDED
@@ -0,0 +1,30 @@
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Current SDK version is 0.16.3
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Configure stats pid to 11213
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Loading settings from /singularity_home/.config/wandb/settings
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Loading settings from /project/wandb/settings
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Loading settings from environment variables: {'api_key': '***REDACTED***', 'run_notes': 'Train sample'}
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'examples/finetuning.py', 'program_abspath': '/project/examples/finetuning.py', 'program': '/project/examples/finetuning.py'}
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_init.py:_log_setup():526] Logging user logs to /project/wandb/run-20240823_162543-eroprw00/logs/debug.log
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_init.py:_log_setup():527] Logging internal logs to /project/wandb/run-20240823_162543-eroprw00/logs/debug-internal.log
+ 2024-08-23 16:25:43,196 INFO MainThread:11213 [wandb_init.py:init():566] calling init triggers
+ 2024-08-23 16:25:43,197 INFO MainThread:11213 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
+ config: {'sharding_strategy': 'FULL_SHARD', 'checkpoint_type': 'LOCAL_STATE_DICT', 'fsdp_activation_checkpointing': True, 'fsdp_cpu_offload': False, 'low_cpu_fsdp': False, 'no_meta_device': False, 'data_path': None, 'split': '969, 30, 1', 'train_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document', '28623823675', '/project/datas/llm-jp-corpus-v2/ja-cc/level0/data_text_document'], 'valid_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'test_data_path': ['1754785366', '/project/datas/llm-jp-corpus-v2/ja-wiki/data/data_text_document'], 'data_cache_path': None, 'vocab_size': None, 'vocab_file': None, 'merge_file': None, 'seq_length': 4096, 'num_workers': 2, 'tokenizer_type': 'HFPreTrainedTokenizer', 'tokenizer_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'retro_return_doc_ids': False, 'short_seq_prob': 0.1, 'vocab_extra_ids': 0, 'seed': 1234, 'use_mpi': False, 'wandb_entity': 'iwakawa-koichi-q5-tohoku-nlp6723', 'wandb_name': 'Qwen2-0.5b-0.2_train_2024-08-23-16:25:30', 'wandb_project': 'llm_tutorial-0.2', 'quantization': False, 'use_freeze_layers': False, 'freeze_layers': None, 'bf16': True, 'fp16': False, 'mixed_precision': True, 'param_dtype': None, 'load': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'save': '/work/llm_recipes/models/Qwen2-0.5b-0.2', 'base_model': '/share/pretrained_lm/Qwen/Qwen2-0.5B', 'use_better_transformer': False, 'grad_clip_norm': 1.0, 'eval_interval': 10, 'save_interval': 10, 'eval_iters': 10, 'optimizer': 'anyprecision', 'lr': 2e-05, 'lr_decay_style': 'cosine', 'lr_decay_iters': 7500, 'lr_warmup_iters': 500, 'min_lr': 1e-06, 'train_iters': 7500, 'train_samples': None, 'global_batch_size': 640, 'micro_batch_size': 5, 'make_vocab_size_divisible_by': 128, 'sliding_window_size': 131072, 'skip_batch': None, 'no_save_optimizer_state': False, 'continual_pretraining': False, 'instruction_tuning': False, 'direct_preference_optimization': False, 'attention_dropout': 0.1, 'hidden_dropout': 0.1, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-06, 'hf_transformer_model_dir': None, 'instruction_train_data_path': None, 'instruction_valid_data_path': None, 'epoch': None, 'instruction_dataset_size': None, 'save_sampler_state': False, 'label_smoothing': 0.0, 'save_n_checkpoints': 10, 'hf_repo_id': 'koichi12/Qwen2-0.5b-0.2', 'create_public_hf_repo': False, 'upload_all_checkpoints_to_hf': True, 'hf_upload_retry_limit': 2, 'exit_duration_in_mins': None, 'source_key': None, 'target_key': None, 'attn_implementation': 'flash_attention_2', 'efficient_instruction_tuning': False, 'remove_padding_masking': False, 'save_start_iter': None, 'valid_micro_batch_size': 1, 'rank': 0, 'world_size': 1, 'padded_vocab_size': 151680, 'gradient_accumulation_steps': 128}
+ 2024-08-23 16:25:43,197 INFO MainThread:11213 [wandb_init.py:init():616] starting backend
+ 2024-08-23 16:25:43,197 INFO MainThread:11213 [wandb_init.py:init():620] setting up manager
+ 2024-08-23 16:25:43,201 INFO MainThread:11213 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-08-23 16:25:43,202 INFO MainThread:11213 [wandb_init.py:init():628] backend started and connected
+ 2024-08-23 16:25:43,207 INFO MainThread:11213 [wandb_init.py:init():720] updated telemetry
+ 2024-08-23 16:25:43,218 INFO MainThread:11213 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
+ 2024-08-23 16:25:43,667 INFO MainThread:11213 [wandb_run.py:_on_init():2262] communicating current version
+ 2024-08-23 16:25:43,692 INFO MainThread:11213 [wandb_run.py:_on_init():2271] got version response upgrade_message: "wandb version 0.17.7 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
+
+ 2024-08-23 16:25:43,692 INFO MainThread:11213 [wandb_init.py:init():804] starting run threads in backend
+ 2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_console_start():2241] atexit reg
+ 2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_redirect():2096] redirect: wrap_raw
+ 2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_redirect():2161] Wrapping output streams.
+ 2024-08-23 16:25:43,790 INFO MainThread:11213 [wandb_run.py:_redirect():2186] Redirects installed.
+ 2024-08-23 16:25:43,791 INFO MainThread:11213 [wandb_init.py:init():847] run started, returning control to user process
+ 2024-08-23 16:25:51,139 INFO MainThread:11213 [wandb_run.py:_config_callback():1343] config_cb None None {'model_architecture': 'Qwen2ForCausalLM', 'activation_function': 'silu', 'hidden_size': 896, 'model_type': 'qwen2', 'max_position_embeddings': 4096, 'num_attention_heads': 14, 'num_hidden_layers': 24}
+ 2024-08-23 16:25:51,139 INFO MainThread:11213 [wandb_run.py:_config_callback():1343] config_cb None None {'world_size': 1}
+ 2024-08-23 16:25:57,685 WARNING MsgRouterThr:11213 [router.py:message_loop():77] message_loop has been closed
wandb/run-20240823_162543-eroprw00/run-eroprw00.wandb ADDED
Binary file (18.1 kB).