Text Generation
Transformers
PyTorch
English
opt
deepspeed
chatgpt
sft
Inference Endpoints
text-generation-inference
Adam committed on
Commit
13f157d
•
1 Parent(s): 4e8b8a4

feat: updated links

Files changed (1)
  1. training.log +91 -91
training.log CHANGED
@@ -1,125 +1,125 @@
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
  [2023-04-14 06:58:31,332] [WARNING] [runner.py:190:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
- [2023-04-14 06:58:32,784] [INFO] [runner.py:540:main] cmd = /home/minutiae/.conda/envs/py39/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets openai/webgpt_comparisons stanfordnlp/SHP --data_split 2,4,4 --model_name_or_path facebook/opt-1.3b --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0.1 --num_train_epochs 2 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 2 --deepspeed --output_dir /lus/grand/projects/BNN-Scale/chatgpt/hf_runs/DeepSpeedExamples/applications/DeepSpeed-Chat/output/actor-models/1.3b
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
  [2023-04-14 06:59:25,659] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
  [2023-04-14 06:59:25,760] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=8, node_rank=0
  [2023-04-14 06:59:25,760] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
  [2023-04-14 06:59:25,760] [INFO] [launch.py:247:main] dist_world_size=8
  [2023-04-14 06:59:25,760] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
- /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
  [2023-04-14 07:04:01,148] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
-
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 8.62it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 8.62it/s]Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]

- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
-
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:02<00:02, 2.37s/it]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:02<00:02, 2.37s/it]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 1.76it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
-
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:01<00:01, 1.92s/it]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:01<00:01, 1.92s/it]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
-
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 1.60it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 1.60it/s]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
 
  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
-
  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

-
  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/1 [00:00<?, ?it/s]

  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
-
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 5.87it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 5.87it/s]Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]

- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 3.00it/s]
- Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
- Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
-
  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)


  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]

- Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
-
  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)


  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)


  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]
- Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
-
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
-
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
- Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
-
  0%| | 0/3 [00:00<?, ?it/s]
  33%|███▎ | 1/3 [00:11<00:22, 11.28s/it]
  33%|███▎ | 1/3 [00:08<00:17, 8.75s/it]
  33%|███▎ | 1/3 [00:01<00:02, 1.24s/it]
  0%| | 0/3 [00:00<?, ?it/s]
  67%|██████▋ | 2/3 [00:02<00:01, 1.03s/it]
  67%|██████▋ | 2/3 [00:09<00:04, 4.31s/it]
  67%|██████▋ | 2/3 [00:12<00:05, 5.35s/it]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
-
  33%|███▎ | 1/3 [00:03<00:07, 3.78s/it]
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)

  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%| | 0/3 [00:00<?, ?it/s]
  33%|███▎ | 1/3 [00:11<00:22, 11.28s/it]
  33%|███▎ | 1/3 [00:08<00:17, 8.75s/it]
  33%|███▎ | 1/3 [00:01<00:02, 1.24s/it]
  0%| | 0/3 [00:00<?, ?it/s]
  67%|██████▋ | 2/3 [00:02<00:01, 1.03s/it]
  67%|██████▋ | 2/3 [00:09<00:04, 4.31s/it]
  67%|██████▋ | 2/3 [00:12<00:05, 5.35s/it]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)

  33%|███▎ | 1/3 [00:03<00:07, 3.78s/it]
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
 
  33%|███▎ | 1/3 [00:00<00:00, 5.25it/s]





  0%| | 0/3 [00:00<?, ?it/s]
- Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
-
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)

  0%| | 0/3 [00:00<?, ?it/s]
  33%|███▎ | 1/3 [00:05<00:11, 5.84s/it]
  33%|███▎ | 1/3 [00:03<00:06, 3.03s/it]
  67%|██████▋ | 2/3 [00:06<00:02, 2.57s/it]
  67%|██████▋ | 2/3 [00:03<00:01, 1.42s/it]

  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
@@ -179,19 +179,19 @@ To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
@@ -205,16 +205,16 @@ To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  Detected CUDA files, patching ldflags
- Emitting ninja build file /home/minutiae/.cache/torch_extensions/py39_cu113/fused_adam/build.ninja...
  Building extension module fused_adam...
  Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
- [1/3] /usr/local/cuda-11.4/bin/nvcc -ccbin /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda-11.4/include -isystem /home/minutiae/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
- [2/3] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda-11.4/include -isystem /home/minutiae/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o
- [3/3] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda-11.4/lib64 -lcudart -o fused_adam.so
  Loading extension module fused_adam...
  Time to load fused_adam op: 37.18038511276245 seconds
  Loading extension module fused_adam...
@@ -238,7 +238,7 @@ To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
@@ -252,7 +252,7 @@ To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  Detected CUDA files, patching ldflags
- Emitting ninja build file /home/minutiae/.cache/torch_extensions/py39_cu113/fused_adam/build.ninja...
  Building extension module fused_adam...
  Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
@@ -274,14 +274,14 @@ Time to load fused_adam op: 6.415344715118408 seconds
  [2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:134:__init__] Allgather bucket size 500,000,000
  [2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:135:__init__] CPU Offload: False
  [2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:136:__init__] Round robin gradient partitioning: False
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
@@ -294,15 +294,15 @@ huggingface/tokenizers: The current process just got forked, after parallelism h
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
- Emitting ninja build file /home/minutiae/.cache/torch_extensions/py39_cu113/utils/build.ninja...
  Building extension module utils...
  Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
- [1/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx -MMD -MF flatten_unflatten.o.d -DTORCH_EXTENSION_NAME=utils -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /home/minutiae/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -c /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/utils/flatten_unflatten.cpp -o flatten_unflatten.o
- [2/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx flatten_unflatten.o -shared -L/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o utils.so
  Loading extension module utils...
  Time to load utils op: 21.48611044883728 seconds
  Loading extension module utils...
@@ -327,22 +327,22 @@ Rank: 7 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
  Rank: 4 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
  Rank: 6 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
  Rank: 2 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  No modifications detected for re-loaded extension module utils, skipping build step...
  Loading extension module utils...
  Time to load utils op: 0.0016155242919921875 seconds
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  No modifications detected for re-loaded extension module utils, skipping build step...
  Loading extension module utils...
  Time to load utils op: 0.0008933544158935547 seconds
  No modifications detected for re-loaded extension module utils, skipping build step...
  Loading extension module utils...
  Time to load utils op: 0.0008301734924316406 seconds
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  No modifications detected for re-loaded extension module utils, skipping build step...
  Loading extension module utils...
  No modifications detected for re-loaded extension module utils, skipping build step...
@@ -514,7 +514,7 @@ Time to load utils op: 0.0009191036224365234 seconds
514
  "tp_gather_partition_size": 8
515
  }
516
  }
517
- Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
518
  No modifications detected for re-loaded extension module utils, skipping build step...
519
  Loading extension module utils...
520
  Time to load utils op: 0.0014319419860839844 seconds
 
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
  [2023-04-14 06:58:31,332] [WARNING] [runner.py:190:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
+ [2023-04-14 06:58:32,784] [INFO] [runner.py:540:main] cmd = /home/AdamG012/.conda/envs/py39/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets openai/webgpt_comparisons stanfordnlp/SHP --data_split 2,4,4 --model_name_or_path facebook/opt-1.3b --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0.1 --num_train_epochs 2 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 2 --deepspeed --output_dir /lus/chatgpt/hf_runs/DeepSpeedExamples/applications/DeepSpeed-Chat/output/actor-models/1.3b
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
  [2023-04-14 06:59:25,659] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
  [2023-04-14 06:59:25,760] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=8, node_rank=0
  [2023-04-14 06:59:25,760] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
  [2023-04-14 06:59:25,760] [INFO] [launch.py:247:main] dist_world_size=8
  [2023-04-14 06:59:25,760] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
+ /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version!
  warnings.warn(
  [2023-04-14 07:04:01,148] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
 
 
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 8.62it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 8.62it/s]Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]

+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:02<00:02, 2.37s/it]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:02<00:02, 2.37s/it]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 1.76it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:01<00:01, 1.92s/it]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:01<00:01, 1.92s/it]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/2 [00:00<?, ?it/s]


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 1.60it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 1.60it/s]Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

+ Found cached dataset parquet (/reward/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]

  0%| | 0/2 [00:00<?, ?it/s]


  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+ Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+
  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+
  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%| | 0/1 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+ Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/1 [00:00<?, ?it/s]

  0%| | 0/1 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/1 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/1 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/1 [00:00<?, ?it/s]


  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 5.87it/s]Found cached dataset parquet (/grand/projects/BNN-Scale/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+ Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
+
  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 5.87it/s]Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]

+ Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
+ Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
  50%|█████ | 1/2 [00:00<00:00, 3.00it/s]
+ Found cached dataset parquet (/reward/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)

  0%| | 0/2 [00:00<?, ?it/s]
 
 
  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
+ Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
+
  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]

+ Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]


  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/grand/projects/BNN-Scale/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
+ Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
+
  0%| | 0/1 [00:00<?, ?it/s]Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)


  0%| | 0/1 [00:00<?, ?it/s]
+ Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]
+ Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]
+ Found cached dataset webgpt_comparisons (/reward/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)

  0%| | 0/1 [00:00<?, ?it/s]
 
 
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)

  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%| | 0/3 [00:00<?, ?it/s]
  33%|███▎ | 1/3 [00:11<00:22, 11.28s/it]
  33%|███▎ | 1/3 [00:08<00:17, 8.75s/it]
  33%|███▎ | 1/3 [00:01<00:02, 1.24s/it]
  0%| | 0/3 [00:00<?, ?it/s]
  67%|██████▋ | 2/3 [00:02<00:01, 1.03s/it]
  67%|██████▋ | 2/3 [00:09<00:04, 4.31s/it]
  67%|██████▋ | 2/3 [00:12<00:05, 5.35s/it]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)

  33%|███▎ | 1/3 [00:03<00:07, 3.78s/it]
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
+ Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
+
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
+
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
+ Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
+
  0%| | 0/3 [00:00<?, ?it/s]
  33%|███▎ | 1/3 [00:11<00:22, 11.28s/it]
  33%|███▎ | 1/3 [00:08<00:17, 8.75s/it]
  33%|███▎ | 1/3 [00:01<00:02, 1.24s/it]
  0%| | 0/3 [00:00<?, ?it/s]
  67%|██████▋ | 2/3 [00:02<00:01, 1.03s/it]
  67%|██████▋ | 2/3 [00:09<00:04, 4.31s/it]
  67%|██████▋ | 2/3 [00:12<00:05, 5.35s/it]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
+
  33%|███▎ | 1/3 [00:03<00:07, 3.78s/it]
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)

  33%|███▎ | 1/3 [00:00<00:00, 5.25it/s]





  0%| | 0/3 [00:00<?, ?it/s]


  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/grand/projects/BNN-Scale/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
+ Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
+
  0%| | 0/3 [00:00<?, ?it/s]Found cached dataset json (/reward/stanfordnlp___json/stanfordnlp--SHP-10ead9e54f5a107d/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)

  0%| | 0/3 [00:00<?, ?it/s]
  33%|███▎ | 1/3 [00:05<00:11, 5.84s/it]
  33%|███▎ | 1/3 [00:03<00:06, 3.03s/it]
  67%|██████▋ | 2/3 [00:06<00:02, 2.57s/it]
  67%|██████▋ | 2/3 [00:03<00:01, 1.42s/it]

  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...

  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible

  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  Detected CUDA files, patching ldflags
+ Emitting ninja build file /home/AdamG012/.cache/torch_extensions/py39_cu113/fused_adam/build.ninja...
  Building extension module fused_adam...
  Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
+ [1/3] /usr/local/cuda-11.4/bin/nvcc -ccbin /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda-11.4/include -isystem /home/AdamG012/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options '-fPIC' -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o
+ [2/3] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -I/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda-11.4/include -isystem /home/AdamG012/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o
+ [3/3] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda-11.4/lib64 -lcudart -o fused_adam.so
  Loading extension module fused_adam...
  Time to load fused_adam op: 37.18038511276245 seconds
  Loading extension module fused_adam...
 
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  Installed CUDA version 11.4 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible

  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  Detected CUDA files, patching ldflags
+ Emitting ninja build file /home/AdamG012/.cache/torch_extensions/py39_cu113/fused_adam/build.ninja...
  Building extension module fused_adam...
  Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
 
  [2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:134:__init__] Allgather bucket size 500,000,000
  [2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:135:__init__] CPU Offload: False
  [2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:136:__init__] Round robin gradient partitioning: False
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
 
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
+ Emitting ninja build file /home/AdamG012/.cache/torch_extensions/py39_cu113/utils/build.ninja...
  Building extension module utils...
  Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
  huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
  To disable this warning, you can either:
  - Avoid using `tokenizers` before the fork if possible
  - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
+ [1/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx -MMD -MF flatten_unflatten.o.d -DTORCH_EXTENSION_NAME=utils -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /home/AdamG012/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -c /home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/utils/flatten_unflatten.cpp -o flatten_unflatten.o
+ [2/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx flatten_unflatten.o -shared -L/home/AdamG012/.conda/envs/py39/lib/python3.9/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o utils.so
  Loading extension module utils...
  Time to load utils op: 21.48611044883728 seconds
  Loading extension module utils...
 
  Rank: 4 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
  Rank: 6 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
  Rank: 2 partition count [8, 8] and sizes[(164401920, False), (67840, False)]
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  No modifications detected for re-loaded extension module utils, skipping build step...
  Loading extension module utils...
  Time to load utils op: 0.0016155242919921875 seconds
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  No modifications detected for re-loaded extension module utils, skipping build step...
  Loading extension module utils...
  Time to load utils op: 0.0008933544158935547 seconds
  No modifications detected for re-loaded extension module utils, skipping build step...
  Loading extension module utils...
  Time to load utils op: 0.0008301734924316406 seconds
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  No modifications detected for re-loaded extension module utils, skipping build step...
  Loading extension module utils...
  No modifications detected for re-loaded extension module utils, skipping build step...

  "tp_gather_partition_size": 8
  }
  }
+ Using /home/AdamG012/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
  No modifications detected for re-loaded extension module utils, skipping build step...
  Loading extension module utils...
  Time to load utils op: 0.0014319419860839844 seconds