Jayfeather1024 committed
Commit 3022639
1 Parent(s): 50de6a9
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<pad>": 32000
+ }
arguments.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "model_name_or_path": "huggyllama/llama-7b",
+   "max_length": 512,
+   "trust_remote_code": true,
+   "train_datasets": [
+     [
+       "alpaca",
+       {
+         "proportion": 1.0
+       }
+     ]
+   ],
+   "eval_datasets": null,
+   "epochs": 3,
+   "per_device_train_batch_size": 4,
+   "per_device_eval_batch_size": 4,
+   "gradient_accumulation_steps": 16,
+   "gradient_checkpointing": true,
+   "lr": 2e-05,
+   "lr_scheduler_type": "cosine",
+   "lr_warmup_ratio": 0.03,
+   "weight_decay": 0.0,
+   "seed": 42,
+   "fp16": false,
+   "bf16": true,
+   "tf32": true,
+   "eval_strategy": "epoch",
+   "eval_interval": 1000000,
+   "need_eval": false,
+   "eval_split_ratio": null,
+   "output_dir": "/data/jiongxiao_wang/rlhf_attack/safe-rlhf/output/sft",
+   "log_type": "wandb",
+   "log_dir": "/data/jiongxiao_wang/rlhf_attack/safe-rlhf/output/sft",
+   "log_project": "Safe-RLHF-SFT",
+   "log_run_name": "sft-2023-12-31-20-07-40",
+   "save_16bit": false,
+   "save_interval": 1000000,
+   "local_rank": 0,
+   "zero_stage": 3,
+   "deepspeed": false,
+   "deepspeed_config": null,
+   "deepscale": false,
+   "deepscale_config": null,
+   "deepspeed_mpi": false,
+   "global_rank": 0,
+   "device": {
+     "type": "torch.device",
+     "repr": "device(type='cuda', index=0)"
+   },
+   "num_update_steps_per_epoch": 204,
+   "total_training_steps": 612
+ }
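
Note: the step counts recorded above follow from the batch configuration. Each optimizer update sees per_device_train_batch_size × WORLD_SIZE × gradient_accumulation_steps = 4 × 4 × 16 = 256 sequences, and with the Stanford Alpaca set of 52,002 examples (an assumption; the dataset size is not stored in this commit) that yields the 204 steps per epoch and 612 total steps listed. A minimal Python check:

    import math

    num_examples = 52_002   # Stanford Alpaca instruction set (assumed)
    per_device_batch = 4    # per_device_train_batch_size
    world_size = 4          # GPUs, per WORLD_SIZE in environ.txt
    grad_accum = 16         # gradient_accumulation_steps
    epochs = 3

    effective_batch = per_device_batch * world_size * grad_accum  # 256
    steps_per_epoch = math.ceil(num_examples / effective_batch)   # 204
    total_steps = steps_per_epoch * epochs                        # 612
    print(effective_batch, steps_per_epoch, total_steps)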
arguments.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:558c472797170401090f0c1a08e8d1c8d31bcad35438ef6134aceb8a269cc318
+ size 1019
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "_name_or_path": "huggyllama/llama-7b",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 11008,
+   "max_position_embeddings": 2048,
+   "max_sequence_length": 2048,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 32,
+   "pad_token_id": 32000,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.31.0",
+   "use_cache": true,
+   "vocab_size": 32001
+ }
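
Note: vocab_size is 32001 rather than the base LLaMA 32,000 because of the <pad> token registered in added_tokens.json. A minimal sanity-check sketch for loading the finished checkpoint with Hugging Face transformers; the ./sft path is a hypothetical local clone of this repo:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    path = "./sft"  # hypothetical local directory holding the files in this commit
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForCausalLM.from_pretrained(path)

    # The embedding matrix already contains the extra <pad> row.
    assert model.config.vocab_size == len(tokenizer) == 32001
    assert tokenizer.convert_tokens_to_ids("<pad>") == model.config.pad_token_id == 32000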
environ.txt ADDED
@@ -0,0 +1,244 @@
+ ADDR2LINE=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-addr2line
+ AR=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-ar
+ AS=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-as
+ BASH_FUNC__spack_shell_wrapper()=() { for var in LD_LIBRARY_PATH DYLD_LIBRARY_PATH DYLD_FALLBACK_LIBRARY_PATH;
+     do
+         eval "if [ -n \"\${${var}-}\" ]; then export SPACK_$var=\${${var}}; fi";
+     done;
+     if [ -n "${ZSH_VERSION:-}" ]; then
+         emulate -L sh;
+     fi;
+     _sp_flags="";
+     while [ ! -z ${1+x} ] && [ "${1#-}" != "${1}" ]; do
+         _sp_flags="$_sp_flags $1";
+         shift;
+     done;
+     if [ -n "$_sp_flags" ] && [ "${_sp_flags#*h}" != "${_sp_flags}" ] || [ "${_sp_flags#*V}" != "${_sp_flags}" ]; then
+         command spack $_sp_flags "$@";
+         return;
+     fi;
+     _sp_subcommand="";
+     if [ ! -z ${1+x} ]; then
+         _sp_subcommand="$1";
+         shift;
+     fi;
+     case $_sp_subcommand in
+         "cd")
+             _sp_arg="";
+             if [ -n "$1" ]; then
+                 _sp_arg="$1";
+                 shift;
+             fi;
+             if [ "$_sp_arg" = "-h" ] || [ "$_sp_arg" = "--help" ]; then
+                 command spack cd -h;
+             else
+                 LOC="$(spack location $_sp_arg "$@")";
+                 if [ -d "$LOC" ]; then
+                     cd "$LOC";
+                 else
+                     return 1;
+                 fi;
+             fi;
+             return
+             ;;
+         "env")
+             _sp_arg="";
+             if [ -n "$1" ]; then
+                 _sp_arg="$1";
+                 shift;
+             fi;
+             if [ "$_sp_arg" = "-h" ] || [ "$_sp_arg" = "--help" ]; then
+                 command spack env -h;
+             else
+                 case $_sp_arg in
+                     activate)
+                         _a=" $@";
+                         if [ -z ${1+x} ] || [ "${_a#* --sh}" != "$_a" ] || [ "${_a#* --csh}" != "$_a" ] || [ "${_a#* -h}" != "$_a" ] || [ "${_a#* --help}" != "$_a" ]; then
+                             command spack env activate "$@";
+                         else
+                             stdout="$(command spack $_sp_flags env activate --sh "$@")" || return;
+                             eval "$stdout";
+                         fi
+                         ;;
+                     deactivate)
+                         _a=" $@";
+                         if [ "${_a#* --sh}" != "$_a" ] || [ "${_a#* --csh}" != "$_a" ]; then
+                             command spack env deactivate "$@";
+                         else
+                             if [ -n "$*" ]; then
+                                 command spack env deactivate -h;
+                             else
+                                 stdout="$(command spack $_sp_flags env deactivate --sh)" || return;
+                                 eval "$stdout";
+                             fi;
+                         fi
+                         ;;
+                     *)
+                         command spack env $_sp_arg "$@"
+                         ;;
+                 esac;
+             fi;
+             return
+             ;;
+         "load" | "unload")
+             _a=" $@";
+             if [ "${_a#* --sh}" != "$_a" ] || [ "${_a#* --csh}" != "$_a" ] || [ "${_a#* -h}" != "$_a" ] || [ "${_a#* --list}" != "$_a" ] || [ "${_a#* --help}" != "$_a" ]; then
+                 command spack $_sp_flags $_sp_subcommand "$@";
+             else
+                 stdout="$(command spack $_sp_flags $_sp_subcommand --sh "$@")" || return;
+                 eval "$stdout";
+             fi
+             ;;
+         *)
+             command spack $_sp_flags $_sp_subcommand "$@"
+             ;;
+     esac
+ }
+ BASH_FUNC_module()=() { eval `/usr/bin/modulecmd bash $*`
+ }
+ BASH_FUNC_spack()=() { : this is a shell function from: /nfs/cluster/spack/share/spack/setup-env.sh;
+     : the real spack script is here: /nfs/cluster/spack/bin/spack;
+     _spack_shell_wrapper "$@";
+     return $?
+ }
+ BUILD=x86_64-conda-linux-gnu
+ CC=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-cc
+ CC_FOR_BUILD=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-cc
+ CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/jiongxiao_wang/anaconda3/envs/safe-rlhf/include
+ CMAKE_ARGS=-DCMAKE_AR=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-ar -DCMAKE_CXX_COMPILER_AR=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-gcc-ar -DCMAKE_C_COMPILER_AR=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-gcc-ar -DCMAKE_RANLIB=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-ranlib -DCMAKE_CXX_COMPILER_RANLIB=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-gcc-ranlib -DCMAKE_C_COMPILER_RANLIB=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-gcc-ranlib -DCMAKE_LINKER=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-ld -DCMAKE_STRIP=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-strip -DCMAKE_BUILD_TYPE=Release
+ CMAKE_PREFIX_PATH=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf:/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/x86_64-conda-linux-gnu/sysroot/usr
+ COLORTERM=truecolor
+ CONDA_BUILD_SYSROOT=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/x86_64-conda-linux-gnu/sysroot
+ CONDA_DEFAULT_ENV=safe-rlhf
+ CONDA_EXE=/data/jiongxiao_wang/anaconda3/bin/conda
+ CONDA_PREFIX=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf
+ CONDA_PREFIX_1=/data/jiongxiao_wang/anaconda3
+ CONDA_PROMPT_MODIFIER=(safe-rlhf)
+ CONDA_PYTHON_EXE=/data/jiongxiao_wang/anaconda3/bin/python
+ CONDA_SHLVL=2
+ CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
+ CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
+ CPP=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-cpp
+ CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/jiongxiao_wang/anaconda3/envs/safe-rlhf/include
+ CROSS_RANK=0
+ CROSS_SIZE=1
+ CUDA_MODULE_LOADING=LAZY
+ CUDA_VISIBLE_DEVICES=0,1,2,3
+ CXX=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-c++
+ CXXFILT=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-c++filt
+ CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/jiongxiao_wang/anaconda3/envs/safe-rlhf/include
+ CXX_FOR_BUILD=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-c++
+ DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/jiongxiao_wang/anaconda3/envs/safe-rlhf/include
+ DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/jiongxiao_wang/anaconda3/envs/safe-rlhf/include
+ DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/jiongxiao_wang/anaconda3/envs/safe-rlhf/include
+ ELFEDIT=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-elfedit
+ ENVIRONMENT=BATCH
+ GCC=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-gcc
+ GCC_AR=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-gcc-ar
+ GCC_NM=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-gcc-nm
+ GCC_RANLIB=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-gcc-ranlib
+ GPROF=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-gprof
+ GPU_DEVICE_ORDINAL=0,1,2,3
+ GXX=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-g++
+ HOME=/data/jiongxiao_wang
+ HOST=x86_64-conda-linux-gnu
+ HOSTNAME=compute-permanent-node-153
+ LANG=en_US.UTF-8
+ LD=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-ld
+ LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/lib -Wl,-rpath-link,/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/lib -L/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/lib
+ LD_GOLD=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-ld.gold
+ LD_LIBRARY_PATH=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/lib:/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/lib:
+ LESSOPEN=||/usr/bin/lesspipe.sh %s
+ LOADEDMODULES=
+ LOCAL_RANK=0
+ LOCAL_SIZE=4
+ LOGLEVEL=WARNING
+ LOGNAME=jiongxiao_wang
+ LS_COLORS=rs=0:di=38;5;27:ln=38;5;51:mh=44;38;5;15:pi=40;38;5;11:so=38;5;13:do=38;5;5:bd=48;5;232;38;5;11:cd=48;5;232;38;5;3:or=48;5;232;38;5;9:mi=05;48;5;232;38;5;15:su=48;5;196;38;5;15:sg=48;5;11;38;5;16:ca=48;5;196;38;5;226:tw=48;5;10;38;5;16:ow=48;5;10;38;5;21:st=48;5;21;38;5;15:ex=38;5;34:*.tar=38;5;9:*.tgz=38;5;9:*.arc=38;5;9:*.arj=38;5;9:*.taz=38;5;9:*.lha=38;5;9:*.lz4=38;5;9:*.lzh=38;5;9:*.lzma=38;5;9:*.tlz=38;5;9:*.txz=38;5;9:*.tzo=38;5;9:*.t7z=38;5;9:*.zip=38;5;9:*.z=38;5;9:*.Z=38;5;9:*.dz=38;5;9:*.gz=38;5;9:*.lrz=38;5;9:*.lz=38;5;9:*.lzo=38;5;9:*.xz=38;5;9:*.bz2=38;5;9:*.bz=38;5;9:*.tbz=38;5;9:*.tbz2=38;5;9:*.tz=38;5;9:*.deb=38;5;9:*.rpm=38;5;9:*.jar=38;5;9:*.war=38;5;9:*.ear=38;5;9:*.sar=38;5;9:*.rar=38;5;9:*.alz=38;5;9:*.ace=38;5;9:*.zoo=38;5;9:*.cpio=38;5;9:*.7z=38;5;9:*.rz=38;5;9:*.cab=38;5;9:*.jpg=38;5;13:*.jpeg=38;5;13:*.gif=38;5;13:*.bmp=38;5;13:*.pbm=38;5;13:*.pgm=38;5;13:*.ppm=38;5;13:*.tga=38;5;13:*.xbm=38;5;13:*.xpm=38;5;13:*.tif=38;5;13:*.tiff=38;5;13:*.png=38;5;13:*.svg=38;5;13:*.svgz=38;5;13:*.mng=38;5;13:*.pcx=38;5;13:*.mov=38;5;13:*.mpg=38;5;13:*.mpeg=38;5;13:*.m2v=38;5;13:*.mkv=38;5;13:*.webm=38;5;13:*.ogm=38;5;13:*.mp4=38;5;13:*.m4v=38;5;13:*.mp4v=38;5;13:*.vob=38;5;13:*.qt=38;5;13:*.nuv=38;5;13:*.wmv=38;5;13:*.asf=38;5;13:*.rm=38;5;13:*.rmvb=38;5;13:*.flc=38;5;13:*.avi=38;5;13:*.fli=38;5;13:*.flv=38;5;13:*.gl=38;5;13:*.dl=38;5;13:*.xcf=38;5;13:*.xwd=38;5;13:*.yuv=38;5;13:*.cgm=38;5;13:*.emf=38;5;13:*.axv=38;5;13:*.anx=38;5;13:*.ogv=38;5;13:*.ogx=38;5;13:*.aac=38;5;45:*.au=38;5;45:*.flac=38;5;45:*.mid=38;5;45:*.midi=38;5;45:*.mka=38;5;45:*.mp3=38;5;45:*.mpc=38;5;45:*.ogg=38;5;45:*.ra=38;5;45:*.wav=38;5;45:*.axa=38;5;45:*.oga=38;5;45:*.spx=38;5;45:*.xspf=38;5;45:
+ MAIL=/var/mail/jiongxiao_wang
+ MASTER_ADDR=127.0.0.1
+ MASTER_PORT=56337
+ MESON_ARGS=--buildtype release
+ MODULEPATH=/usr/share/Modules/modulefiles:/etc/modulefiles
+ MODULESHOME=/usr/share/Modules
+ NIX_CONF_DIR=/nix
+ NM=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-nm
+ OBJCOPY=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-objcopy
+ OBJDUMP=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-objdump
+ PATH=/nix/var/nix/profiles/default/bin:/data/jiongxiao_wang/.nix-profile/bin:/data/jiongxiao_wang/.vscode-server/bin/899d46d82c4c95423fb7e10e68eba52050e30ba3/bin:/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin:/data/jiongxiao_wang/anaconda3/condabin:/nix/var/nix/profiles/default/bin:/data/jiongxiao_wang/.nix-profile/bin:/nfs/cluster/spack/bin:/usr/local/bin:/usr/bin:/var/lib/snapd/snap/bin
+ PWD=/data/jiongxiao_wang/rlhf_attack/safe-rlhf
+ PYTHONHASHSEED=42
+ PYTHONPATH=/data/jiongxiao_wang/rlhf_attack/safe-rlhf
+ RANK=0
+ RANLIB=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-ranlib
+ READELF=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-readelf
+ ROCR_VISIBLE_DEVICES=0,1,2,3
+ SHELL=/bin/bash
+ SHLVL=6
+ SIZE=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-size
+ SLURMD_NODENAME=compute-permanent-node-153
+ SLURM_CLUSTER_NAME=cluster
+ SLURM_CONF=/var/spool/slurmd/conf-cache/slurm.conf
+ SLURM_CPUS_ON_NODE=8
+ SLURM_GPUS_ON_NODE=4
+ SLURM_GTIDS=0
+ SLURM_JOBID=1031281
+ SLURM_JOB_ACCOUNT=chaowei_xiao
+ SLURM_JOB_CPUS_PER_NODE=8
+ SLURM_JOB_END_TIME=1704226008
+ SLURM_JOB_GID=10043
+ SLURM_JOB_GPUS=2,3,5,6
+ SLURM_JOB_ID=1031281
+ SLURM_JOB_NAME=rlhf
+ SLURM_JOB_NODELIST=compute-permanent-node-153
+ SLURM_JOB_NUM_NODES=1
+ SLURM_JOB_PARTITION=compute
+ SLURM_JOB_QOS=default_qos
+ SLURM_JOB_START_TIME=1704053208
+ SLURM_JOB_UID=10193
+ SLURM_JOB_USER=jiongxiao_wang
+ SLURM_LOCALID=0
+ SLURM_MEM_PER_NODE=40960
+ SLURM_NNODES=1
+ SLURM_NODEID=0
+ SLURM_NODELIST=compute-permanent-node-153
+ SLURM_NODE_ALIASES=(null)
+ SLURM_NPROCS=1
+ SLURM_NTASKS=1
+ SLURM_PRIO_PROCESS=0
+ SLURM_PROCID=0
+ SLURM_SUBMIT_DIR=/data/jiongxiao_wang/rlhf_attack/safe-rlhf
+ SLURM_SUBMIT_HOST=watch-tower-login
+ SLURM_TASKS_PER_NODE=1
+ SLURM_TASK_PID=189456
+ SLURM_TOPOLOGY_ADDR=watch-tower.watch-tower:f102d2c503fbef087183246a.compute-permanent-node-153
+ SLURM_TOPOLOGY_ADDR_PATTERN=switch.switch.node
+ SLURM_WORKING_CLUSTER=cluster:watch-tower-bastion:6817:9984:109
+ SPACK_PYTHON=/usr/bin/python3
+ SPACK_ROOT=/nfs/cluster/spack
+ SSH_CLIENT=73.208.16.93 60006 22
+ SSH_CONNECTION=73.208.16.93 60006 172.16.0.238 22
+ STRINGS=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-strings
+ STRIP=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/x86_64-conda-linux-gnu-strip
+ TERM=xterm-256color
+ TERM_PROGRAM=vscode
+ TERM_PROGRAM_VERSION=1.63.2
+ TF2_BEHAVIOR=1
+ TF_CPP_MIN_LOG_LEVEL=1
+ TMPDIR=/tmp
+ TPU_ML_PLATFORM=Tensorflow
+ USER=jiongxiao_wang
+ VSCODE_IPC_HOOK_CLI=/run/user/10193/vscode-ipc-dc9d7ab9-84f5-43c9-9d9b-fbbb0ce088b6.sock
+ WANDB_API_KEY=f6021dca133c93e80a7dae4620bd335d4d08cac6
+ WANDB_SERVICE=2-189882-tcp-localhost-56894
+ WORLD_SIZE=4
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
+ XDG_RUNTIME_DIR=/run/user/10193
+ XDG_SESSION_ID=604022
+ ZE_AFFINITY_MASK=0,1,2,3
+ _=/data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/deepspeed
+ _CE_CONDA=
+ _CE_M=
+ _CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
+ build_alias=x86_64-conda-linux-gnu
+ host_alias=x86_64-conda-linux-gnu
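
Note: MASTER_ADDR, MASTER_PORT, RANK, LOCAL_RANK, and WORLD_SIZE above are the standard rendezvous variables consumed by torch.distributed. A minimal sketch of how a launcher-spawned worker typically picks them up (illustrative, not the safe-rlhf code itself):

    import os
    import torch
    import torch.distributed as dist

    rank = int(os.environ.get("RANK", 0))
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))

    # env:// rendezvous reads MASTER_ADDR/MASTER_PORT (127.0.0.1:56337 above).
    dist.init_process_group(backend="nccl", init_method="env://",
                            rank=rank, world_size=world_size)
    torch.cuda.set_device(local_rank)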
latest ADDED
@@ -0,0 +1 @@
+ global_step609
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f72e5f4678e96c70fadc38a9c9aa8fbacef8613f1487e8dd78bcb84bc9fa2e1
+ size 26953811021
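
Note: the size is consistent with an fp32 export of a 7B-parameter LLaMA. stdout.log (below) reports 6,738,423,808 reconstructed elements; at 4 bytes each that is 26,953,695,232 bytes, and the remaining ~113 KiB is state-dict pickle framing:

    params = 6_738_423_808              # elements reported by zero_to_fp32 in stdout.log
    raw_fp32_bytes = params * 4         # 26,953,695,232
    file_bytes = 26_953_811_021         # LFS pointer size above
    print(file_bytes - raw_fp32_bytes)  # 115789 bytes of framing overhead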
script.sh ADDED
@@ -0,0 +1,111 @@
+ #!/usr/bin/env bash
+ #
+ # Copyright 2023 PKU-Alignment Team. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+ export WANDB_API_KEY="f6021dca133c93e80a7dae4620bd335d4d08cac6"
+
+ if [ -z "${BASH_VERSION}" ]; then
+     echo "Please use bash to run this script." >&2
+     exit 1
+ fi
+
+ set -x
+
+ SCRIPT_DIR="$(cd "$(dirname "$0")" &>/dev/null && pwd)"
+ ROOT_DIR="$(dirname "${SCRIPT_DIR}")"
+ export PYTHONPATH="${ROOT_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
+ export LOGLEVEL="${LOGLEVEL:-WARNING}"
+
+ MODEL_NAME_OR_PATH="huggyllama/llama-7b"
+ OUTPUT_DIR="${ROOT_DIR}/output/sft"
+ ZERO_STAGE=3
+ while [[ "$#" -gt 0 ]]; do
+     arg="$1"
+     shift
+     case "${arg}" in
+         --model_name_or_path)
+             MODEL_NAME_OR_PATH="$1"
+             shift
+             ;;
+         --model_name_or_path=*)
+             MODEL_NAME_OR_PATH="${arg#*=}"
+             ;;
+         --output_dir)
+             OUTPUT_DIR="$1"
+             shift
+             ;;
+         --output_dir=*)
+             OUTPUT_DIR="${arg#*=}"
+             ;;
+         --zero_stage)
+             ZERO_STAGE="$1"
+             shift
+             ;;
+         --zero_stage=*)
+             ZERO_STAGE="${arg#*=}"
+             ;;
+         *)
+             echo "Unknown parameter passed: '${arg}'" >&2
+             exit 1
+             ;;
+     esac
+ done
+
+ mkdir -p "${OUTPUT_DIR}"
+ OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
+ if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
+     echo '*' >"${OUTPUT_DIR}/.gitignore"
+ fi
+
+ cp -f "$0" "${OUTPUT_DIR}/script.sh"
+
+ if [[ -z "${WANDB_API_KEY}" ]]; then
+     export WANDB_MODE="offline"
+ fi
+
+ MASTER_PORT_START=10000
+ MASTER_PORT_END=65535
+ MASTER_PORT="$(
+     comm -23 \
+         <(seq "${MASTER_PORT_START}" "${MASTER_PORT_END}" | sort) \
+         <(ss -Htan | awk '{ print $4 }' | awk -F ':' '{ print $NF }' | sort -u) |
+         shuf | head -n 1
+ )"
+
+ exec 1> >(tee "${OUTPUT_DIR}/stdout.log" >&1) 2> >(tee "${OUTPUT_DIR}/stderr.log" >&2)
+
+ deepspeed --num_nodes=1 --num_gpus=4 \
+     --master_port "${MASTER_PORT}" \
+     --module safe_rlhf.finetune \
+     --train_datasets alpaca \
+     --model_name_or_path "${MODEL_NAME_OR_PATH}" \
+     --max_length 512 \
+     --trust_remote_code True \
+     --epochs 3 \
+     --per_device_train_batch_size 4 \
+     --per_device_eval_batch_size 4 \
+     --gradient_accumulation_steps 16 \
+     --gradient_checkpointing \
+     --learning_rate 2e-5 \
+     --lr_scheduler_type cosine \
+     --lr_warmup_ratio 0.03 \
+     --weight_decay 0.0 \
+     --seed 42 \
+     --output_dir "${OUTPUT_DIR}" \
+     --log_type wandb \
+     --log_project Safe-RLHF-SFT \
+     --zero_stage "${ZERO_STAGE}" \
+     --bf16 True \
+     --tf32 True
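
Note: the MASTER_PORT block above picks a random TCP port not currently in use by diffing the candidate range against ss output. A rough Python equivalent of the same idea (a sketch, not part of the repo) is to let the kernel hand out a free port; like the shell version it is racy, since the port could be claimed between selection and use:

    import socket

    def find_free_port() -> int:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("127.0.0.1", 0))  # port 0 -> OS assigns an unused port
            return s.getsockname()[1]

    print(find_free_port())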
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
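
Note: the pad_token entry pairs with added_tokens.json (<pad> → 32000). LLaMA ships without a pad token, so the SFT code adds one and grows the embedding matrix by one row, which is why config.json reports vocab_size 32001. A hedged sketch of that setup step using standard transformers calls (not necessarily the repo's exact code):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
    model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b")

    # Register <pad> (id 32000) and add a matching embedding row: 32000 -> 32001.
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id  # 32000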
stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
stdout.log ADDED
@@ -0,0 +1,41 @@
+ [2023-12-31 20:06:51,176] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2023-12-31 20:06:55,387] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
+ Detected CUDA_VISIBLE_DEVICES=0,1,2,3 but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed.
+ [2023-12-31 20:06:55,387] [INFO] [runner.py:571:main] cmd = /data/jiongxiao_wang/anaconda3/envs/safe-rlhf/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=56337 --module --enable_each_rank_log=None safe_rlhf.finetune --train_datasets alpaca --model_name_or_path huggyllama/llama-7b --max_length 512 --trust_remote_code True --epochs 3 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --gradient_accumulation_steps 16 --gradient_checkpointing --learning_rate 2e-5 --lr_scheduler_type cosine --lr_warmup_ratio 0.03 --weight_decay 0.0 --seed 42 --output_dir /data/jiongxiao_wang/rlhf_attack/safe-rlhf/output/sft --log_type wandb --log_project Safe-RLHF-SFT --zero_stage 3 --bf16 True --tf32 True
+ [2023-12-31 20:06:57,487] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2023-12-31 20:07:00,478] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]}
+ [2023-12-31 20:07:00,478] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=4, node_rank=0
+ [2023-12-31 20:07:00,478] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3]})
+ [2023-12-31 20:07:00,478] [INFO] [launch.py:163:main] dist_world_size=4
+ [2023-12-31 20:07:00,478] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
+ [2023-12-31 20:07:02,815] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2023-12-31 20:07:02,856] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2023-12-31 20:07:03,011] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2023-12-31 20:07:03,040] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2023-12-31 20:07:10,639] [INFO] [comm.py:637:init_distributed] cdb=None
+ [2023-12-31 20:07:10,640] [INFO] [comm.py:637:init_distributed] cdb=None
+ [2023-12-31 20:07:10,640] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+ [2023-12-31 20:07:10,670] [INFO] [comm.py:637:init_distributed] cdb=None
+ [2023-12-31 20:07:10,675] [INFO] [comm.py:637:init_distributed] cdb=None
+ Set logger level to WARNING.
+ ninja: no work to do.
+ Time to load fused_adam op: 0.14865803718566895 seconds
+ Time to load fused_adam op: 0.2057504653930664 seconds
+ Time to load fused_adam op: 0.20213913917541504 seconds
+ Time to load fused_adam op: 0.2022261619567871 seconds
+ Parameter Offload: Total persistent parameters: 266240 in 65 params
+ ***** Running training *****
+ Saving model to "/data/jiongxiao_wang/rlhf_attack/safe-rlhf/output/sft" ...
+ Saving DeepSpeed Checkpoints...
+ Converting DeepSpeed Checkpoints to Hugging Face format...
+ [2023-12-31 21:51:42,560] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ Processing zero checkpoint './global_step609'
+ Detected checkpoint of type zero stage 3, world_size: 4
+ Parsing checkpoint created by deepspeed==0.12.6
+ Reconstructed Trainable fp32 state dict with 291 params 6738423808 elements
+ Saving fp32 state dict to pytorch_model.bin
+ Model saved!
+ [2023-12-31 21:52:50,198] [INFO] [launch.py:347:main] Process 189883 exits successfully.
+ [2023-12-31 21:52:50,198] [INFO] [launch.py:347:main] Process 189885 exits successfully.
+ [2023-12-31 21:52:50,198] [INFO] [launch.py:347:main] Process 189884 exits successfully.
+ [2023-12-31 21:52:58,206] [INFO] [launch.py:347:main] Process 189882 exits successfully.
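
Note: the timestamps bracket the run: distributed init completes around 20:07:10 and checkpoint conversion starts at 21:51:42, so the 612 optimizer steps took roughly 104 minutes. A back-of-the-envelope rate (an upper bound per step, since the window also includes model loading and checkpoint saving):

    from datetime import datetime

    start = datetime(2023, 12, 31, 20, 7, 10)   # init_distributed complete
    end = datetime(2023, 12, 31, 21, 51, 42)    # conversion begins
    steps = 612                                 # total_training_steps
    print((end - start).total_seconds() / steps)  # ~10.2 s per step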
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": false,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "legacy": true,
+   "model_max_length": 512,
+   "pad_token": null,
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
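
Note: pad_token is null here; when the tokenizer is loaded from this directory, the <pad> token is picked up from special_tokens_map.json and added_tokens.json instead. model_max_length is 512, matching max_length in arguments.json, with right-side padding. A small illustrative use (the ./sft path is again hypothetical):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("./sft")

    batch = tokenizer(
        ["Hello, world."],
        padding="max_length",                   # right-pads with <pad> (id 32000)
        truncation=True,
        max_length=tokenizer.model_max_length,  # 512 per tokenizer_config.json
        return_tensors="pt",
    )
    print(batch.input_ids.shape)  # torch.Size([1, 512])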
zero_to_fp32.py ADDED
@@ -0,0 +1,587 @@
+ #!/usr/bin/env python
+
+ # Copyright (c) Microsoft Corporation.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # DeepSpeed Team
+
+ # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+ # application.
+ #
+ # example: python zero_to_fp32.py . pytorch_model.bin
+
+ import argparse
+ import torch
+ import glob
+ import math
+ import os
+ import re
+ from collections import OrderedDict
+ from dataclasses import dataclass
+
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+ # DeepSpeed data structures it has to be available in the current python environment.
+ from deepspeed.utils import logger
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                             FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                             FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+ @dataclass
+ class zero_model_state:
+     buffers: dict()
+     param_shapes: dict()
+     shared_params: list
+     ds_version: int
+     frozen_param_shapes: dict()
+     frozen_param_fragments: dict()
+
+
+ debug = 0
+
+ # load to cpu
+ device = torch.device('cpu')
+
+
+ def atoi(text):
+     return int(text) if text.isdigit() else text
+
+
+ def natural_keys(text):
+     '''
+     alist.sort(key=natural_keys) sorts in human order
+     http://nedbatchelder.com/blog/200712/human_sorting.html
+     (See Toothy's implementation in the comments)
+     '''
+     return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+ def get_model_state_file(checkpoint_dir, zero_stage):
+     if not os.path.isdir(checkpoint_dir):
+         raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+     # there should be only one file
+     if zero_stage <= 2:
+         file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+     elif zero_stage == 3:
+         file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+     if not os.path.exists(file):
+         raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+     return file
+
+
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
+     # XXX: need to test that this simple glob rule works for multi-node setup too
+     ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+     if len(ckpt_files) == 0:
+         raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+     return ckpt_files
+
+
+ def get_optim_files(checkpoint_dir):
+     return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+ def get_model_state_files(checkpoint_dir):
+     return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+ def parse_model_states(files):
+     zero_model_states = []
+     for file in files:
+         state_dict = torch.load(file, map_location=device)
+
+         if BUFFER_NAMES not in state_dict:
+             raise ValueError(f"{file} is not a model state checkpoint")
+         buffer_names = state_dict[BUFFER_NAMES]
+         if debug:
+             print("Found buffers:", buffer_names)
+
+         # recover just the buffers while restoring them to fp32 if they were saved in fp16
+         buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+         param_shapes = state_dict[PARAM_SHAPES]
+
+         # collect parameters that are included in param_shapes
+         param_names = []
+         for s in param_shapes:
+             for name in s.keys():
+                 param_names.append(name)
+
+         # update with frozen parameters
+         frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+         if frozen_param_shapes is not None:
+             if debug:
+                 print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+             param_names += list(frozen_param_shapes.keys())
+
+         # handle shared params
+         shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+         ds_version = state_dict.get(DS_VERSION, None)
+
+         frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+         z_model_state = zero_model_state(buffers=buffers,
+                                          param_shapes=param_shapes,
+                                          shared_params=shared_params,
+                                          ds_version=ds_version,
+                                          frozen_param_shapes=frozen_param_shapes,
+                                          frozen_param_fragments=frozen_param_fragments)
+         zero_model_states.append(z_model_state)
+
+     return zero_model_states
+
+
+ def parse_optim_states(files, ds_checkpoint_dir):
+
+     total_files = len(files)
+     state_dicts = []
+     for f in files:
+         state_dict = torch.load(f, map_location=device)
+         # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+         # and also handle the case where it was already removed by another helper script
+         state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+         state_dicts.append(state_dict)
+
+     if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+         raise ValueError(f"{files[0]} is not a zero checkpoint")
+     zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+     world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+     # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+     # parameters can be different from data parallelism for non-expert parameters. So we can just
+     # use the max of the partition_count to get the dp world_size.
+
+     if type(world_size) is list:
+         world_size = max(world_size)
+
+     if world_size != total_files:
+         raise ValueError(
+             f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+             "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+         )
+
+     # the groups are named differently in each stage
+     if zero_stage <= 2:
+         fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+     elif zero_stage == 3:
+         fp32_groups_key = FP32_FLAT_GROUPS
+     else:
+         raise ValueError(f"unknown zero stage {zero_stage}")
+
+     if zero_stage <= 2:
+         fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+     elif zero_stage == 3:
+         # if there is more than one param group, there will be multiple flattened tensors - one
+         # flattened tensor per group - for simplicity merge them into a single tensor
+         #
+         # XXX: could make the script more memory efficient for when there are multiple groups - it
+         # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+         fp32_flat_groups = [
+             torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+         ]
+
+     return zero_stage, world_size, fp32_flat_groups
+
+
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+     """
+     Returns fp32 state_dict reconstructed from ds checkpoint
+
+     Args:
+         - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+     """
+     print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+     optim_files = get_optim_files(ds_checkpoint_dir)
+     zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+     print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+     model_files = get_model_state_files(ds_checkpoint_dir)
+
+     zero_model_states = parse_model_states(model_files)
+     print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+     if zero_stage <= 2:
+         return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+     elif zero_stage == 3:
+         return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+
+
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
+     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+         return
+
+     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+     frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+     if debug:
+         num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+         print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+     wanted_params = len(frozen_param_shapes)
+     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+     avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+     print(f'Frozen params: Have {avail_numel} numels to process.')
+     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+     total_params = 0
+     total_numel = 0
+     for name, shape in frozen_param_shapes.items():
+         total_params += 1
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+
+         state_dict[name] = frozen_param_fragments[name]
+
+         if debug:
+             print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+     param_shapes = zero_model_states[0].param_shapes
+
+     # Reconstruction protocol:
+     #
+     # XXX: document this
+
+     if debug:
+         for i in range(world_size):
+             for j in range(len(fp32_flat_groups[0])):
+                 print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+     # XXX: memory usage doubles here (zero2)
+     num_param_groups = len(fp32_flat_groups[0])
+     merged_single_partition_of_fp32_groups = []
+     for i in range(num_param_groups):
+         merged_partitions = [sd[i] for sd in fp32_flat_groups]
+         full_single_fp32_vector = torch.cat(merged_partitions, 0)
+         merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+     avail_numel = sum(
+         [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+     if debug:
+         wanted_params = sum([len(shapes) for shapes in param_shapes])
+         wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+         # not asserting if there is a mismatch due to possible padding
+         print(f"Have {avail_numel} numels to process.")
+         print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # out-of-core computing solution
+     total_numel = 0
+     total_params = 0
+     for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+         offset = 0
+         avail_numel = full_single_fp32_vector.numel()
+         for name, shape in shapes.items():
+
+             unpartitioned_numel = shape.numel()
+             total_numel += unpartitioned_numel
+             total_params += 1
+
+             if debug:
+                 print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+             state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+             offset += unpartitioned_numel
+
+         # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+         # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+         # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+         # live optimizer object, so we are checking that the numbers are within the right range
+         align_to = 2 * world_size
+
+         def zero2_align(x):
+             return align_to * math.ceil(x / align_to)
+
+         if debug:
+             print(f"original offset={offset}, avail_numel={avail_numel}")
+
+         offset = zero2_align(offset)
+         avail_numel = zero2_align(avail_numel)
+
+         if debug:
+             print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+         # Sanity check
+         if offset != avail_numel:
+             raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+     print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+     state_dict = OrderedDict()
+
+     # buffers
+     buffers = zero_model_states[0].buffers
+     state_dict.update(buffers)
+     if debug:
+         print(f"added {len(buffers)} buffers")
+
+     _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+     _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+     # recover shared parameters
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+     remainder = unpartitioned_numel % world_size
+     padding_numel = (world_size - remainder) if remainder else 0
+     partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+     return partitioned_numel, padding_numel
+
+
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+         return
+
+     if debug:
+         for i in range(world_size):
+             num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+             print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+     wanted_params = len(frozen_param_shapes)
+     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+     avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+     print(f'Frozen params: Have {avail_numel} numels to process.')
+     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+     total_params = 0
+     total_numel = 0
+     for name, shape in zero_model_states[0].frozen_param_shapes.items():
+         total_params += 1
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+
+         param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+         state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+     param_shapes = zero_model_states[0].param_shapes
+     avail_numel = fp32_flat_groups[0].numel() * world_size
+     # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+     # param, re-consolidating each param, while dealing with padding if any
+
+     # merge list of dicts, preserving order
+     param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+     if debug:
+         for i in range(world_size):
+             print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+         wanted_params = len(param_shapes)
+         wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+         # not asserting if there is a mismatch due to possible padding
+         avail_numel = fp32_flat_groups[0].numel() * world_size
+         print(f"Trainable params: Have {avail_numel} numels to process.")
+         print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # out-of-core computing solution
+     offset = 0
+     total_numel = 0
+     total_params = 0
+     for name, shape in param_shapes.items():
+
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+         total_params += 1
+
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+         # XXX: memory usage doubles here
+         state_dict[name] = torch.cat(
+             tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+             0).narrow(0, 0, unpartitioned_numel).view(shape)
+         offset += partitioned_numel
+
+     offset *= world_size
+
+     # Sanity check
+     if offset != avail_numel:
+         raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+     print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+     state_dict = OrderedDict()
+
+     # buffers
+     buffers = zero_model_states[0].buffers
+     state_dict.update(buffers)
+     if debug:
+         print(f"added {len(buffers)} buffers")
+
+     _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+     _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+     # recover shared parameters
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+     ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+     via a model hub.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+
+     Returns:
+         - pytorch ``state_dict``
+
+     Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+     you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+     the checkpoint.
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+         # do the training and checkpoint saving
+         state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+         model = model.cpu() # move to cpu
+         model.load_state_dict(state_dict)
+         # submit to model hub or save the model to share with others
+
+     In this example the ``model`` will no longer be usable in the deepspeed context of the same
+     application. i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+     """
+     if tag is None:
+         latest_path = os.path.join(checkpoint_dir, 'latest')
+         if os.path.isfile(latest_path):
+             with open(latest_path, 'r') as fd:
+                 tag = fd.read().strip()
+         else:
+             raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+     ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+     if not os.path.isdir(ds_checkpoint_dir):
+         raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+     return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+
+
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+     loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+     """
+
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+     print(f"Saving fp32 state dict to {output_file}")
+     torch.save(state_dict, output_file)
+
+
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+     """
+     1. Put the provided model to cpu
+     2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+     3. Load it into the provided model
+
+     Args:
+         - ``model``: the model object to update
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+     Returns:
+         - ``model`: modified model
+
+     Make sure you have plenty of CPU memory available before you call this function. If you don't
+     have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+     conveniently placed for you in the checkpoint folder.
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+         model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+         # submit to model hub or save the model to share with others
+
+     Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+     of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     """
+     logger.info(f"Extracting fp32 weights")
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+     logger.info(f"Overwriting model with fp32 weights")
+     model = model.cpu()
+     model.load_state_dict(state_dict, strict=False)
+
+     return model
+
+
+ if __name__ == "__main__":
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("checkpoint_dir",
+                         type=str,
+                         help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+     parser.add_argument(
+         "output_file",
+         type=str,
+         help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+     parser.add_argument("-t",
+                         "--tag",
+                         type=str,
+                         default=None,
+                         help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+     args = parser.parse_args()
+
+     debug = args.debug
+
+     convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
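
Note: because the commit ships both the latest tag file (global_step609) and this script, the fp32 export in pytorch_model.bin can be reproduced offline. A hedged usage sketch, assuming the DeepSpeed checkpoint directory from this run is present locally:

    # Run from the output directory that contains 'latest' and global_step609/.
    from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

    # tag=None -> the script reads 'latest' and resolves it to 'global_step609'.
    convert_zero_checkpoint_to_fp32_state_dict(".", "pytorch_model.bin", tag=None)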