diff --git "a/logs/megatron-1b-code-161653.out" "b/logs/megatron-1b-code-161653.out" new file mode 100644--- /dev/null +++ "b/logs/megatron-1b-code-161653.out" @@ -0,0 +1,9945 @@ ++ source /admin/home/loubna/.bashrc +++ HISTCONTROL=ignoreboth +++ shopt -s histappend +++ HISTSIZE=1000 +++ HISTFILESIZE=2000 +++ shopt -s checkwinsize +++ '[' -x /usr/bin/lesspipe ']' ++++ SHELL=/bin/sh ++++ lesspipe +++ eval 'export LESSOPEN="| /usr/bin/lesspipe %s"; +export LESSCLOSE="/usr/bin/lesspipe %s %s";' ++++ export 'LESSOPEN=| /usr/bin/lesspipe %s' ++++ LESSOPEN='| /usr/bin/lesspipe %s' ++++ export 'LESSCLOSE=/usr/bin/lesspipe %s %s' ++++ LESSCLOSE='/usr/bin/lesspipe %s %s' +++ '[' -z '' ']' +++ '[' -r /etc/debian_chroot ']' +++ case "$TERM" in +++ color_prompt=yes +++ '[' -n '' ']' +++ '[' yes = yes ']' +++ PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++ unset color_prompt force_color_prompt +++ case "$TERM" in +++ PS1='\[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++ '[' -x /usr/bin/dircolors ']' +++ test -r /admin/home/loubna/.dircolors ++++ dircolors -b +++ eval 'LS_COLORS='\''rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:'\''; +export LS_COLORS' ++++ 
LS_COLORS='rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:' ++++ export LS_COLORS +++ alias 'ls=ls --color=auto' +++ alias 'grep=grep --color=auto' +++ alias 'fgrep=fgrep --color=auto' +++ alias 'egrep=egrep --color=auto' +++ alias 'll=ls -alF' +++ alias 'la=ls -A' +++ alias 'l=ls -CF' +++ alias 'alert=notify-send --urgency=low -i "$([ $? = 0 ] && echo terminal || echo error)" "$(history|tail -n1|sed -e '\''s/^\s*[0-9]\+\s*//;s/[;&|]\s*alert$//'\'')"' +++ '[' -f /admin/home/loubna/.bash_aliases ']' +++ shopt -oq posix +++ '[' -f /usr/share/bash-completion/bash_completion ']' +++ . 
/usr/share/bash-completion/bash_completion ++++ BASH_COMPLETION_VERSINFO=(2 10) ++++ [[ ehxB == *v* ]] ++++ BASH_COMPLETION_ORIGINAL_V_VALUE=+v ++++ [[ -n '' ]] ++++ set +v ++++ _blacklist_glob='@(acroread.sh)' ++++ shopt -s extglob progcomp ++++ complete -u groups slay w sux ++++ complete -A stopped -P '"%' -S '"' bg ++++ complete -j -P '"%' -S '"' fg jobs disown ++++ complete -v readonly unset ++++ complete -A setopt set ++++ complete -A shopt shopt ++++ complete -A helptopic help ++++ complete -a unalias ++++ complete -c command type which ++++ complete -b builtin ++++ [[ linux-gnu == *@(solaris|aix)* ]] ++++ [[ linux-gnu == *@(solaris|aix)* ]] ++++ [[ linux-gnu == *@(solaris|aix)* ]] ++++ _backup_glob='@(#*#|*@(~|.@(bak|orig|rej|swp|dpkg*|rpm@(orig|new|save))))' ++++ complete -F _service service ++++ _sysvdirs ++++ sysvdirs=() ++++ [[ -d /etc/rc.d/init.d ]] ++++ [[ -d /etc/init.d ]] ++++ sysvdirs+=(/etc/init.d) ++++ [[ -f /etc/slackware-version ]] ++++ return 0 ++++ for svcdir in "${sysvdirs[@]}" ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/acpid ]] ++++ complete -F _service /etc/init.d/acpid ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/apparmor ]] ++++ complete -F _service /etc/init.d/apparmor ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/apport ]] ++++ complete -F _service /etc/init.d/apport ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/atd ]] ++++ complete -F _service /etc/init.d/atd ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/chrony ]] ++++ complete -F _service /etc/init.d/chrony ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/console-setup.sh ]] ++++ complete -F _service /etc/init.d/console-setup.sh ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/cron ]] ++++ complete -F _service /etc/init.d/cron ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/cryptdisks ]] ++++ complete -F _service /etc/init.d/cryptdisks ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/cryptdisks-early ]] ++++ complete -F _service /etc/init.d/cryptdisks-early ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/datadog-agent ]] ++++ complete -F _service /etc/init.d/datadog-agent ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/datadog-agent-process ]] ++++ complete -F _service /etc/init.d/datadog-agent-process ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/datadog-agent-security ]] ++++ complete -F _service /etc/init.d/datadog-agent-security ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/datadog-agent-trace ]] ++++ complete -F _service /etc/init.d/datadog-agent-trace ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/dbus ]] ++++ complete -F _service /etc/init.d/dbus ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/docker ]] ++++ complete -F _service /etc/init.d/docker ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/gdrdrv ]] ++++ complete -F _service /etc/init.d/gdrdrv ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/grub-common ]] ++++ complete -F _service /etc/init.d/grub-common ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/hwclock.sh ]] ++++ complete -F _service /etc/init.d/hwclock.sh ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/irqbalance ]] ++++ complete -F _service /etc/init.d/irqbalance ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/iscsid ]] ++++ complete -F _service 
/etc/init.d/iscsid ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/iwpmd ]] ++++ complete -F _service /etc/init.d/iwpmd ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/keyboard-setup.sh ]] ++++ complete -F _service /etc/init.d/keyboard-setup.sh ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/kmod ]] ++++ complete -F _service /etc/init.d/kmod ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/lvm2 ]] ++++ complete -F _service /etc/init.d/lvm2 ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/lvm2-lvmpolld ]] ++++ complete -F _service /etc/init.d/lvm2-lvmpolld ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/multipath-tools ]] ++++ complete -F _service /etc/init.d/multipath-tools ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/munge ]] ++++ complete -F _service /etc/init.d/munge ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/nfs-common ]] ++++ complete -F _service /etc/init.d/nfs-common ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/open-iscsi ]] ++++ complete -F _service /etc/init.d/open-iscsi ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/open-vm-tools ]] ++++ complete -F _service /etc/init.d/open-vm-tools ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/plymouth ]] ++++ complete -F _service /etc/init.d/plymouth ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/plymouth-log ]] ++++ complete -F _service /etc/init.d/plymouth-log ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/procps ]] ++++ complete -F _service /etc/init.d/procps ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/rpcbind ]] ++++ complete -F _service /etc/init.d/rpcbind ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/rsync ]] ++++ complete -F _service /etc/init.d/rsync ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/rsyslog ]] ++++ complete -F _service /etc/init.d/rsyslog ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/screen-cleanup ]] ++++ complete -F _service /etc/init.d/screen-cleanup ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/sendmail ]] ++++ complete -F _service /etc/init.d/sendmail ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/ssh ]] ++++ complete -F _service /etc/init.d/ssh ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/sysstat ]] ++++ complete -F _service /etc/init.d/sysstat ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/udev ]] ++++ complete -F _service /etc/init.d/udev ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/ufw ]] ++++ complete -F _service /etc/init.d/ufw ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/unattended-upgrades ]] ++++ complete -F _service /etc/init.d/unattended-upgrades ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/uuidd ]] ++++ complete -F _service /etc/init.d/uuidd ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/x11-common ]] ++++ complete -F _service /etc/init.d/x11-common ++++ unset svc svcdir sysvdirs ++++ [[ linux-gnu == *freebsd* ]] ++++ shopt -u hostcomplete ++++ complete -F _user_at_host talk ytalk finger ++++ complete -F _known_hosts traceroute traceroute6 fping fping6 telnet rsh rlogin ftp dig mtr ssh-installkeys showmount ++++ shopt -q cdable_vars ++++ complete -F _cd -o nospace cd pushd ++++ complete -F _command aoss command do else eval exec ltrace nice nohup padsp then time tsocks 
vsound xargs ++++ complete -F _root_command fakeroot gksu gksudo kdesudo really ++++ complete -F _longopt a2ps awk base64 bash bc bison cat chroot colordiff cp csplit cut date df diff dir du enscript env expand fmt fold gperf grep grub head irb ld ldd less ln ls m4 md5sum mkdir mkfifo mknod mv netstat nl nm objcopy objdump od paste pr ptx readelf rm rmdir sed seq shasum sha1sum sha224sum sha256sum sha384sum sha512sum shar sort split strip sum tac tail tee texindex touch tr uname unexpand uniq units vdir wc who ++++ [[ 5 -gt 4 ]] ++++ declare -Ag _xspecs ++++ _install_xspec '!*.?(t)bz?(2)' bunzip2 bzcat pbunzip2 pbzcat lbunzip2 lbzcat ++++ local 'xspec=!*.?(t)bz?(2)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ _install_xspec '!*.@(zip|[egjswx]ar|exe|pk3|wsz|zargo|xpi|s[tx][cdiw]|sx[gm]|o[dt][tspgfc]|od[bm]|oxt|epub|apk|aab|ipa|do[ct][xm]|p[op]t[mx]|xl[st][xm]|pyz|whl)' unzip zipinfo ++++ local 'xspec=!*.@(zip|[egjswx]ar|exe|pk3|wsz|zargo|xpi|s[tx][cdiw]|sx[gm]|o[dt][tspgfc]|od[bm]|oxt|epub|apk|aab|ipa|do[ct][xm]|p[op]t[mx]|xl[st][xm]|pyz|whl)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(zip|[egjswx]ar|exe|pk3|wsz|zargo|xpi|s[tx][cdiw]|sx[gm]|o[dt][tspgfc]|od[bm]|oxt|epub|apk|aab|ipa|do[ct][xm]|p[op]t[mx]|xl[st][xm]|pyz|whl)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(zip|[egjswx]ar|exe|pk3|wsz|zargo|xpi|s[tx][cdiw]|sx[gm]|o[dt][tspgfc]|od[bm]|oxt|epub|apk|aab|ipa|do[ct][xm]|p[op]t[mx]|xl[st][xm]|pyz|whl)' ++++ _install_xspec '*.Z' compress znew ++++ local 'xspec=*.Z' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.Z' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.Z' ++++ _install_xspec '!*.@(Z|[gGd]z|t[ag]z)' gunzip zcat ++++ local 'xspec=!*.@(Z|[gGd]z|t[ag]z)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(Z|[gGd]z|t[ag]z)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(Z|[gGd]z|t[ag]z)' ++++ _install_xspec '!*.@(Z|[gGdz]z|t[ag]z)' unpigz ++++ local 'xspec=!*.@(Z|[gGdz]z|t[ag]z)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(Z|[gGdz]z|t[ag]z)' ++++ _install_xspec '!*.Z' uncompress ++++ local 'xspec=!*.Z' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.Z' ++++ _install_xspec '!*.@(tlz|lzma)' lzcat lzegrep lzfgrep lzgrep lzless lzmore unlzma ++++ local 'xspec=!*.@(tlz|lzma)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ _install_xspec '!*.@(?(t)xz|tlz|lzma)' unxz xzcat ++++ local 'xspec=!*.@(?(t)xz|tlz|lzma)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(t)xz|tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(t)xz|tlz|lzma)' ++++ _install_xspec '!*.lrz' lrunzip ++++ local 'xspec=!*.lrz' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.lrz' ++++ _install_xspec '!*.@(gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx)' ee ++++ local 'xspec=!*.@(gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx)' 
cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx)' ++++ _install_xspec '!*.@(gif|jp?(e)g|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|svg)' qiv ++++ local 'xspec=!*.@(gif|jp?(e)g|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|svg)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(gif|jp?(e)g|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|svg)' ++++ _install_xspec '!*.@(gif|jp?(e)g?(2)|j2[ck]|jp[2f]|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|?(e)ps)' xv ++++ local 'xspec=!*.@(gif|jp?(e)g?(2)|j2[ck]|jp[2f]|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|?(e)ps)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(gif|jp?(e)g?(2)|j2[ck]|jp[2f]|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|?(e)ps)' ++++ _install_xspec '!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' gv ggv kghostview ++++ local 'xspec=!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' ++++ _install_xspec '!*.@(dvi|DVI)?(.@(gz|Z|bz2))' xdvi kdvi ++++ local 'xspec=!*.@(dvi|DVI)?(.@(gz|Z|bz2))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(dvi|DVI)?(.@(gz|Z|bz2))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(dvi|DVI)?(.@(gz|Z|bz2))' ++++ _install_xspec '!*.dvi' dvips dviselect dvitype dvipdf advi dvipdfm dvipdfmx ++++ local 'xspec=!*.dvi' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ _install_xspec '!*.[pf]df' acroread gpdf ++++ local 'xspec=!*.[pf]df' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.[pf]df' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.[pf]df' ++++ _install_xspec '!*.@(pdf|fdf)?(.@(gz|GZ|bz2|BZ2|Z))' xpdf ++++ local 'xspec=!*.@(pdf|fdf)?(.@(gz|GZ|bz2|BZ2|Z))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(pdf|fdf)?(.@(gz|GZ|bz2|BZ2|Z))' ++++ _install_xspec '!*.@(?(e)ps|pdf)' kpdf ++++ local 'xspec=!*.@(?(e)ps|pdf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ _install_xspec '!*.@(okular|@(?(e|x)ps|?(E|X)PS|[pf]df|[PF]DF|dvi|DVI|cb[rz]|CB[RZ]|djv?(u)|DJV?(U)|dvi|DVI|gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx|GIF|JP?(E)G|MIFF|TIF?(F)|PN[GM]|P[BGP]M|BMP|XPM|ICO|XWD|TGA|PCX|epub|EPUB|odt|ODT|fb?(2)|FB?(2)|mobi|MOBI|g3|G3|chm|CHM)?(.?(gz|GZ|bz2|BZ2|xz|XZ)))' okular ++++ local 'xspec=!*.@(okular|@(?(e|x)ps|?(E|X)PS|[pf]df|[PF]DF|dvi|DVI|cb[rz]|CB[RZ]|djv?(u)|DJV?(U)|dvi|DVI|gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx|GIF|JP?(E)G|MIFF|TIF?(F)|PN[GM]|P[BGP]M|BMP|XPM|ICO|XWD|TGA|PCX|epub|EPUB|odt|ODT|fb?(2)|FB?(2)|mobi|MOBI|g3|G3|chm|CHM)?(.?(gz|GZ|bz2|BZ2|xz|XZ)))' cmd ++++ shift ++++ for cmd in "$@" ++++ 
_xspecs[$cmd]='!*.@(okular|@(?(e|x)ps|?(E|X)PS|[pf]df|[PF]DF|dvi|DVI|cb[rz]|CB[RZ]|djv?(u)|DJV?(U)|dvi|DVI|gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx|GIF|JP?(E)G|MIFF|TIF?(F)|PN[GM]|P[BGP]M|BMP|XPM|ICO|XWD|TGA|PCX|epub|EPUB|odt|ODT|fb?(2)|FB?(2)|mobi|MOBI|g3|G3|chm|CHM)?(.?(gz|GZ|bz2|BZ2|xz|XZ)))' ++++ _install_xspec '!*.pdf' epdfview pdfunite ++++ local 'xspec=!*.pdf' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.pdf' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.pdf' ++++ _install_xspec '!*.@(cb[rz7t]|djv?(u)|?(e)ps|pdf)' zathura ++++ local 'xspec=!*.@(cb[rz7t]|djv?(u)|?(e)ps|pdf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(cb[rz7t]|djv?(u)|?(e)ps|pdf)' ++++ _install_xspec '!*.@(?(e)ps|pdf)' ps2pdf ps2pdf12 ps2pdf13 ps2pdf14 ps2pdfwr ++++ local 'xspec=!*.@(?(e)ps|pdf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ _install_xspec '!*.texi*' makeinfo texi2html ++++ local 'xspec=!*.texi*' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.texi*' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.texi*' ++++ _install_xspec '!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' tex latex slitex jadetex pdfjadetex pdftex pdflatex texi2dvi xetex xelatex luatex lualatex ++++ local 'xspec=!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ _install_xspec '!*.mp3' mpg123 mpg321 madplay ++++ local 'xspec=!*.mp3' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.mp3' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.mp3' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.mp3' ++++ _install_xspec '!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' xine aaxine fbxine ++++ local 'xspec=!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' cmd ++++ 
shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ _install_xspec '!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM|iso|ISO)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' kaffeine dragon ++++ local 'xspec=!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM|iso|ISO)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM|iso|ISO)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM|iso|ISO)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ _install_xspec '!*.@(avi|asf|wmv)' aviplay ++++ local 'xspec=!*.@(avi|asf|wmv)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(avi|asf|wmv)' ++++ _install_xspec '!*.@(rm?(j)|ra?(m)|smi?(l))' realplay ++++ local 'xspec=!*.@(rm?(j)|ra?(m)|smi?(l))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(rm?(j)|ra?(m)|smi?(l))' ++++ _install_xspec '!*.@(mpg|mpeg|avi|mov|qt)' xanim ++++ local 'xspec=!*.@(mpg|mpeg|avi|mov|qt)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mpg|mpeg|avi|mov|qt)' ++++ _install_xspec '!*.@(og[ag]|m3u|flac|spx)' ogg123 ++++ local 
'xspec=!*.@(og[ag]|m3u|flac|spx)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(og[ag]|m3u|flac|spx)' ++++ _install_xspec '!*.@(mp3|og[ag]|pls|m3u)' gqmpeg freeamp ++++ local 'xspec=!*.@(mp3|og[ag]|pls|m3u)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mp3|og[ag]|pls|m3u)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mp3|og[ag]|pls|m3u)' ++++ _install_xspec '!*.fig' xfig ++++ local 'xspec=!*.fig' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.fig' ++++ _install_xspec '!*.@(mid?(i)|cmf)' playmidi ++++ local 'xspec=!*.@(mid?(i)|cmf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mid?(i)|cmf)' ++++ _install_xspec '!*.@(mid?(i)|rmi|rcp|[gr]36|g18|mod|xm|it|x3m|s[3t]m|kar)' timidity ++++ local 'xspec=!*.@(mid?(i)|rmi|rcp|[gr]36|g18|mod|xm|it|x3m|s[3t]m|kar)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mid?(i)|rmi|rcp|[gr]36|g18|mod|xm|it|x3m|s[3t]m|kar)' ++++ _install_xspec '!*.@(669|abc|am[fs]|d[bs]m|dmf|far|it|mdl|m[eo]d|mid?(i)|mt[2m]|oct|okt?(a)|p[st]m|s[3t]m|ult|umx|wav|xm)' modplugplay modplug123 ++++ local 'xspec=!*.@(669|abc|am[fs]|d[bs]m|dmf|far|it|mdl|m[eo]d|mid?(i)|mt[2m]|oct|okt?(a)|p[st]m|s[3t]m|ult|umx|wav|xm)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(669|abc|am[fs]|d[bs]m|dmf|far|it|mdl|m[eo]d|mid?(i)|mt[2m]|oct|okt?(a)|p[st]m|s[3t]m|ult|umx|wav|xm)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(669|abc|am[fs]|d[bs]m|dmf|far|it|mdl|m[eo]d|mid?(i)|mt[2m]|oct|okt?(a)|p[st]m|s[3t]m|ult|umx|wav|xm)' ++++ _install_xspec '*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' vi vim gvim rvim view rview rgvim rgview gview emacs xemacs sxemacs kate kwrite ++++ local 'xspec=*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ _install_xspec '!*.@(zip|z|gz|tgz)' bzme ++++ local 
'xspec=!*.@(zip|z|gz|tgz)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(zip|z|gz|tgz)' ++++ _install_xspec '!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' netscape mozilla lynx galeon dillo elinks amaya epiphany ++++ local 'xspec=!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ _install_xspec '!*.@(sxw|stw|sxg|sgl|doc?([mx])|dot?([mx])|rtf|txt|htm|html|?(f)odt|ott|odm|pdf)' oowriter lowriter ++++ local 'xspec=!*.@(sxw|stw|sxg|sgl|doc?([mx])|dot?([mx])|rtf|txt|htm|html|?(f)odt|ott|odm|pdf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxw|stw|sxg|sgl|doc?([mx])|dot?([mx])|rtf|txt|htm|html|?(f)odt|ott|odm|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxw|stw|sxg|sgl|doc?([mx])|dot?([mx])|rtf|txt|htm|html|?(f)odt|ott|odm|pdf)' ++++ _install_xspec '!*.@(sxi|sti|pps?(x)|ppt?([mx])|pot?([mx])|?(f)odp|otp)' ooimpress loimpress ++++ local 'xspec=!*.@(sxi|sti|pps?(x)|ppt?([mx])|pot?([mx])|?(f)odp|otp)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxi|sti|pps?(x)|ppt?([mx])|pot?([mx])|?(f)odp|otp)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxi|sti|pps?(x)|ppt?([mx])|pot?([mx])|?(f)odp|otp)' ++++ _install_xspec '!*.@(sxc|stc|xls?([bmx])|xlw|xlt?([mx])|[ct]sv|?(f)ods|ots)' oocalc localc ++++ local 'xspec=!*.@(sxc|stc|xls?([bmx])|xlw|xlt?([mx])|[ct]sv|?(f)ods|ots)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxc|stc|xls?([bmx])|xlw|xlt?([mx])|[ct]sv|?(f)ods|ots)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxc|stc|xls?([bmx])|xlw|xlt?([mx])|[ct]sv|?(f)ods|ots)' ++++ _install_xspec '!*.@(sxd|std|sda|sdd|?(f)odg|otg)' oodraw lodraw ++++ local 'xspec=!*.@(sxd|std|sda|sdd|?(f)odg|otg)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxd|std|sda|sdd|?(f)odg|otg)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxd|std|sda|sdd|?(f)odg|otg)' ++++ _install_xspec '!*.@(sxm|smf|mml|odf)' oomath lomath ++++ local 'xspec=!*.@(sxm|smf|mml|odf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxm|smf|mml|odf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxm|smf|mml|odf)' ++++ _install_xspec '!*.odb' oobase lobase ++++ local 'xspec=!*.odb' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.odb' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.odb' ++++ _install_xspec '!*.[rs]pm' rpm2cpio ++++ local 'xspec=!*.[rs]pm' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.[rs]pm' ++++ _install_xspec '!*.aux' bibtex ++++ local 'xspec=!*.aux' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.aux' ++++ _install_xspec '!*.po' poedit gtranslator kbabel lokalize ++++ local 'xspec=!*.po' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.po' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.po' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.po' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.po' ++++ _install_xspec '!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' harbour gharbour hbpp ++++ local 
'xspec=!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' ++++ _install_xspec '!*.[Hh][Rr][Bb]' hbrun ++++ local 'xspec=!*.[Hh][Rr][Bb]' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.[Hh][Rr][Bb]' ++++ _install_xspec '!*.ly' lilypond ly2dvi ++++ local 'xspec=!*.ly' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.ly' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.ly' ++++ _install_xspec '!*.@(dif?(f)|?(d)patch)?(.@([gx]z|bz2|lzma))' cdiff ++++ local 'xspec=!*.@(dif?(f)|?(d)patch)?(.@([gx]z|bz2|lzma))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(dif?(f)|?(d)patch)?(.@([gx]z|bz2|lzma))' ++++ _install_xspec '!@(*.@(ks|jks|jceks|p12|pfx|bks|ubr|gkr|cer|crt|cert|p7b|pkipath|pem|p10|csr|crl)|cacerts)' portecle ++++ local 'xspec=!@(*.@(ks|jks|jceks|p12|pfx|bks|ubr|gkr|cer|crt|cert|p7b|pkipath|pem|p10|csr|crl)|cacerts)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!@(*.@(ks|jks|jceks|p12|pfx|bks|ubr|gkr|cer|crt|cert|p7b|pkipath|pem|p10|csr|crl)|cacerts)' ++++ _install_xspec '!*.@(mp[234c]|og[ag]|@(fl|a)ac|m4[abp]|spx|tta|w?(a)v|wma|aif?(f)|asf|ape)' kid3 kid3-qt ++++ local 'xspec=!*.@(mp[234c]|og[ag]|@(fl|a)ac|m4[abp]|spx|tta|w?(a)v|wma|aif?(f)|asf|ape)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mp[234c]|og[ag]|@(fl|a)ac|m4[abp]|spx|tta|w?(a)v|wma|aif?(f)|asf|ape)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mp[234c]|og[ag]|@(fl|a)ac|m4[abp]|spx|tta|w?(a)v|wma|aif?(f)|asf|ape)' ++++ unset -f _install_xspec ++++ complete -F _minimal '' ++++ complete -D -F _completion_loader ++++ compat_dir=/etc/bash_completion.d ++++ [[ -d /etc/bash_completion.d ]] ++++ [[ -r /etc/bash_completion.d ]] ++++ [[ -x /etc/bash_completion.d ]] ++++ for i in "$compat_dir"/* ++++ [[ apport_completion != @(@(#*#|*@(~|.@(bak|orig|rej|swp|dpkg*|rpm@(orig|new|save))))|Makefile*|@(acroread.sh)) ]] ++++ [[ -f /etc/bash_completion.d/apport_completion ]] ++++ [[ -r /etc/bash_completion.d/apport_completion ]] ++++ . /etc/bash_completion.d/apport_completion +++++ complete -F _apport-bug -o filenames -o dirnames ubuntu-bug +++++ complete -F _apport-bug -o filenames -o dirnames apport-bug +++++ complete -F _apport-cli -o filenames -o dirnames apport-cli +++++ complete -F _apport-unpack -o filenames -o dirnames apport-unpack +++++ complete -F _apport-collect apport-collect ++++ for i in "$compat_dir"/* ++++ [[ git-prompt != @(@(#*#|*@(~|.@(bak|orig|rej|swp|dpkg*|rpm@(orig|new|save))))|Makefile*|@(acroread.sh)) ]] ++++ [[ -f /etc/bash_completion.d/git-prompt ]] ++++ [[ -r /etc/bash_completion.d/git-prompt ]] ++++ . /etc/bash_completion.d/git-prompt +++++ [[ -e /usr/lib/git-core/git-sh-prompt ]] +++++ . 
/usr/lib/git-core/git-sh-prompt ++++++ __git_printf_supports_v= ++++++ printf -v __git_printf_supports_v -- %s yes ++++ unset compat_dir i _blacklist_glob ++++ user_completion=/admin/home/loubna/.bash_completion ++++ [[ /usr/share/bash-completion/bash_completion != /admin/home/loubna/.bash_completion ]] ++++ [[ -r /admin/home/loubna/.bash_completion ]] ++++ unset user_completion ++++ unset -f have ++++ unset have ++++ set +v ++++ unset BASH_COMPLETION_ORIGINAL_V_VALUE +++ export PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/eval-harness/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++ PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/eval-harness/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin ++++ /fsx/loubna/miniconda3/bin/conda shell.bash hook +++ __conda_setup='export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\'' + +# Copyright (C) 2012 Anaconda, Inc +# SPDX-License-Identifier: BSD-3-Clause + +__conda_exe() ( + "$CONDA_EXE" $_CE_M $_CE_CONDA "$@" +) + +__conda_hashr() { + if [ -n "${ZSH_VERSION:+x}" ]; then + \rehash + elif [ -n "${POSH_VERSION:+x}" ]; then + : # pass + else + \hash -r + fi +} + +__conda_activate() { + if [ -n "${CONDA_PS1_BACKUP:+x}" ]; then + # Handle transition from shell activated with conda <= 4.3 to a subsequent activation + # after conda updated to >= 4.4. See issue #6173. + PS1="$CONDA_PS1_BACKUP" + \unset CONDA_PS1_BACKUP + fi + \local ask_conda + ask_conda="$(PS1="${PS1:-}" __conda_exe shell.posix "$@")" || \return + \eval "$ask_conda" + __conda_hashr +} + +__conda_reactivate() { + \local ask_conda + ask_conda="$(PS1="${PS1:-}" __conda_exe shell.posix reactivate)" || \return + \eval "$ask_conda" + __conda_hashr +} + +conda() { + \local cmd="${1-__missing__}" + case "$cmd" in + activate|deactivate) + __conda_activate "$@" + ;; + install|update|upgrade|remove|uninstall) + __conda_exe "$@" || \return + __conda_reactivate + ;; + *) + __conda_exe "$@" + ;; + esac +} + +if [ -z "${CONDA_SHLVL+x}" ]; then + \export CONDA_SHLVL=0 + # In dev-mode CONDA_EXE is python.exe and on Windows + # it is in a different relative location to condabin. + if [ -n "${_CE_CONDA:+x}" ] && [ -n "${WINDIR+x}" ]; then + PATH="$(\dirname "$CONDA_EXE")/condabin${PATH:+":${PATH}"}" + else + PATH="$(\dirname "$(\dirname "$CONDA_EXE")")/condabin${PATH:+":${PATH}"}" + fi + \export PATH + + # We'\''re not allowing PS1 to be unbound. It must at least be set. + # However, we'\''re not exporting it, which can cause problems when starting a second shell + # via a first shell (i.e. starting zsh from bash). 
+ if [ -z "${PS1+x}" ]; then + PS1= + fi +fi + +conda activate base' +++ '[' 0 -eq 0 ']' +++ eval 'export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\'' + +# Copyright (C) 2012 Anaconda, Inc +# SPDX-License-Identifier: BSD-3-Clause + +__conda_exe() ( + "$CONDA_EXE" $_CE_M $_CE_CONDA "$@" +) + +__conda_hashr() { + if [ -n "${ZSH_VERSION:+x}" ]; then + \rehash + elif [ -n "${POSH_VERSION:+x}" ]; then + : # pass + else + \hash -r + fi +} + +__conda_activate() { + if [ -n "${CONDA_PS1_BACKUP:+x}" ]; then + # Handle transition from shell activated with conda <= 4.3 to a subsequent activation + # after conda updated to >= 4.4. See issue #6173. + PS1="$CONDA_PS1_BACKUP" + \unset CONDA_PS1_BACKUP + fi + \local ask_conda + ask_conda="$(PS1="${PS1:-}" __conda_exe shell.posix "$@")" || \return + \eval "$ask_conda" + __conda_hashr +} + +__conda_reactivate() { + \local ask_conda + ask_conda="$(PS1="${PS1:-}" __conda_exe shell.posix reactivate)" || \return + \eval "$ask_conda" + __conda_hashr +} + +conda() { + \local cmd="${1-__missing__}" + case "$cmd" in + activate|deactivate) + __conda_activate "$@" + ;; + install|update|upgrade|remove|uninstall) + __conda_exe "$@" || \return + __conda_reactivate + ;; + *) + __conda_exe "$@" + ;; + esac +} + +if [ -z "${CONDA_SHLVL+x}" ]; then + \export CONDA_SHLVL=0 + # In dev-mode CONDA_EXE is python.exe and on Windows + # it is in a different relative location to condabin. + if [ -n "${_CE_CONDA:+x}" ] && [ -n "${WINDIR+x}" ]; then + PATH="$(\dirname "$CONDA_EXE")/condabin${PATH:+":${PATH}"}" + else + PATH="$(\dirname "$(\dirname "$CONDA_EXE")")/condabin${PATH:+":${PATH}"}" + fi + \export PATH + + # We'\''re not allowing PS1 to be unbound. It must at least be set. + # However, we'\''re not exporting it, which can cause problems when starting a second shell + # via a first shell (i.e. starting zsh from bash). 
+ if [ -z "${PS1+x}" ]; then + PS1= + fi +fi + +conda activate base' ++++ export CONDA_EXE=/fsx/loubna/miniconda3/bin/conda ++++ CONDA_EXE=/fsx/loubna/miniconda3/bin/conda ++++ export _CE_M= ++++ _CE_M= ++++ export _CE_CONDA= ++++ _CE_CONDA= ++++ export CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python ++++ CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python ++++ '[' -z x ']' ++++ conda activate base ++++ local cmd=activate ++++ case "$cmd" in ++++ __conda_activate activate base ++++ '[' -n '' ']' ++++ local ask_conda +++++ PS1='\[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++++ __conda_exe shell.posix activate base +++++ /fsx/loubna/miniconda3/bin/conda shell.posix activate base ++++ ask_conda='PS1='\''(base) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '\'' +export PATH='\''/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin'\'' +export CONDA_PREFIX='\''/fsx/loubna/miniconda3'\'' +export CONDA_SHLVL='\''3'\'' +export CONDA_DEFAULT_ENV='\''base'\'' +export CONDA_PROMPT_MODIFIER='\''(base) '\'' +export CONDA_PREFIX_2='\''/fsx/loubna/miniconda3/envs/eval-harness'\'' +export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\''' ++++ eval 'PS1='\''(base) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '\'' +export PATH='\''/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin'\'' +export CONDA_PREFIX='\''/fsx/loubna/miniconda3'\'' +export CONDA_SHLVL='\''3'\'' +export CONDA_DEFAULT_ENV='\''base'\'' +export CONDA_PROMPT_MODIFIER='\''(base) '\'' +export CONDA_PREFIX_2='\''/fsx/loubna/miniconda3/envs/eval-harness'\'' +export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\''' +++++ PS1='(base) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++++ export 
PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++++ PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++++ export CONDA_PREFIX=/fsx/loubna/miniconda3 +++++ CONDA_PREFIX=/fsx/loubna/miniconda3 +++++ export CONDA_SHLVL=3 +++++ CONDA_SHLVL=3 +++++ export CONDA_DEFAULT_ENV=base +++++ CONDA_DEFAULT_ENV=base +++++ export 'CONDA_PROMPT_MODIFIER=(base) ' +++++ CONDA_PROMPT_MODIFIER='(base) ' +++++ export CONDA_PREFIX_2=/fsx/loubna/miniconda3/envs/eval-harness +++++ CONDA_PREFIX_2=/fsx/loubna/miniconda3/envs/eval-harness +++++ export CONDA_EXE=/fsx/loubna/miniconda3/bin/conda +++++ CONDA_EXE=/fsx/loubna/miniconda3/bin/conda +++++ export _CE_M= +++++ _CE_M= +++++ export _CE_CONDA= +++++ _CE_CONDA= +++++ export CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python +++++ CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python ++++ __conda_hashr ++++ '[' -n '' ']' ++++ '[' -n '' ']' ++++ hash -r +++ unset __conda_setup +++ export WANDB_CACHE_DIR=/fsx/loubna/.tmp/wandb +++ WANDB_CACHE_DIR=/fsx/loubna/.tmp/wandb +++ export TMPDIR=/fsx/loubna/.tmp +++ TMPDIR=/fsx/loubna/.tmp +++ export HUGGINGFACE_HUB_CACHE=/fsx/loubna/.cache +++ HUGGINGFACE_HUB_CACHE=/fsx/loubna/.cache +++ export HF_DATASETS_CACHE=/fsx/loubna/.cache +++ HF_DATASETS_CACHE=/fsx/loubna/.cache +++ '[' -f /fsx/loubna/google-cloud-sdk/path.bash.inc ']' +++ . /fsx/loubna/google-cloud-sdk/path.bash.inc +++++ command readlink /fsx/loubna/google-cloud-sdk/path.bash.inc +++++ readlink /fsx/loubna/google-cloud-sdk/path.bash.inc ++++ script_link= ++++ script_link=/fsx/loubna/google-cloud-sdk/path.bash.inc ++++ apparent_sdk_dir=/fsx/loubna/google-cloud-sdk ++++ '[' /fsx/loubna/google-cloud-sdk == /fsx/loubna/google-cloud-sdk/path.bash.inc ']' +++++ command cd -P /fsx/loubna/google-cloud-sdk +++++ cd -P /fsx/loubna/google-cloud-sdk +++++ command pwd -P +++++ pwd -P ++++ sdk_dir=/fsx/loubna/google-cloud-sdk ++++ bin_path=/fsx/loubna/google-cloud-sdk/bin ++++ [[ :/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin: != *\:\/\f\s\x\/\l\o\u\b\n\a\/\g\o\o\g\l\e\-\c\l\o\u\d\-\s\d\k\/\b\i\n\:* ]] +++ '[' -f /fsx/loubna/google-cloud-sdk/completion.bash.inc ']' +++ . 
/fsx/loubna/google-cloud-sdk/completion.bash.inc ++++ complete -o nospace -F _python_argcomplete gcloud ++++ unset bq_COMMANDS ++++ complete -F _bq_completer bq ++++ complete -o nospace -F _python_argcomplete gsutil ++ conda activate megatron ++ local cmd=activate ++ case "$cmd" in ++ __conda_activate activate megatron ++ '[' -n '' ']' ++ local ask_conda +++ PS1='(base) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++ __conda_exe shell.posix activate megatron +++ /fsx/loubna/miniconda3/bin/conda shell.posix activate megatron ++ ask_conda='PS1='\''(megatron) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '\'' +export PATH='\''/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/megatron/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin'\'' +export CONDA_PREFIX='\''/fsx/loubna/miniconda3/envs/megatron'\'' +export CONDA_SHLVL='\''4'\'' +export CONDA_DEFAULT_ENV='\''megatron'\'' +export CONDA_PROMPT_MODIFIER='\''(megatron) '\'' +export CONDA_PREFIX_3='\''/fsx/loubna/miniconda3'\'' +export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\''' ++ eval 'PS1='\''(megatron) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '\'' +export PATH='\''/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/megatron/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin'\'' +export CONDA_PREFIX='\''/fsx/loubna/miniconda3/envs/megatron'\'' +export CONDA_SHLVL='\''4'\'' +export CONDA_DEFAULT_ENV='\''megatron'\'' +export CONDA_PROMPT_MODIFIER='\''(megatron) '\'' +export CONDA_PREFIX_3='\''/fsx/loubna/miniconda3'\'' +export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\''' +++ PS1='(megatron) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++ export PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/megatron/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++ 
PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/megatron/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++ export CONDA_PREFIX=/fsx/loubna/miniconda3/envs/megatron +++ CONDA_PREFIX=/fsx/loubna/miniconda3/envs/megatron +++ export CONDA_SHLVL=4 +++ CONDA_SHLVL=4 +++ export CONDA_DEFAULT_ENV=megatron +++ CONDA_DEFAULT_ENV=megatron +++ export 'CONDA_PROMPT_MODIFIER=(megatron) ' +++ CONDA_PROMPT_MODIFIER='(megatron) ' +++ export CONDA_PREFIX_3=/fsx/loubna/miniconda3 +++ CONDA_PREFIX_3=/fsx/loubna/miniconda3 +++ export CONDA_EXE=/fsx/loubna/miniconda3/bin/conda +++ CONDA_EXE=/fsx/loubna/miniconda3/bin/conda +++ export _CE_M= +++ _CE_M= +++ export _CE_CONDA= +++ _CE_CONDA= +++ export CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python +++ CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python ++ __conda_hashr ++ '[' -n '' ']' ++ '[' -n '' ']' ++ hash -r +++ date ++ echo 'START TIME: Wed Jun 21 17:26:40 UTC 2023' +START TIME: Wed Jun 21 17:26:40 UTC 2023 ++ SCRIPT_REPO=/fsx/loubna/code/Megatron-LM ++ pushd /fsx/loubna/code/Megatron-LM +/fsx/loubna/code/Megatron-LM /fsx/loubna/code/fork/brrr/examples/gpt2_mqa/hub_logs ++ LOG_PATH=/fsx/loubna/code/Megatron-LM/main_log.txt ++ GPUS_PER_NODE=8 +++ head -n 1 +++ scontrol show hostnames 'ip-26-0-150-[19,31,54,70,122],ip-26-0-151-187,ip-26-0-155-[46,69]' ++ MASTER_ADDR=ip-26-0-150-19 ++ MASTER_PORT=6000 ++ NNODES=8 ++ NODE_RANK=0 ++ WORLD_SIZE=64 ++ CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/1b-starcoder ++ TOKENIZER_FILE=/fsx/loubna/starcoder-tokenizer/15b/tokenizer.json ++ WEIGHTS_TRAIN=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp ++ WEIGHTS_VALID=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp ++ mkdir -p /fsx/bigcode/experiments/pretraining/1b-starcoder/tensorboard ++ GPT_ARGS=' --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 --eval-interval 10000 --eval-iters 2 --valid-num-workers 0 ' ++ TENSORBOARD_ARGS='--tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug' ++ CMD=' /fsx/loubna/code/Megatron-LM/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 
--eval-interval 10000 --eval-iters 2 --valid-num-workers 0 --tokenizer-type TokenizerFromFile --tokenizer-file /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json --save /fsx/bigcode/experiments/pretraining/1b-starcoder --load /fsx/bigcode/experiments/pretraining/1b-starcoder --train-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp --valid-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp --structured-logs --structured-logs-dir /fsx/bigcode/experiments/pretraining/1b-starcoder/logs --tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug --wandb-entity-name loubnabnl --wandb-project-name 1b-model ' ++ export 'LAUNCHER=python -u -m torch.distributed.run --nproc_per_node 8 --nnodes 8 --rdzv_endpoint ip-26-0-150-19:6000 --rdzv_backend c10d --max_restarts 0 --tee 3 ' ++ LAUNCHER='python -u -m torch.distributed.run --nproc_per_node 8 --nnodes 8 --rdzv_endpoint ip-26-0-150-19:6000 --rdzv_backend c10d --max_restarts 0 --tee 3 ' ++ echo /fsx/loubna/code/Megatron-LM/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 --eval-interval 10000 --eval-iters 2 --valid-num-workers 0 --tokenizer-type TokenizerFromFile --tokenizer-file /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json --save /fsx/bigcode/experiments/pretraining/1b-starcoder --load /fsx/bigcode/experiments/pretraining/1b-starcoder --train-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp --valid-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp --structured-logs --structured-logs-dir /fsx/bigcode/experiments/pretraining/1b-starcoder/logs --tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug --wandb-entity-name loubnabnl --wandb-project-name 1b-model +/fsx/loubna/code/Megatron-LM/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 --eval-interval 10000 --eval-iters 2 --valid-num-workers 0 --tokenizer-type TokenizerFromFile --tokenizer-file /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json --save /fsx/bigcode/experiments/pretraining/1b-starcoder --load /fsx/bigcode/experiments/pretraining/1b-starcoder --train-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp --valid-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp --structured-logs --structured-logs-dir 
/fsx/bigcode/experiments/pretraining/1b-starcoder/logs --tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug --wandb-entity-name loubnabnl --wandb-project-name 1b-model ++ export NCCL_ASYNC_ERROR_HANDLING=1 ++ NCCL_ASYNC_ERROR_HANDLING=1 ++ export NCCL_PROTO=simple ++ NCCL_PROTO=simple ++ export RDMAV_FORK_SAFE=1 ++ RDMAV_FORK_SAFE=1 ++ export FI_EFA_FORK_SAFE=1 ++ FI_EFA_FORK_SAFE=1 ++ export FI_EFA_USE_DEVICE_RDMA=1 ++ FI_EFA_USE_DEVICE_RDMA=1 ++ export FI_PROVIDER=efa ++ FI_PROVIDER=efa ++ export FI_LOG_LEVEL=1 ++ FI_LOG_LEVEL=1 ++ export NCCL_IB_DISABLE=1 ++ NCCL_IB_DISABLE=1 ++ export NCCL_SOCKET_IFNAME=ens ++ NCCL_SOCKET_IFNAME=ens ++ export CUDA_HOME=/usr/local/cuda-11.6 ++ CUDA_HOME=/usr/local/cuda-11.6 ++ SRUN_ARGS=' --wait=60 --kill-on-bad-exit=1 ' ++ clear ++ srun --wait=60 --kill-on-bad-exit=1 --jobid 161653 bash -c 'python -u -m torch.distributed.run --nproc_per_node 8 --nnodes 8 --rdzv_endpoint ip-26-0-150-19:6000 --rdzv_backend c10d --max_restarts 0 --tee 3 --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: /fsx/loubna/code/Megatron-LM/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 --eval-interval 10000 --eval-iters 2 --valid-num-workers 0 --tokenizer-type TokenizerFromFile --tokenizer-file /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json --save /fsx/bigcode/experiments/pretraining/1b-starcoder --load /fsx/bigcode/experiments/pretraining/1b-starcoder --train-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp --valid-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp --structured-logs --structured-logs-dir /fsx/bigcode/experiments/pretraining/1b-starcoder/logs --tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug --wandb-entity-name loubnabnl --wandb-project-name 1b-model ' ++ tee /fsx/loubna/code/Megatron-LM/main_log.txt +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[ip-26-0-150-122:0]:using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +[ip-26-0-150-122:0]:WARNING: overriding default arguments for tokenizer_type:GPT2BPETokenizer with tokenizer_type:TokenizerFromFile +[ip-26-0-150-122:0]:accumulate and all-reduce gradients in fp32 for bfloat16 data type. +[ip-26-0-150-122:0]:using torch.bfloat16 for parameters ... +[ip-26-0-150-122:0]:------------------------ arguments ------------------------ +[ip-26-0-150-122:0]: accumulate_allreduce_grads_in_fp32 .............. True +[ip-26-0-150-122:0]: adam_beta1 ...................................... 0.9 +[ip-26-0-150-122:0]: adam_beta2 ...................................... 0.95 +[ip-26-0-150-122:0]: adam_eps ........................................ 1e-08 +[ip-26-0-150-122:0]: adlr_autoresume ................................. False +[ip-26-0-150-122:0]: adlr_autoresume_interval ........................ 1000 +[ip-26-0-150-122:0]: apply_query_key_layer_scaling ................... True +[ip-26-0-150-122:0]: apply_residual_connection_post_layernorm ........ False +[ip-26-0-150-122:0]: async_tensor_model_parallel_allreduce ........... True +[ip-26-0-150-122:0]: attention_dropout ............................... 0.1 +[ip-26-0-150-122:0]: attention_head_type ............................. multiquery +[ip-26-0-150-122:0]: attention_softmax_in_fp32 ....................... False +[ip-26-0-150-122:0]: bert_binary_head ................................ True +[ip-26-0-150-122:0]: bert_load ....................................... None +[ip-26-0-150-122:0]: bf16 ............................................ True +[ip-26-0-150-122:0]: bias_dropout_fusion ............................. True +[ip-26-0-150-122:0]: bias_gelu_fusion ................................ True +[ip-26-0-150-122:0]: biencoder_projection_dim ........................ 
0 +[ip-26-0-150-122:0]: biencoder_shared_query_context_model ............ False +[ip-26-0-150-122:0]: block_data_path ................................. None +[ip-26-0-150-122:0]: classes_fraction ................................ 1.0 +[ip-26-0-150-122:0]: clip_grad ....................................... 1.0 +[ip-26-0-150-122:0]: consumed_train_samples .......................... 0 +[ip-26-0-150-122:0]: consumed_valid_samples .......................... 0 +[ip-26-0-150-122:0]: data_impl ....................................... infer +[ip-26-0-150-122:0]: data_parallel_random_init ....................... False +[ip-26-0-150-122:0]: data_parallel_size .............................. 64 +[ip-26-0-150-122:0]: data_path ....................................... None +[ip-26-0-150-122:0]: data_per_class_fraction ......................... 1.0 +[ip-26-0-150-122:0]: data_sharding ................................... True +[ip-26-0-150-122:0]: dataloader_type ................................. single +[ip-26-0-150-122:0]: DDP_impl ........................................ local +[ip-26-0-150-122:0]: decoder_seq_length .............................. None +[ip-26-0-150-122:0]: dino_bottleneck_size ............................ 256 +[ip-26-0-150-122:0]: dino_freeze_last_layer .......................... 1 +[ip-26-0-150-122:0]: dino_head_hidden_size ........................... 2048 +[ip-26-0-150-122:0]: dino_local_crops_number ......................... 10 +[ip-26-0-150-122:0]: dino_local_img_size ............................. 96 +[ip-26-0-150-122:0]: dino_norm_last_layer ............................ False +[ip-26-0-150-122:0]: dino_teacher_temp ............................... 0.07 +[ip-26-0-150-122:0]: dino_warmup_teacher_temp ........................ 0.04 +[ip-26-0-150-122:0]: dino_warmup_teacher_temp_epochs ................. 30 +[ip-26-0-150-122:0]: distribute_saved_activations .................... False +[ip-26-0-150-122:0]: distributed_backend ............................. nccl +[ip-26-0-150-122:0]: distributed_timeout ............................. 600 +[ip-26-0-150-122:0]: embedding_path .................................. None +[ip-26-0-150-122:0]: empty_unused_memory_level ....................... 0 +[ip-26-0-150-122:0]: encoder_seq_length .............................. 8192 +[ip-26-0-150-122:0]: end_weight_decay ................................ 0.1 +[ip-26-0-150-122:0]: eod_mask_loss ................................... False +[ip-26-0-150-122:0]: eval_interval ................................... 10000 +[ip-26-0-150-122:0]: eval_iters ...................................... 2 +[ip-26-0-150-122:0]: evidence_data_path .............................. None +[ip-26-0-150-122:0]: exit_duration_in_mins ........................... None +[ip-26-0-150-122:0]: exit_interval ................................... None +[ip-26-0-150-122:0]: exit_signal_handler ............................. False +[ip-26-0-150-122:0]: ffn_hidden_size ................................. 8192 +[ip-26-0-150-122:0]: fim_rate ........................................ 0.0 +[ip-26-0-150-122:0]: fim_spm_rate .................................... 0.5 +[ip-26-0-150-122:0]: finetune ........................................ False +[ip-26-0-150-122:0]: finetune_from ................................... None +[ip-26-0-150-122:0]: fp16 ............................................ False +[ip-26-0-150-122:0]: fp16_lm_cross_entropy ........................... False +[ip-26-0-150-122:0]: fp32_residual_connection ........................ 
False +[ip-26-0-150-122:0]: global_batch_size ............................... 64 +[ip-26-0-150-122:0]: glu_activation .................................. None +[ip-26-0-150-122:0]: gradient_accumulation_fusion .................... True +[ip-26-0-150-122:0]: head_lr_mult .................................... 1.0 +[ip-26-0-150-122:0]: hidden_dropout .................................. 0.1 +[ip-26-0-150-122:0]: hidden_size ..................................... 2048 +[ip-26-0-150-122:0]: hysteresis ...................................... 2 +[ip-26-0-150-122:0]: ict_head_size ................................... None +[ip-26-0-150-122:0]: ict_load ........................................ None +[ip-26-0-150-122:0]: img_h ........................................... 224 +[ip-26-0-150-122:0]: img_w ........................................... 224 +[ip-26-0-150-122:0]: indexer_batch_size .............................. 128 +[ip-26-0-150-122:0]: indexer_log_interval ............................ 1000 +[ip-26-0-150-122:0]: inference_batch_times_seqlen_threshold .......... 512 +[ip-26-0-150-122:0]: init_method_std ................................. 0.02209 +[ip-26-0-150-122:0]: init_method_xavier_uniform ...................... False +[ip-26-0-150-122:0]: initial_loss_scale .............................. 4294967296 +[ip-26-0-150-122:0]: iter_per_epoch .................................. 1250 +[ip-26-0-150-122:0]: kv_channels ..................................... 128 +[ip-26-0-150-122:0]: layernorm_epsilon ............................... 1e-05 +[ip-26-0-150-122:0]: lazy_mpu_init ................................... None +[ip-26-0-150-122:0]: load ............................................ /fsx/bigcode/experiments/pretraining/1b-starcoder +[ip-26-0-150-122:0]: local_rank ...................................... 0 +[ip-26-0-150-122:0]: log_batch_size_to_tensorboard ................... False +[ip-26-0-150-122:0]: log_interval .................................... 10 +[ip-26-0-150-122:0]: log_learning_rate_to_tensorboard ................ True +[ip-26-0-150-122:0]: log_loss_scale_to_tensorboard ................... True +[ip-26-0-150-122:0]: log_memory_to_tensorboard ....................... False +[ip-26-0-150-122:0]: log_num_zeros_in_grad ........................... False +[ip-26-0-150-122:0]: log_params_norm ................................. False +[ip-26-0-150-122:0]: log_timers_to_tensorboard ....................... False +[ip-26-0-150-122:0]: log_validation_ppl_to_tensorboard ............... False +[ip-26-0-150-122:0]: log_world_size_to_tensorboard ................... False +[ip-26-0-150-122:0]: loss_scale ...................................... None +[ip-26-0-150-122:0]: loss_scale_window ............................... 1000 +[ip-26-0-150-122:0]: lr .............................................. 0.0003 +[ip-26-0-150-122:0]: lr_decay_iters .................................. 150000 +[ip-26-0-150-122:0]: lr_decay_samples ................................ None +[ip-26-0-150-122:0]: lr_decay_style .................................. cosine +[ip-26-0-150-122:0]: lr_warmup_fraction .............................. None +[ip-26-0-150-122:0]: lr_warmup_iters ................................. 2000 +[ip-26-0-150-122:0]: lr_warmup_samples ............................... 0 +[ip-26-0-150-122:0]: make_vocab_size_divisible_by .................... 128 +[ip-26-0-150-122:0]: mask_factor ..................................... 1.0 +[ip-26-0-150-122:0]: mask_prob ....................................... 
0.15 +[ip-26-0-150-122:0]: mask_type ....................................... random +[ip-26-0-150-122:0]: masked_softmax_fusion ........................... True +[ip-26-0-150-122:0]: max_position_embeddings ......................... 8192 +[ip-26-0-150-122:0]: merge_file ...................................... None +[ip-26-0-150-122:0]: micro_batch_size ................................ 1 +[ip-26-0-150-122:0]: min_loss_scale .................................. 1.0 +[ip-26-0-150-122:0]: min_lr .......................................... 3e-05 +[ip-26-0-150-122:0]: mmap_warmup ..................................... False +[ip-26-0-150-122:0]: no_load_optim ................................... None +[ip-26-0-150-122:0]: no_load_rng ..................................... None +[ip-26-0-150-122:0]: no_persist_layer_norm ........................... False +[ip-26-0-150-122:0]: no_save_optim ................................... None +[ip-26-0-150-122:0]: no_save_rng ..................................... None +[ip-26-0-150-122:0]: num_attention_heads ............................. 16 +[ip-26-0-150-122:0]: num_channels .................................... 3 +[ip-26-0-150-122:0]: num_classes ..................................... 1000 +[ip-26-0-150-122:0]: num_experts ..................................... None +[ip-26-0-150-122:0]: num_layers ...................................... 24 +[ip-26-0-150-122:0]: num_layers_per_virtual_pipeline_stage ........... None +[ip-26-0-150-122:0]: num_workers ..................................... 2 +[ip-26-0-150-122:0]: onnx_safe ....................................... None +[ip-26-0-150-122:0]: openai_gelu ..................................... False +[ip-26-0-150-122:0]: optimizer ....................................... adam +[ip-26-0-150-122:0]: override_opt_param_scheduler .................... False +[ip-26-0-150-122:0]: params_dtype .................................... torch.bfloat16 +[ip-26-0-150-122:0]: patch_dim ....................................... 16 +[ip-26-0-150-122:0]: perform_initialization .......................... True +[ip-26-0-150-122:0]: pipeline_model_parallel_size .................... 1 +[ip-26-0-150-122:0]: pipeline_model_parallel_split_rank .............. None +[ip-26-0-150-122:0]: position_embedding_type ......................... PositionEmbeddingType.absolute +[ip-26-0-150-122:0]: query_in_block_prob ............................. 0.1 +[ip-26-0-150-122:0]: rampup_batch_size ............................... None +[ip-26-0-150-122:0]: rank ............................................ 0 +[ip-26-0-150-122:0]: recompute_granularity ........................... None +[ip-26-0-150-122:0]: recompute_method ................................ None +[ip-26-0-150-122:0]: recompute_num_layers ............................ 1 +[ip-26-0-150-122:0]: reset_attention_mask ............................ False +[ip-26-0-150-122:0]: reset_position_ids .............................. False +[ip-26-0-150-122:0]: retriever_report_topk_accuracies ................ [] +[ip-26-0-150-122:0]: retriever_score_scaling ......................... False +[ip-26-0-150-122:0]: retriever_seq_length ............................ 256 +[ip-26-0-150-122:0]: sample_rate ..................................... 1.0 +[ip-26-0-150-122:0]: save ............................................ /fsx/bigcode/experiments/pretraining/1b-starcoder +[ip-26-0-150-122:0]: save_interval ................................... 10000 +[ip-26-0-150-122:0]: scatter_gather_tensors_in_pipeline .............. 
True +[ip-26-0-150-122:0]: seed ............................................ 1234 +[ip-26-0-150-122:0]: seq_length ...................................... 8192 +[ip-26-0-150-122:0]: sequence_parallel ............................... False +[ip-26-0-150-122:0]: sgd_momentum .................................... 0.9 +[ip-26-0-150-122:0]: short_seq_prob .................................. 0.1 +[ip-26-0-150-122:0]: split ........................................... None +[ip-26-0-150-122:0]: standalone_embedding_stage ...................... False +[ip-26-0-150-122:0]: start_weight_decay .............................. 0.1 +[ip-26-0-150-122:0]: structured_logs ................................. True +[ip-26-0-150-122:0]: structured_logs_dir ............................. /fsx/bigcode/experiments/pretraining/1b-starcoder/logs +[ip-26-0-150-122:0]: swin_backbone_type .............................. tiny +[ip-26-0-150-122:0]: tensor_model_parallel_size ...................... 1 +[ip-26-0-150-122:0]: tensorboard_dir ................................. /fsx/loubna/br4-experiments/tensorboard/debug +[ip-26-0-150-122:0]: tensorboard_log_interval ........................ 1 +[ip-26-0-150-122:0]: tensorboard_queue_size .......................... 1000 +[ip-26-0-150-122:0]: test_weighted_split_paths ....................... None +[ip-26-0-150-122:0]: test_weighted_split_paths_path .................. None +[ip-26-0-150-122:0]: titles_data_path ................................ None +[ip-26-0-150-122:0]: tokenizer_file .................................. /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json +[ip-26-0-150-122:0]: tokenizer_type .................................. TokenizerFromFile +[ip-26-0-150-122:0]: train_iters ..................................... 150000 +[ip-26-0-150-122:0]: train_samples ................................... None +[ip-26-0-150-122:0]: train_weighted_split_names ...................... ['TRAIN'] +[ip-26-0-150-122:0]: train_weighted_split_paths ...................... 
[['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document', 
'/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document', 
'/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document']] +[ip-26-0-150-122:0]: train_weighted_split_paths_path ................. None +[ip-26-0-150-122:0]: train_weighted_split_splits ..................... 
[['0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969']] +[ip-26-0-150-122:0]: train_weighted_split_weights .................... [['3.0', '0.01', '53.89', '1.78', '0.85', '5.68', '0.01', '1.31', '0.98', '0.08', '0.03', '0.09', '1.12', '23.78', '0.7', '0.61', '0.26', '1.68', '2.23', '0.3', '0.31', '0.45', '0.12', '6.81', '9.11', '0.06', '44.66', '0.58', '2.23', '0.01', '1.25', '1.03', '1.31', '2.87', '0.01', '0.05', '3.32', '0.03', '0.19', '0.39', '5.2', '0.02', '1.56', '0.01', '0.07', '0.41', '3.66', '0.56', '0.03', '0.001', '0.23', '0.02', '0.01', '4.69', '0.35', '0.33', '0.01', '3.09', '0.46', '0.2', '0.05', '0.04', '11.09', '0.4', '0.3', '0.42', '48.92', '0.64', '1.4', '0.71', '0.91', '29.36', '86.94', '64.71', '74.93', '60.89', '60.4', '26.52', '0.001', '1.42', '0.94', '0.01', '0.0002', '0.11', '0.18', '0.05', '1.0', '1.0', '54.4', '32.0', '7.12', '6.0']] +[ip-26-0-150-122:0]: transformer_pipeline_model_parallel_size ........ 1 +[ip-26-0-150-122:0]: transformer_timers .............................. False +[ip-26-0-150-122:0]: use_checkpoint_args ............................. False +[ip-26-0-150-122:0]: use_checkpoint_opt_param_scheduler .............. False +[ip-26-0-150-122:0]: use_contiguous_buffers_in_local_ddp ............. True +[ip-26-0-150-122:0]: use_cpu_initialization .......................... None +[ip-26-0-150-122:0]: use_distributed_optimizer ....................... False +[ip-26-0-150-122:0]: use_flash_attn .................................. True +[ip-26-0-150-122:0]: use_one_sent_docs ............................... False +[ip-26-0-150-122:0]: valid_num_workers ............................... 0 +[ip-26-0-150-122:0]: valid_weighted_split_names ...................... 
['VALID_css', 'VALID_prolog', 'VALID_c', 'VALID_fortran', 'VALID_solidity', 'VALID_kotlin', 'VALID_literate-agda', 'VALID_julia', 'VALID_java-server-pages', 'VALID_isabelle', 'VALID_idris', 'VALID_lean', 'VALID_powershell', 'VALID_go', 'VALID_erlang', 'VALID_f-sharp', 'VALID_ada', 'VALID_pascal', 'VALID_perl', 'VALID_r', 'VALID_protocol-buffer', 'VALID_cmake', 'VALID_sas', 'VALID_ruby', 'VALID_rust', 'VALID_rmarkdown', 'VALID_c-sharp', 'VALID_smalltalk', 'VALID_haskell', 'VALID_maple', 'VALID_mathematica', 'VALID_ocaml', 'VALID_makefile', 'VALID_lua', 'VALID_literate-coffeescript', 'VALID_literate-haskell', 'VALID_restructuredtext', 'VALID_racket', 'VALID_standard-ml', 'VALID_systemverilog', 'VALID_tex', 'VALID_awk', 'VALID_assembly', 'VALID_alloy', 'VALID_agda', 'VALID_emacs-lisp', 'VALID_dart', 'VALID_cuda', 'VALID_bluespec', 'VALID_augeas', 'VALID_batchfile', 'VALID_tcsh', 'VALID_stan', 'VALID_scala', 'VALID_tcl', 'VALID_stata', 'VALID_applescript', 'VALID_shell', 'VALID_clojure', 'VALID_scheme', 'VALID_antlr', 'VALID_sparql', 'VALID_sql', 'VALID_glsl', 'VALID_elm', 'VALID_dockerfile', 'VALID_cpp', 'VALID_coffeescript', 'VALID_common-lisp', 'VALID_elixir', 'VALID_groovy', 'VALID_html', 'VALID_java', 'VALID_javascript', 'VALID_markdown', 'VALID_php', 'VALID_python', 'VALID_typescript', 'VALID_verilog', 'VALID_visual-basic', 'VALID_vhdl', 'VALID_thrift', 'VALID_matlab', 'VALID_yacc', 'VALID_zig', 'VALID_xslt', 'VALID_json', 'VALID_yaml', 'VALID_gh_issues', 'VALID_gh_commits', 'VALID_notebook_scripts', 'VALID_notebook_structured', 'VALID_all_sources_weighted'] +[ip-26-0-150-122:0]: valid_weighted_split_paths ...................... [['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document'], 
['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document'], 
['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document'], 
['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document', 
'/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document', 
'/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document']] +[ip-26-0-150-122:0]: valid_weighted_split_paths_path ................. 
None +[ip-26-0-150-122:0]: valid_weighted_split_splits ..................... [['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999']] +[ip-26-0-150-122:0]: valid_weighted_split_weights .................... 
[['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['3.0', '0.01', '53.89', '1.78', '0.85', '5.68', '0.01', '1.31', '0.98', '0.08', '0.03', '0.09', '1.12', '23.78', '0.7', '0.61', '0.26', '1.68', '2.23', '0.3', '0.31', '0.45', '0.12', '6.81', '9.11', '0.06', '44.66', '0.58', '2.23', '0.01', '1.25', '1.03', '1.31', '2.87', '0.01', '0.05', '3.32', '0.03', '0.19', '0.39', '5.2', '0.02', '1.56', '0.01', '0.07', '0.41', '3.66', '0.56', '0.03', '0.001', '0.23', '0.02', '0.01', '4.69', '0.35', '0.33', '0.01', '3.09', '0.46', '0.2', '0.05', '0.04', '11.09', '0.4', '0.3', '0.42', '48.92', '0.64', '1.4', '0.71', '0.91', '29.36', '86.94', '64.71', '74.93', '60.89', '60.4', '26.52', '0.001', '1.42', '0.94', '0.01', '0.0002', '0.11', '0.18', '0.05', '1.0', '1.0', '54.4', '32.0', '7.12', '6.0']] +[ip-26-0-150-122:0]: virtual_pipeline_model_parallel_size ............ None +[ip-26-0-150-122:0]: vision_backbone_type ............................ vit +[ip-26-0-150-122:0]: vision_pretraining .............................. False +[ip-26-0-150-122:0]: vision_pretraining_type ......................... classify +[ip-26-0-150-122:0]: vocab_extra_ids ................................. 0 +[ip-26-0-150-122:0]: vocab_file ...................................... None +[ip-26-0-150-122:0]: wandb_entity_name ............................... loubnabnl +[ip-26-0-150-122:0]: wandb_project_name .............................. 1b-model +[ip-26-0-150-122:0]: weight_decay .................................... 0.1 +[ip-26-0-150-122:0]: weight_decay_incr_style ......................... constant +[ip-26-0-150-122:0]: world_size ...................................... 64 +[ip-26-0-150-122:0]:-------------------- end of arguments --------------------- +[ip-26-0-150-122:0]:setting number of micro-batches to constant 1 +[ip-26-0-150-122:0]:> building TokenizerFromFile tokenizer ... +[ip-26-0-150-122:0]: > padded vocab (size: 49152) with 0 dummy tokens (new size: 49152) +[ip-26-0-150-122:0]:> initializing torch distributed ... +[ip-26-0-155-69:7]:> setting tensorboard ... +[ip-26-0-150-122:0]:> initializing tensor model parallel with size 1 +[ip-26-0-150-122:0]:> initializing pipeline model parallel with size 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:07,667 [Rank 0]: > setting random seeds to 1234 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:07,669 [Rank 0]: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +[ip-26-0-150-122:0]:2023-06-21 17:27:07,669 [Rank 0]: > compiling dataset index builder ... +[ip-26-0-150-122:0]:make: Entering directory '/fsx/loubna/code/Megatron-LM/megatron/data' +[ip-26-0-150-122:0]:make: Nothing to be done for 'default'. +[ip-26-0-150-122:0]:make: Leaving directory '/fsx/loubna/code/Megatron-LM/megatron/data' +[ip-26-0-150-122:0]:2023-06-21 17:27:07,730 [Rank 0]: >>> done with dataset index builder. 
Compilation time: 0.061 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:07,730 [Rank 0]: > compiling and loading fused kernels ... +[ip-26-0-150-122:0]:2023-06-21 17:27:07,843 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:07,843 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:07,848 [Rank 0]: Building extension module scaled_upper_triang_masked_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:07,848 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:07,966 [Rank 0]: Loading extension module scaled_upper_triang_masked_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,063 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:08,063 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,066 [Rank 0]: Building extension module scaled_masked_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,066 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:08,184 [Rank 0]: Loading extension module scaled_masked_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,283 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:08,283 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,285 [Rank 0]: Building extension module scaled_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,285 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:08,400 [Rank 0]: Loading extension module scaled_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,523 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:08,523 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,525 [Rank 0]: Building extension module fused_mix_prec_layer_norm_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,525 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:08,639 [Rank 0]: Loading extension module fused_mix_prec_layer_norm_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,734 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:08,734 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,736 [Rank 0]: Building extension module fused_dense_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,736 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:08,853 [Rank 0]: Loading extension module fused_dense_cuda... 
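Note on the fused-kernel lines above: Megatron JIT-builds its fused softmax, layer-norm and dense CUDA extensions through PyTorch's C++ extension loader, and the repeated "ninja: no work to do." lines show that the build cache under megatron/fused_kernels/build was already warm on this rank, so each module is only loaded rather than recompiled. A minimal sketch of that kind of JIT build follows; the source file names and compiler flags are illustrative assumptions, not Megatron's exact helper.

    # Minimal sketch of JIT-building one fused CUDA kernel with PyTorch's C++
    # extension loader, as in the log above. File names and flags are assumptions.
    from torch.utils import cpp_extension

    scaled_masked_softmax_cuda = cpp_extension.load(
        name="scaled_masked_softmax_cuda",
        sources=[  # assumed source layout for illustration
            "megatron/fused_kernels/scaled_masked_softmax.cpp",
            "megatron/fused_kernels/scaled_masked_softmax_cuda.cu",
        ],
        build_directory="megatron/fused_kernels/build",  # matches the build.ninja path in the log
        extra_cuda_cflags=["-O3", "--use_fast_math"],     # illustrative flags
        verbose=True,
    )
    # On a warm cache ninja reports "no work to do" and the module is simply loaded.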
+[ip-26-0-150-122:0]:2023-06-21 17:27:20,633 [Rank 0]: >>> done with compiling and loading fused kernels. Compilation time: 12.903 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:22,218 [Rank 0]: time to initialize megatron (seconds): 20.911 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,220 [Rank 0]: [after megatron is initialized] datetime: 2023-06-21 17:27:22 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,220 [Rank 0]: building GPT model ... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,697 [Rank 0]: > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1137207296 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,894 [Rank 0]: > learning rate decay style: cosine +[ip-26-0-150-122:0]:2023-06-21 17:27:22,908 [Rank 0]: WARNING: could not find the metadata file /fsx/bigcode/experiments/pretraining/1b-starcoder/latest_checkpointed_iteration.txt +[ip-26-0-150-122:0]:2023-06-21 17:27:22,909 [Rank 0]: will not load any checkpoints and will start from random +[ip-26-0-155-69:7]:2023-06-21 17:27:22,912 [Rank 63]: time (ms) | load-checkpoint: 5.72 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-06-21 17:27:22 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: > building train, validation, and test datasets ... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: > datasets target sizes (minimum size): +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: train: 9600000 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: validation: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: test: 128 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: > building train, validation, and test datasets for GPT ... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: > finished creating indexed dataset in 0.017856 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: number of documents: 2721616 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:22,931 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:22,931 [Rank 0]: document indices in [0, 2637246) total of 2637246 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:22,982 [Rank 0]: > Tokens per epoch: 4672499910 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,983 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:22,983 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:23,104 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.121029 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2637246 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 570373 +[ip-26-0-150-122:0]:2023-06-21 17:27:23,137 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.032361 +[ip-26-0-150-122:0]:2023-06-21 17:27:23,137 [Rank 0]: > building shuffle index with split [0, 570373) and [570373, 570373) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:23,156 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.018469 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,339 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_TRAIN_indexmap_37739ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,343 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_TRAIN_indexmap_37739ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,345 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_TRAIN_indexmap_37739ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,346 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:26,346 [Rank 0]: total number of samples: 570374 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,346 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,430 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,430 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,430 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: > finished creating indexed dataset in 0.000770 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: number of documents: 968 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: document indices in [0, 938) total of 938 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: > Tokens per epoch: 3695701 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,432 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
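Reading the per-language blocks: each one reports tokens per epoch, the 8192-token sequence length, and a derived sample count, and that count is simply the integer number of 8192-token windows the epoch provides (4672499910 // 8192 = 570373 for the first block above). A quick cross-check, with the caveat that Megatron's exact index-building formula may handle edge cases slightly differently:

    # Cross-check of the "total number of samples" figures in these blocks, for the
    # single-epoch case shown in this log. Numbers are copied from the log itself.
    def samples_per_epoch(tokens_per_epoch: int, seq_len: int = 8192) -> int:
        # one training sample consumes one seq_len-token window of the document stream
        return tokens_per_epoch // seq_len

    assert samples_per_epoch(4_672_499_910) == 570_373   # first block above
    print(samples_per_epoch(3_695_701))                  # 451, matching the 968-document block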
+[ip-26-0-150-122:0]:2023-06-21 17:27:26,432 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:26,436 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003263 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 938 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 451 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,438 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002331 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,438 [Rank 0]: > building shuffle index with split [0, 451) and [451, 451) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,440 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001790 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,470 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,475 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,479 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,482 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:26,482 [Rank 0]: total number of samples: 452 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,482 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,565 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,579 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: > finished creating indexed dataset in 0.014644 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: number of documents: 8536791 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: document indices in [0, 8272150) total of 8272150 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:26,680 [Rank 0]: > Tokens per epoch: 19732817127 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,681 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:26,681 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:27,074 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.392686 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8272150 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2408791 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,203 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.128198 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,203 [Rank 0]: > building shuffle index with split [0, 2408791) and [2408791, 2408791) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,271 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.068081 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,272 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_TRAIN_indexmap_677919ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,296 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_TRAIN_indexmap_677919ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,311 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_TRAIN_indexmap_677919ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,315 [Rank 0]: loaded indexed file in 0.043 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,315 [Rank 0]: total number of samples: 2408792 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,315 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,400 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: > finished creating indexed dataset in 0.001769 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: number of documents: 158792 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,403 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,403 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,403 [Rank 0]: document indices in [0, 153869) total of 153869 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:27,404 [Rank 0]: > Tokens per epoch: 654520539 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,405 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:27,405 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:27,413 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007231 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 153869 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 79897 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,418 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005225 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,418 [Rank 0]: > building shuffle index with split [0, 79897) and [79897, 79897) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,422 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004096 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,499 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_TRAIN_indexmap_22392ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,508 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_TRAIN_indexmap_22392ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,509 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_TRAIN_indexmap_22392ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,509 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,509 [Rank 0]: total number of samples: 79898 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,509 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,592 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,604 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,604 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,604 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: > finished creating indexed dataset in 0.012752 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: number of documents: 153194 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: document indices in [0, 148445) total of 148445 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:27,607 [Rank 0]: > Tokens per epoch: 277062287 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,608 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:27,608 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:27,616 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007522 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 148445 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 33821 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,619 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003392 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,619 [Rank 0]: > building shuffle index with split [0, 33821) and [33821, 33821) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,623 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003356 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,664 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_TRAIN_indexmap_10693ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,673 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_TRAIN_indexmap_10693ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,673 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_TRAIN_indexmap_10693ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,674 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,674 [Rank 0]: total number of samples: 33822 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,674 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,757 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,769 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,769 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,769 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,769 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: > finished creating indexed dataset in 0.012699 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: number of documents: 2239354 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: document indices in [0, 2169934) total of 2169934 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:27,816 [Rank 0]: > Tokens per epoch: 1397148734 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,818 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:27,818 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:27,911 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.093259 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2169934 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 170550 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,925 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.013573 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,925 [Rank 0]: > building shuffle index with split [0, 170550) and [170550, 170550) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,931 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.006101 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,932 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_TRAIN_indexmap_71453ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,949 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_TRAIN_indexmap_71453ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,952 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_TRAIN_indexmap_71453ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,954 [Rank 0]: loaded indexed file in 0.022 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,954 [Rank 0]: total number of samples: 170551 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,954 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,037 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,046 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,046 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,046 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,046 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: > finished creating indexed dataset in 0.009268 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: number of documents: 523 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: document indices in [0, 507) total of 507 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: > Tokens per epoch: 1923547 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,048 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
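The cached index maps loaded throughout this section follow a consistent naming pattern, visible in the kotlin paths just above: {tokenized-prefix}_{split}_indexmap_{samples}ns_{seqlen}sl_{seed}s_{doc|sample|shuffle}_idx.npy. The small helper below reproduces the observed convention; treat it as a description of these logged paths rather than the authoritative Megatron implementation.

    # Reconstructs the index-map cache paths seen in this log (observed convention only).
    def indexmap_path(data_prefix: str, split: str, num_samples: int,
                      seq_len: int, seed: int, kind: str) -> str:
        return (f"{data_prefix}_{split}_indexmap_"
                f"{num_samples}ns_{seq_len}sl_{seed}s_{kind}_idx.npy")

    print(indexmap_path(
        "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/"
        "gpt2-preprocessed_content_document",
        "TRAIN", 71453, 8192, 1234, "doc"))
    # .../kotlin/gpt2-preprocessed_content_document_TRAIN_indexmap_71453ns_8192sl_1234s_doc_idx.npy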
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,048 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,051 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002938 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 507 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 234 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,054 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003389 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,054 [Rank 0]: > building shuffle index with split [0, 234) and [234, 234) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,057 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002384 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,060 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,064 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,065 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,067 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,067 [Rank 0]: total number of samples: 235 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,067 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,151 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,166 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,166 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,166 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,166 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: > finished creating indexed dataset in 0.015570 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: number of documents: 295364 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: document indices in [0, 286208) total of 286208 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,169 [Rank 0]: > Tokens per epoch: 465259290 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,170 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,170 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,183 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.012969 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 286208 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 56794 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,188 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004081 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,188 [Rank 0]: > building shuffle index with split [0, 56794) and [56794, 56794) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,191 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003397 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,213 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,220 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,226 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,226 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,226 [Rank 0]: total number of samples: 56795 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,226 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,310 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,322 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,322 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,322 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,322 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: > finished creating indexed dataset in 0.013181 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: number of documents: 210816 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: document indices in [0, 204281) total of 204281 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,325 [Rank 0]: > Tokens per epoch: 280134685 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,326 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,326 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,336 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.009267 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 204281 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 34196 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,339 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003559 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,339 [Rank 0]: > building shuffle index with split [0, 34196) and [34196, 34196) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,342 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002761 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,375 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_TRAIN_indexmap_12329ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,386 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_TRAIN_indexmap_12329ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,386 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_TRAIN_indexmap_12329ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,387 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,387 [Rank 0]: total number of samples: 34197 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,387 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,470 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,473 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,473 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,473 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,473 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,474 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,474 [Rank 0]: > finished creating indexed dataset in 0.004061 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,474 [Rank 0]: number of documents: 5001 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,474 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,475 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,475 [Rank 0]: document indices in [0, 4846) total of 4846 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,475 [Rank 0]: > Tokens per epoch: 30040727 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,475 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,476 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,478 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002589 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4846 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3667 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,481 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002957 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,481 [Rank 0]: > building shuffle index with split [0, 3667) and [3667, 3667) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,483 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001854 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,493 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_TRAIN_indexmap_1007ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,497 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_TRAIN_indexmap_1007ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,498 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_TRAIN_indexmap_1007ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,500 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,500 [Rank 0]: total number of samples: 3668 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,500 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,583 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: > finished creating indexed dataset in 0.009055 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: number of documents: 8042 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: document indices in [0, 7793) total of 7793 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: > Tokens per epoch: 9515228 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,594 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,594 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,597 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002761 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7793 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1161 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,600 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002955 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,600 [Rank 0]: > building shuffle index with split [0, 1161) and [1161, 1161) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,602 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001982 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,607 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,611 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,611 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,612 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,612 [Rank 0]: total number of samples: 1162 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,612 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,695 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: > finished creating indexed dataset in 0.012909 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: number of documents: 16870 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: document indices in [0, 16347) total of 16347 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: > Tokens per epoch: 37114704 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,710 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,713 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003713 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16347 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4530 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,718 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004682 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,718 [Rank 0]: > building shuffle index with split [0, 4530) and [4530, 4530) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,720 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002166 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,746 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_TRAIN_indexmap_1133ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,751 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_TRAIN_indexmap_1133ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,752 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_TRAIN_indexmap_1133ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,754 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,754 [Rank 0]: total number of samples: 4531 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,754 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,838 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,855 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,855 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,855 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,855 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: > finished creating indexed dataset in 0.017919 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: number of documents: 267627 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: document indices in [0, 259331) total of 259331 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,858 [Rank 0]: > Tokens per epoch: 277947540 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,860 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,860 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,871 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.011573 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 259331 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 33929 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,875 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003671 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,875 [Rank 0]: > building shuffle index with split [0, 33929) and [33929, 33929) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,878 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002755 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,907 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_TRAIN_indexmap_14090ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,918 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_TRAIN_indexmap_14090ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,918 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_TRAIN_indexmap_14090ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,919 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,919 [Rank 0]: total number of samples: 33930 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,919 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,002 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: > finished creating indexed dataset in 0.014051 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,017 [Rank 0]: number of documents: 4700526 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,017 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,017 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,017 [Rank 0]: document indices in [0, 4554810) total of 4554810 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,086 [Rank 0]: > Tokens per epoch: 8260498119 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,086 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,087 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,312 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.225591 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4554810 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1008361 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,365 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.052079 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,365 [Rank 0]: > building shuffle index with split [0, 1008361) and [1008361, 1008361) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,399 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.034506 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,400 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_TRAIN_indexmap_299145ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,419 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_TRAIN_indexmap_299145ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,421 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_TRAIN_indexmap_299145ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,424 [Rank 0]: loaded indexed file in 0.024 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,424 [Rank 0]: total number of samples: 1008362 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,424 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,506 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,507 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,507 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,507 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,507 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: > finished creating indexed dataset in 0.002917 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: number of documents: 98447 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: document indices in [0, 95395) total of 95395 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,510 [Rank 0]: > Tokens per epoch: 218848651 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,510 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
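One more pattern worth noting: the "...ns" sample targets baked into these TRAIN index-map filenames scale with the per-language weights printed in the weighted-split arguments near the top of this section (css 37739/3.0, c 677919/53.89, kotlin 71453/5.68, go 299145/23.78 all come out at roughly 12580), which suggests each language receives a weight-proportional share of the 9,600,000-sample training target. The proportionality is an inference from these numbers, not something the log states explicitly.

    # Ratios computed from values that appear verbatim in this log; the near-constant
    # result suggests weight-proportional sampling, but that reading is an inference.
    pairs = [("css", 37739, 3.0), ("c", 677919, 53.89),
             ("kotlin", 71453, 5.68), ("go", 299145, 23.78)]
    for name, ns, weight in pairs:
        print(f"{name:7s} ns/weight = {ns / weight:.1f}")   # ~12579.7 each time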
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,511 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,515 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004892 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 95395 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 26714 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,520 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004217 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,520 [Rank 0]: > building shuffle index with split [0, 26714) and [26714, 26714) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,523 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002582 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,555 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_TRAIN_indexmap_8806ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,563 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_TRAIN_indexmap_8806ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,564 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_TRAIN_indexmap_8806ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,564 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,564 [Rank 0]: total number of samples: 26715 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,564 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,648 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,660 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,660 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,660 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,660 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: > finished creating indexed dataset in 0.013133 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: number of documents: 124066 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: document indices in [0, 120220) total of 120220 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,662 [Rank 0]: > Tokens per epoch: 158541495 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,663 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,663 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,669 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006329 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 120220 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 19353 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,673 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004047 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,674 [Rank 0]: > building shuffle index with split [0, 19353) and [19353, 19353) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,676 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002883 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,677 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_7674ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,686 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_7674ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,686 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_7674ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,686 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,687 [Rank 0]: total number of samples: 19354 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,687 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,770 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,777 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,777 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: > finished creating indexed dataset in 0.008467 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: number of documents: 30934 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,779 [Rank 0]: document indices in [0, 29975) total of 29975 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,779 [Rank 0]: > Tokens per epoch: 67801957 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,780 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,780 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,783 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003628 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 29975 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 8276 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,788 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004615 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,788 [Rank 0]: > building shuffle index with split [0, 8276) and [8276, 8276) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,790 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002294 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,832 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_TRAIN_indexmap_3271ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,837 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_TRAIN_indexmap_3271ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,838 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_TRAIN_indexmap_3271ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,838 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,838 [Rank 0]: total number of samples: 8277 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,838 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,922 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,935 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,935 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,935 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,935 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: > finished creating indexed dataset in 0.013900 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: number of documents: 110981 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: document indices in [0, 107541) total of 107541 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,937 [Rank 0]: > Tokens per epoch: 664777580 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,938 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,938 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,945 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006143 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 107541 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 81149 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,949 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004054 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,949 [Rank 0]: > building shuffle index with split [0, 81149) and [81149, 81149) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,953 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003844 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,953 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_TRAIN_indexmap_21134ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,961 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_TRAIN_indexmap_21134ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,962 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_TRAIN_indexmap_21134ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,963 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,963 [Rank 0]: total number of samples: 81150 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,963 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,046 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: > finished creating indexed dataset in 0.015842 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: number of documents: 365491 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: document indices in [0, 354161) total of 354161 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,065 [Rank 0]: > Tokens per epoch: 785360896 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,066 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,066 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,082 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.015095 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 354161 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 95869 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,087 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005352 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,087 [Rank 0]: > building shuffle index with split [0, 95869) and [95869, 95869) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,092 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005175 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,093 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,103 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,105 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,105 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,105 [Rank 0]: total number of samples: 95870 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,106 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,189 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,201 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,201 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,201 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,201 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,202 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: > finished creating indexed dataset in 0.013115 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: number of documents: 39042 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: document indices in [0, 37832) total of 37832 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: > Tokens per epoch: 101034661 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,204 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,204 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,208 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003872 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 37832 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 12333 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,211 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002852 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,211 [Rank 0]: > building shuffle index with split [0, 12333) and [12333, 12333) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,214 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002238 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,256 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,260 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,261 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,261 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,261 [Rank 0]: total number of samples: 12334 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,261 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,345 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,359 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,359 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,359 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,359 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: > finished creating indexed dataset in 0.015110 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: number of documents: 97167 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: document indices in [0, 94155) total of 94155 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,361 [Rank 0]: > Tokens per epoch: 97494653 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,363 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,363 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,369 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006694 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 94155 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 11901 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,375 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005147 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,375 [Rank 0]: > building shuffle index with split [0, 11901) and [11901, 11901) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,377 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001879 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,377 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_TRAIN_indexmap_3900ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,386 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_TRAIN_indexmap_3900ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,386 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_TRAIN_indexmap_3900ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,387 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,387 [Rank 0]: total number of samples: 11902 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,387 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,471 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,483 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,483 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,483 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: > finished creating indexed dataset in 0.012784 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: number of documents: 186375 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: document indices in [0, 180597) total of 180597 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,485 [Rank 0]: > Tokens per epoch: 146595317 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,486 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,486 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,495 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.008443 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 180597 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17894 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,499 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003904 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,499 [Rank 0]: > building shuffle index with split [0, 17894) and [17894, 17894) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,502 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002908 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,502 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_TRAIN_indexmap_5661ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,511 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_TRAIN_indexmap_5661ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,511 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_TRAIN_indexmap_5661ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,512 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,512 [Rank 0]: total number of samples: 17895 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,512 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,595 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,599 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,600 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,600 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,600 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: > finished creating indexed dataset in 0.005655 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: number of documents: 9226 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: document indices in [0, 8940) total of 8940 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: > Tokens per epoch: 51420995 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,602 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,602 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,605 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003138 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8940 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6276 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,608 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002635 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,608 [Rank 0]: > building shuffle index with split [0, 6276) and [6276, 6276) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,610 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001816 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,617 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_TRAIN_indexmap_1510ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,623 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_TRAIN_indexmap_1510ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,624 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_TRAIN_indexmap_1510ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,624 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,624 [Rank 0]: total number of samples: 6277 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,624 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,708 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,727 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,727 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: > finished creating indexed dataset in 0.019740 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: number of documents: 3390320 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: document indices in [0, 3285220) total of 3285220 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,786 [Rank 0]: > Tokens per epoch: 1939961305 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,787 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,787 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,932 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.144127 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3285220 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 236811 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,957 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.024971 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,957 [Rank 0]: > building shuffle index with split [0, 236811) and [236811, 236811) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,965 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.008261 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,966 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_TRAIN_indexmap_85668ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,984 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_TRAIN_indexmap_85668ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,986 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_TRAIN_indexmap_85668ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,988 [Rank 0]: loaded indexed file in 0.022 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,988 [Rank 0]: total number of samples: 236812 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,988 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,070 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,086 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,086 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,086 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,086 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: > finished creating indexed dataset in 0.016757 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: number of documents: 1380468 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: document indices in [0, 1337673) total of 1337673 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:31,119 [Rank 0]: > Tokens per epoch: 2604422294 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,121 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:31,121 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:31,173 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.052313 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1337673 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 317922 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,185 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.011786 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,185 [Rank 0]: > building shuffle index with split [0, 317922) and [317922, 317922) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,196 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.010673 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,197 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_TRAIN_indexmap_114601ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,212 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_TRAIN_indexmap_114601ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,214 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_TRAIN_indexmap_114601ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,216 [Rank 0]: loaded indexed file in 0.019 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,216 [Rank 0]: total number of samples: 317923 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,216 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,299 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,309 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,309 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,309 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,309 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,310 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,310 [Rank 0]: > finished creating indexed dataset in 0.011284 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: number of documents: 5386 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: document indices in [0, 5219) total of 5219 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: > Tokens per epoch: 18878105 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:31,312 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:31,314 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002104 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5219 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2304 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,317 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002797 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,317 [Rank 0]: > building shuffle index with split [0, 2304) and [2304, 2304) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,319 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002167 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,327 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_TRAIN_indexmap_755ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,331 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_TRAIN_indexmap_755ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,333 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_TRAIN_indexmap_755ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,334 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,334 [Rank 0]: total number of samples: 2305 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,334 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,418 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,434 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,434 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: > finished creating indexed dataset in 0.017124 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: number of documents: 10801285 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: document indices in [0, 10466445) total of 10466445 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:31,562 [Rank 0]: > Tokens per epoch: 10146940270 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,563 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:31,563 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,100 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.536307 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 10466445 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1238640 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,212 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.112106 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,212 [Rank 0]: > building shuffle index with split [0, 1238640) and [1238640, 1238640) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,251 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.038956 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,252 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_561808ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,279 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_561808ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,283 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_561808ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,285 [Rank 0]: loaded indexed file in 0.033 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,285 [Rank 0]: total number of samples: 1238641 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,285 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,372 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,374 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,374 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: > finished creating indexed dataset in 0.002100 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: number of documents: 587748 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: document indices in [0, 569528) total of 569528 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,378 [Rank 0]: > Tokens per epoch: 191397544 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,380 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,380 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,403 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.022985 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 569528 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 23363 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,408 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004616 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,408 [Rank 0]: > building shuffle index with split [0, 23363) and [23363, 23363) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,411 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003384 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,430 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_TRAIN_indexmap_7297ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,443 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_TRAIN_indexmap_7297ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,443 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_TRAIN_indexmap_7297ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,444 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,444 [Rank 0]: total number of samples: 23364 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,444 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,527 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,536 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,537 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,537 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,537 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: > finished creating indexed dataset in 0.010590 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: number of documents: 541454 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: document indices in [0, 524669) total of 524669 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,542 [Rank 0]: > Tokens per epoch: 632376464 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,543 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,543 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,563 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.020094 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 524669 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 77194 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,569 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005697 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,569 [Rank 0]: > building shuffle index with split [0, 77194) and [77194, 77194) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,574 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004523 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,574 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,583 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,586 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,587 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,587 [Rank 0]: total number of samples: 77195 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,587 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,668 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,675 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: > finished creating indexed dataset in 0.007905 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: number of documents: 1152 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: document indices in [0, 1116) total of 1116 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,677 [Rank 0]: > Tokens per epoch: 1580323 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,677 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,677 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,679 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.001840 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1116 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 192 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,683 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003268 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,683 [Rank 0]: > building shuffle index with split [0, 192) and [192, 192) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,685 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002161 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,724 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,732 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,735 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,735 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,736 [Rank 0]: total number of samples: 193 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,736 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,819 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,830 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,830 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,830 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,830 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: > finished creating indexed dataset in 0.012427 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: number of documents: 22653 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: document indices in [0, 21951) total of 21951 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: > Tokens per epoch: 493660881 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,834 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,834 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,838 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004048 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 21951 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 60261 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,842 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003935 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,842 [Rank 0]: > building shuffle index with split [0, 60261) and [60261, 60261) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,846 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004030 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,847 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_TRAIN_indexmap_15725ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,852 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_TRAIN_indexmap_15725ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,854 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_TRAIN_indexmap_15725ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,856 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,856 [Rank 0]: total number of samples: 60262 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,856 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,940 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,956 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,956 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,956 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,956 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: > finished creating indexed dataset in 0.017241 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: number of documents: 158356 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: document indices in [0, 153447) total of 153447 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,958 [Rank 0]: > Tokens per epoch: 324030434 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,959 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,959 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,967 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007948 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 153447 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 39554 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,972 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004329 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,972 [Rank 0]: > building shuffle index with split [0, 39554) and [39554, 39554) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,975 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002937 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,975 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12958ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,981 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12958ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,982 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12958ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,983 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,983 [Rank 0]: total number of samples: 39555 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,983 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,066 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,082 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,082 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,082 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,082 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: > finished creating indexed dataset in 0.016469 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: number of documents: 657349 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: document indices in [0, 636971) total of 636971 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,087 [Rank 0]: > Tokens per epoch: 483958770 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,089 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,089 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,113 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.024414 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 636971 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 59076 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,118 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004994 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,119 [Rank 0]: > building shuffle index with split [0, 59076) and [59076, 59076) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,122 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003517 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,125 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,137 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,138 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,138 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,138 [Rank 0]: total number of samples: 59077 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,138 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,223 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,232 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,232 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: > finished creating indexed dataset in 0.009945 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: number of documents: 549459 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,234 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,234 [Rank 0]: document indices in [0, 532426) total of 532426 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,237 [Rank 0]: > Tokens per epoch: 991398359 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,238 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,238 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,259 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.020328 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 532426 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 121020 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,265 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005836 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,265 [Rank 0]: > building shuffle index with split [0, 121020) and [121020, 121020) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,270 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005562 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,297 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_TRAIN_indexmap_36104ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,313 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_TRAIN_indexmap_36104ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,321 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_TRAIN_indexmap_36104ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,321 [Rank 0]: loaded indexed file in 0.024 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,322 [Rank 0]: total number of samples: 121021 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,322 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,403 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: > finished creating indexed dataset in 0.000715 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: number of documents: 1133 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,405 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,405 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,405 [Rank 0]: document indices in [0, 1098) total of 1098 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,405 [Rank 0]: > Tokens per epoch: 1211172 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,406 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,406 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,408 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002220 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1098 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 147 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,410 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002143 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,410 [Rank 0]: > building shuffle index with split [0, 147) and [147, 147) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,413 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002741 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,445 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,452 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,452 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,452 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,452 [Rank 0]: total number of samples: 148 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,453 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,536 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,542 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,542 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,542 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,542 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: > finished creating indexed dataset in 0.006200 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: number of documents: 6104 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: document indices in [0, 5915) total of 5915 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: > Tokens per epoch: 16061021 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,544 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,544 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,547 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002938 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5915 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1960 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,549 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002421 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,550 [Rank 0]: > building shuffle index with split [0, 1960) and [1960, 1960) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,553 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003209 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,553 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,557 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,558 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,558 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,558 [Rank 0]: total number of samples: 1961 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,558 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,642 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,660 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,660 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,660 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,660 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,661 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: > finished creating indexed dataset in 0.019252 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: number of documents: 896880 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: document indices in [0, 869077) total of 869077 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,667 [Rank 0]: > Tokens per epoch: 1011350209 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,668 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,668 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,702 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.033750 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 869077 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 123455 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,710 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.007142 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,710 [Rank 0]: > building shuffle index with split [0, 123455) and [123455, 123455) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,718 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.008262 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,718 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_TRAIN_indexmap_41765ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,732 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_TRAIN_indexmap_41765ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,733 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_TRAIN_indexmap_41765ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,733 [Rank 0]: loaded indexed file in 0.015 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,733 [Rank 0]: total number of samples: 123456 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,734 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,816 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,821 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: > finished creating indexed dataset in 0.005900 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: number of documents: 3688 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,823 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,823 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,823 [Rank 0]: document indices in [0, 3574) total of 3574 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,823 [Rank 0]: > Tokens per epoch: 7491397 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,824 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,824 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,826 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002629 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3574 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 914 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,829 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002932 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,829 [Rank 0]: > building shuffle index with split [0, 914) and [914, 914) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,832 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002666 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,881 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,885 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,886 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,886 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,886 [Rank 0]: total number of samples: 915 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,886 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,970 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,981 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,981 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,981 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,981 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: > finished creating indexed dataset in 0.011794 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: number of documents: 19630 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: document indices in [0, 19021) total of 19021 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,983 [Rank 0]: > Tokens per epoch: 64556260 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,984 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,984 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,987 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003319 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 19021 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7880 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,990 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002864 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,990 [Rank 0]: > building shuffle index with split [0, 7880) and [7880, 7880) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,993 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002328 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,032 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_TRAIN_indexmap_2391ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,036 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_TRAIN_indexmap_2391ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,037 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_TRAIN_indexmap_2391ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,040 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,040 [Rank 0]: total number of samples: 7881 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,040 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,124 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,133 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,133 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,133 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,133 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,134 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: > finished creating indexed dataset in 0.010661 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: number of documents: 46270 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: document indices in [0, 44836) total of 44836 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: > Tokens per epoch: 145587797 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,136 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,136 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,140 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004174 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 44836 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17771 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,144 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003956 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,144 [Rank 0]: > building shuffle index with split [0, 17771) and [17771, 17771) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,147 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002519 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,187 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_TRAIN_indexmap_4907ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,192 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_TRAIN_indexmap_4907ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,192 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_TRAIN_indexmap_4907ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,193 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,193 [Rank 0]: total number of samples: 17772 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,193 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,277 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: > finished creating indexed dataset in 0.018105 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: number of documents: 522778 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: document indices in [0, 506572) total of 506572 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,299 [Rank 0]: > Tokens per epoch: 1833973827 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,301 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,301 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,321 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.020087 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 506572 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 223873 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,329 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.007429 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,329 [Rank 0]: > building shuffle index with split [0, 223873) and [223873, 223873) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,337 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.007948 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,337 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_TRAIN_indexmap_65415ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,344 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_TRAIN_indexmap_65415ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,345 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_TRAIN_indexmap_65415ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,346 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,346 [Rank 0]: total number of samples: 223874 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,346 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,430 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,441 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,441 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,441 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,441 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: > finished creating indexed dataset in 0.011031 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: number of documents: 10289 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: document indices in [0, 9970) total of 9970 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: > Tokens per epoch: 7959007 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,443 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,444 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,446 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002748 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 9970 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 971 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,449 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002232 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,449 [Rank 0]: > building shuffle index with split [0, 971) and [971, 971) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,450 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001675 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,465 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,471 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,479 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,479 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,479 [Rank 0]: total number of samples: 972 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,479 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,563 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,577 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,577 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,577 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,577 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: > finished creating indexed dataset in 0.014680 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: number of documents: 247919 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: document indices in [0, 240234) total of 240234 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,580 [Rank 0]: > Tokens per epoch: 774529956 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,581 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,581 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,592 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.011126 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 240234 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 94547 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,597 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004907 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,598 [Rank 0]: > building shuffle index with split [0, 94547) and [94547, 94547) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,602 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004326 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,624 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_TRAIN_indexmap_19625ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,635 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_TRAIN_indexmap_19625ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,635 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_TRAIN_indexmap_19625ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,636 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,636 [Rank 0]: total number of samples: 94548 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,636 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,720 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: > finished creating indexed dataset in 0.007609 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: number of documents: 5368 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: document indices in [0, 5202) total of 5202 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: > Tokens per epoch: 3049652 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,729 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,729 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,732 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002340 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5202 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 372 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,734 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002822 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,735 [Rank 0]: > building shuffle index with split [0, 372) and [372, 372) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,737 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002442 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,788 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,792 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,792 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,794 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,794 [Rank 0]: total number of samples: 373 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,794 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,878 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,889 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,889 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,889 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,889 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: > finished creating indexed dataset in 0.011380 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: number of documents: 17554 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: document indices in [0, 17010) total of 17010 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: > Tokens per epoch: 31798875 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,892 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,892 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,895 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003489 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17010 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3881 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,898 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002290 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,898 [Rank 0]: > building shuffle index with split [0, 3881) and [3881, 3881) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,900 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002261 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,917 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_TRAIN_indexmap_881ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,923 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_TRAIN_indexmap_881ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,926 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_TRAIN_indexmap_881ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,928 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,928 [Rank 0]: total number of samples: 3882 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,928 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,012 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,023 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,023 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,023 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,023 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: > finished creating indexed dataset in 0.011803 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: number of documents: 52838 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: document indices in [0, 51200) total of 51200 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,025 [Rank 0]: > Tokens per epoch: 122908675 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,025 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,026 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,030 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004827 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 51200 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 15003 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,034 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003850 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,034 [Rank 0]: > building shuffle index with split [0, 15003) and [15003, 15003) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,038 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003031 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,038 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_5158ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,045 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_5158ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,046 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_5158ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,046 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,046 [Rank 0]: total number of samples: 15004 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,046 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,130 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,145 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,145 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,145 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: > finished creating indexed dataset in 0.016123 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: number of documents: 928415 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,147 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,147 [Rank 0]: document indices in [0, 899634) total of 899634 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,152 [Rank 0]: > Tokens per epoch: 909176364 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,153 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,153 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,186 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.032443 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 899634 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 110983 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,193 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006793 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,193 [Rank 0]: > building shuffle index with split [0, 110983) and [110983, 110983) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,199 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005465 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,199 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_TRAIN_indexmap_46042ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,213 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_TRAIN_indexmap_46042ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,220 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_TRAIN_indexmap_46042ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,220 [Rank 0]: loaded indexed file in 0.021 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,221 [Rank 0]: total number of samples: 110984 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,221 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,304 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,313 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,313 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,313 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,313 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: > finished creating indexed dataset in 0.010065 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: number of documents: 58151 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: document indices in [0, 56348) total of 56348 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,315 [Rank 0]: > Tokens per epoch: 185296479 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,316 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,316 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,320 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004477 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 56348 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 22619 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,323 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002550 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,323 [Rank 0]: > building shuffle index with split [0, 22619) and [22619, 22619) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,326 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003331 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,367 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_TRAIN_indexmap_7045ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,373 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_TRAIN_indexmap_7045ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,373 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_TRAIN_indexmap_7045ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,375 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,375 [Rank 0]: total number of samples: 22620 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,375 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,460 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: > finished creating indexed dataset in 0.010700 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: number of documents: 5928 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: document indices in [0, 5744) total of 5744 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: > Tokens per epoch: 10076335 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,472 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,475 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003244 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5744 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1230 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,478 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003426 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,479 [Rank 0]: > building shuffle index with split [0, 1230) and [1230, 1230) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,481 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002004 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,483 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,487 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,487 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,488 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,488 [Rank 0]: total number of samples: 1231 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,488 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,572 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,582 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,582 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,582 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,582 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: > finished creating indexed dataset in 0.011400 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: number of documents: 180 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: document indices in [0, 174) total of 174 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: > Tokens per epoch: 173017 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,585 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,585 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,588 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002756 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 174 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 21 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,591 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002442 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,591 [Rank 0]: > building shuffle index with split [0, 21) and [21, 21) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,592 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001762 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,597 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,601 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,601 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,602 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,602 [Rank 0]: total number of samples: 22 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,602 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,686 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,706 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,706 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,706 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,706 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: > finished creating indexed dataset in 0.020667 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: number of documents: 239568 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: document indices in [0, 232141) total of 232141 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,709 [Rank 0]: > Tokens per epoch: 91736699 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,709 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,709 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,720 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.010457 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 232141 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 11198 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,723 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003149 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,723 [Rank 0]: > building shuffle index with split [0, 11198) and [11198, 11198) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,725 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001997 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,726 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_TRAIN_indexmap_2894ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,736 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_TRAIN_indexmap_2894ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,737 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_TRAIN_indexmap_2894ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,737 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,737 [Rank 0]: total number of samples: 11199 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,737 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,821 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,829 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,829 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,829 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,829 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: > finished creating indexed dataset in 0.009548 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: number of documents: 4806 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: document indices in [0, 4657) total of 4657 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: > Tokens per epoch: 6417550 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,832 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,833 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,872 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.039575 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4657 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 783 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,876 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003780 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,876 [Rank 0]: > building shuffle index with split [0, 783) and [783, 783) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,883 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.006587 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,883 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,888 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,891 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,893 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,893 [Rank 0]: total number of samples: 784 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,893 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,978 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,988 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,988 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,988 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,988 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: > finished creating indexed dataset in 0.011637 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: number of documents: 5429 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: document indices in [0, 5261) total of 5261 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,991 [Rank 0]: > Tokens per epoch: 5171243 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,992 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,992 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,995 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003284 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5261 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 631 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,999 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003735 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,999 [Rank 0]: > building shuffle index with split [0, 631) and [631, 631) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,002 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002413 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,002 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,008 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,009 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,009 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,009 [Rank 0]: total number of samples: 632 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,009 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,093 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,109 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,109 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,109 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,109 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: > finished creating indexed dataset in 0.016389 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: number of documents: 1355788 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: document indices in [0, 1313759) total of 1313759 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,136 [Rank 0]: > Tokens per epoch: 1259346636 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,137 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,137 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,188 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.050271 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1313759 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 153728 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,198 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.009984 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,198 [Rank 0]: > building shuffle index with split [0, 153728) and [153728, 153728) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,204 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.006081 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,205 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_TRAIN_indexmap_58999ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,221 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_TRAIN_indexmap_58999ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,222 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_TRAIN_indexmap_58999ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,223 [Rank 0]: loaded indexed file in 0.018 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,223 [Rank 0]: total number of samples: 153729 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,223 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,306 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,317 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,317 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,317 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,317 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: > finished creating indexed dataset in 0.011732 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: number of documents: 49335 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,319 [Rank 0]: document indices in [0, 47806) total of 47806 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,319 [Rank 0]: > Tokens per epoch: 118964691 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,320 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,320 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,324 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004637 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 47806 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 14522 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,328 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003610 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,328 [Rank 0]: > building shuffle index with split [0, 14522) and [14522, 14522) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,333 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004287 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,333 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_TRAIN_indexmap_4403ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,341 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_TRAIN_indexmap_4403ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,341 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_TRAIN_indexmap_4403ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,341 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,342 [Rank 0]: total number of samples: 14523 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,342 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,425 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: > finished creating indexed dataset in 0.006905 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: number of documents: 24208 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: document indices in [0, 23458) total of 23458 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: > Tokens per epoch: 211084584 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,434 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,434 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,437 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003057 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 23458 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 25767 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,440 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003007 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,440 [Rank 0]: > building shuffle index with split [0, 25767) and [25767, 25767) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,443 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003457 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,448 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_TRAIN_indexmap_4152ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,453 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_TRAIN_indexmap_4152ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,454 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_TRAIN_indexmap_4152ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,455 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,455 [Rank 0]: total number of samples: 25768 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,455 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,539 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,547 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,547 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,547 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: > finished creating indexed dataset in 0.008639 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: number of documents: 4737 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: document indices in [0, 4590) total of 4590 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: > Tokens per epoch: 2509212 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,549 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,550 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,552 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002849 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4590 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 306 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,555 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002691 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,555 [Rank 0]: > building shuffle index with split [0, 306) and [306, 306) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,557 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002040 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,562 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,566 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,567 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,567 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,567 [Rank 0]: total number of samples: 307 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,567 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,652 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: > finished creating indexed dataset in 0.018015 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: number of documents: 2206327 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: document indices in [0, 2137931) total of 2137931 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,715 [Rank 0]: > Tokens per epoch: 1047952508 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,718 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,718 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,806 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.088241 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2137931 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 127923 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,821 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.014389 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,821 [Rank 0]: > building shuffle index with split [0, 127923) and [127923, 127923) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,827 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005672 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,827 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_TRAIN_indexmap_38872ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,844 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_TRAIN_indexmap_38872ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,852 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_TRAIN_indexmap_38872ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,852 [Rank 0]: loaded indexed file in 0.025 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,852 [Rank 0]: total number of samples: 127924 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,852 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,936 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,948 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,948 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,948 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,948 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,949 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,949 [Rank 0]: > finished creating indexed dataset in 0.013536 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,949 [Rank 0]: number of documents: 125163 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,950 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,950 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,950 [Rank 0]: document indices in [0, 121283) total of 121283 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,951 [Rank 0]: > Tokens per epoch: 130456741 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,952 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,952 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,959 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006978 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 121283 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 15924 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,963 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004428 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,963 [Rank 0]: > building shuffle index with split [0, 15924) and [15924, 15924) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,967 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003343 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,967 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_TRAIN_indexmap_5787ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,976 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_TRAIN_indexmap_5787ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,977 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_TRAIN_indexmap_5787ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,977 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,977 [Rank 0]: total number of samples: 15925 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,977 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,061 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,070 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,070 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,070 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,070 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: > finished creating indexed dataset in 0.009922 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: number of documents: 41890 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,072 [Rank 0]: document indices in [0, 40591) total of 40591 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,072 [Rank 0]: > Tokens per epoch: 63430707 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,073 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,073 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,078 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004754 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 40591 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7743 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,081 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003213 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,081 [Rank 0]: > building shuffle index with split [0, 7743) and [7743, 7743) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,084 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002101 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,084 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_TRAIN_indexmap_2516ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,089 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_TRAIN_indexmap_2516ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,090 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_TRAIN_indexmap_2516ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,090 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,090 [Rank 0]: total number of samples: 7744 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,090 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,174 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,183 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,183 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,183 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,183 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: > finished creating indexed dataset in 0.009798 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: number of documents: 7917 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,185 [Rank 0]: document indices in [0, 7672) total of 7672 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,185 [Rank 0]: > Tokens per epoch: 16598658 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,186 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,186 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,190 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003758 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7672 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2026 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,194 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003965 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,194 [Rank 0]: > building shuffle index with split [0, 2026) and [2026, 2026) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,196 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002376 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,197 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,202 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,203 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,203 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,203 [Rank 0]: total number of samples: 2027 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,203 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,287 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,298 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,298 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,298 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,298 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: > finished creating indexed dataset in 0.011681 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: number of documents: 13716 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: document indices in [0, 13291) total of 13291 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: > Tokens per epoch: 15425176 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,300 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,300 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,303 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003304 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 13291 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1882 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,308 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004424 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,308 [Rank 0]: > building shuffle index with split [0, 1882) and [1882, 1882) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,310 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002266 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,351 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_TRAIN_indexmap_504ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,356 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_TRAIN_indexmap_504ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,357 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_TRAIN_indexmap_504ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,357 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,357 [Rank 0]: total number of samples: 1883 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,357 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,441 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,460 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,460 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,460 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,460 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: > finished creating indexed dataset in 0.019475 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: number of documents: 975420 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: document indices in [0, 945182) total of 945182 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,467 [Rank 0]: > Tokens per epoch: 5267734886 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,469 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,469 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,503 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.033926 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 945182 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 643034 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,518 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.015037 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,518 [Rank 0]: > building shuffle index with split [0, 643034) and [643034, 643034) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,538 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.019418 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,538 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_TRAIN_indexmap_139509ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,554 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_TRAIN_indexmap_139509ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,558 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_TRAIN_indexmap_139509ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,559 [Rank 0]: loaded indexed file in 0.021 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,559 [Rank 0]: total number of samples: 643035 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,559 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,642 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,648 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,648 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,648 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: > finished creating indexed dataset in 0.007028 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: number of documents: 167701 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: document indices in [0, 162502) total of 162502 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,651 [Rank 0]: > Tokens per epoch: 170250515 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,651 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,652 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,659 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007797 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 162502 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 20782 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,663 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003554 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,663 [Rank 0]: > building shuffle index with split [0, 20782) and [20782, 20782) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,666 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002317 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,686 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_TRAIN_indexmap_5032ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,696 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_TRAIN_indexmap_5032ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,697 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_TRAIN_indexmap_5032ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,697 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,697 [Rank 0]: total number of samples: 20783 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,697 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,781 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,789 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,790 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,790 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,790 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: > finished creating indexed dataset in 0.009533 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: number of documents: 62033 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: document indices in [0, 60110) total of 60110 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: > Tokens per epoch: 73268168 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,793 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,793 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,797 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004324 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 60110 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 8943 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,800 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003085 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,800 [Rank 0]: > building shuffle index with split [0, 8943) and [8943, 8943) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,803 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002478 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,842 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,847 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,847 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,847 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,848 [Rank 0]: total number of samples: 8944 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,848 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,932 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,949 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,949 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,949 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,949 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,950 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: > finished creating indexed dataset in 0.018653 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: number of documents: 571506 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: document indices in [0, 553789) total of 553789 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,954 [Rank 0]: > Tokens per epoch: 142265394 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,955 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,955 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,976 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.020984 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 553789 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17366 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,981 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004753 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,981 [Rank 0]: > building shuffle index with split [0, 17366) and [17366, 17366) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,985 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003595 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,985 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_TRAIN_indexmap_5284ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,991 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_TRAIN_indexmap_5284ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,993 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_TRAIN_indexmap_5284ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,994 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,994 [Rank 0]: total number of samples: 17367 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,994 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,078 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,095 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,095 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,095 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,095 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: > finished creating indexed dataset in 0.017258 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: number of documents: 6353527 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: document indices in [0, 6156568) total of 6156568 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:38,177 [Rank 0]: > Tokens per epoch: 15680764197 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,178 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:38,179 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:38,481 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.302279 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6156568 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1914155 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,559 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.077728 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,559 [Rank 0]: > building shuffle index with split [0, 1914155) and [1914155, 1914155) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,612 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.052875 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,613 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_TRAIN_indexmap_615398ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,634 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_TRAIN_indexmap_615398ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,637 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_TRAIN_indexmap_615398ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,640 [Rank 0]: loaded indexed file in 0.027 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,640 [Rank 0]: total number of samples: 1914156 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,640 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,725 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: > finished creating indexed dataset in 0.003068 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: number of documents: 226209 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: document indices in [0, 219197) total of 219197 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:38,730 [Rank 0]: > Tokens per epoch: 179407601 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,731 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
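
The "total number of samples" reported for each dataset above is consistent with packing one epoch of tokens into fixed 8192-token sequences. A minimal sketch of that arithmetic, assuming the (num_epochs * tokens_per_epoch - 1) // seq_length rule; the rule and the helper name are inferred from the logged values, not quoted from the training code:

def packed_sample_count(tokens_per_epoch: int, seq_length: int, num_epochs: int = 1) -> int:
    # The "- 1" is presumably because each sample needs seq_length + 1 tokens
    # (inputs plus the shifted labels); this reproduces the counts in the log.
    return (num_epochs * tokens_per_epoch - 1) // seq_length

# (tokens per epoch, samples built) pairs copied from the log above:
for tokens, samples in [(1_047_952_508, 127_923), (15_680_764_197, 1_914_155)]:
    assert packed_sample_count(tokens, 8192) == samples

The same relation holds for the other blocks, e.g. 16,598,658 tokens per epoch at 8192 tokens per sample gives the 2,026 samples reported earlier.
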
+[ip-26-0-150-122:0]:2023-06-21 17:27:38,731 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:38,740 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.009260 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 219197 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 21900 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,744 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003982 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,744 [Rank 0]: > building shuffle index with split [0, 21900) and [21900, 21900) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,749 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004086 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,781 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_8051ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,791 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_8051ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,791 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_8051ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,792 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,792 [Rank 0]: total number of samples: 21901 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,792 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,925 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: > finished creating indexed dataset in 0.001779 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: number of documents: 98733 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: document indices in [0, 95672) total of 95672 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:38,929 [Rank 0]: > Tokens per epoch: 476152050 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,931 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:38,931 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:38,939 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007621 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 95672 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 58124 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,943 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003995 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,943 [Rank 0]: > building shuffle index with split [0, 58124) and [58124, 58124) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,948 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004955 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,948 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_17612ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,956 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_17612ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,957 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_17612ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,959 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,959 [Rank 0]: total number of samples: 58125 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,959 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,075 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: > finished creating indexed dataset in 0.001933 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: number of documents: 281016 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: document indices in [0, 272305) total of 272305 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:39,080 [Rank 0]: > Tokens per epoch: 212250969 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,080 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:39,080 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:39,093 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.012500 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 272305 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 25909 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,097 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003708 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,097 [Rank 0]: > building shuffle index with split [0, 25909) and [25909, 25909) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,099 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002505 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,100 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_TRAIN_indexmap_8932ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,106 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_TRAIN_indexmap_8932ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,107 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_TRAIN_indexmap_8932ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,109 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,109 [Rank 0]: total number of samples: 25910 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,109 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,194 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: > finished creating indexed dataset in 0.016318 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: number of documents: 250834 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: document indices in [0, 243058) total of 243058 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:39,212 [Rank 0]: > Tokens per epoch: 222150396 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,213 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:39,213 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:39,224 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.011011 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 243058 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 27117 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,228 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004154 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,228 [Rank 0]: > building shuffle index with split [0, 27117) and [27117, 27117) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,231 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002729 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,265 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_TRAIN_indexmap_11448ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,275 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_TRAIN_indexmap_11448ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,276 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_TRAIN_indexmap_11448ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,276 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,276 [Rank 0]: total number of samples: 27118 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,276 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,361 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,373 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,373 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: > finished creating indexed dataset in 0.012663 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: number of documents: 3299965 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: document indices in [0, 3197666) total of 3197666 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:39,434 [Rank 0]: > Tokens per epoch: 9536019084 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,435 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:39,435 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:39,577 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.141893 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3197666 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1164064 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,615 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.037362 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,615 [Rank 0]: > building shuffle index with split [0, 1164064) and [1164064, 1164064) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,649 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.034142 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,650 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_TRAIN_indexmap_369339ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,670 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_TRAIN_indexmap_369339ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,674 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_TRAIN_indexmap_369339ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,676 [Rank 0]: loaded indexed file in 0.026 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,676 [Rank 0]: total number of samples: 1164065 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,676 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,759 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,762 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,763 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,763 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,763 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: > finished creating indexed dataset in 0.004384 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: number of documents: 20071773 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: document indices in [0, 19449548) total of 19449548 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:40,012 [Rank 0]: > Tokens per epoch: 21964883896 +[ip-26-0-150-122:0]:2023-06-21 17:27:40,014 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:40,014 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:41,160 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 1.146600 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 19449548 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2681260 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,425 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.264485 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,425 [Rank 0]: > building shuffle index with split [0, 2681260) and [2681260, 2681260) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,508 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.082354 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,508 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_TRAIN_indexmap_1093676ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:41,535 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_TRAIN_indexmap_1093676ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:41,539 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_TRAIN_indexmap_1093676ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:41,541 [Rank 0]: loaded indexed file in 0.033 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:41,541 [Rank 0]: total number of samples: 2681261 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,541 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,625 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: > finished creating indexed dataset in 0.018318 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: number of documents: 19544285 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: document indices in [0, 18938412) total of 18938412 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:41,862 [Rank 0]: > Tokens per epoch: 18328788838 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,863 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:41,863 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:42,981 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 1.117496 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 18938412 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2237400 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,231 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.249909 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,231 [Rank 0]: > building shuffle index with split [0, 2237400) and [2237400, 2237400) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,295 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.063840 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,296 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_TRAIN_indexmap_814030ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:43,328 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_TRAIN_indexmap_814030ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:43,332 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_TRAIN_indexmap_814030ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:43,334 [Rank 0]: loaded indexed file in 0.039 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:43,335 [Rank 0]: total number of samples: 2237401 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,335 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,421 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,437 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,437 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,437 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,437 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: > finished creating indexed dataset in 0.016427 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: number of documents: 21029287 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: document indices in [0, 20377379) total of 20377379 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:43,674 [Rank 0]: > Tokens per epoch: 24642614919 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,675 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:43,675 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:44,910 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 1.234511 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 20377379 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3008131 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,186 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.276196 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,186 [Rank 0]: > building shuffle index with split [0, 3008131) and [3008131, 3008131) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,274 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.087909 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,275 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_TRAIN_indexmap_942595ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:45,282 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_TRAIN_indexmap_942595ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:45,285 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_TRAIN_indexmap_942595ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:45,287 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:45,288 [Rank 0]: total number of samples: 3008132 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,288 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,373 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: > finished creating indexed dataset in 0.023523 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: number of documents: 15683017 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:45,397 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:45,397 [Rank 0]: document indices in [0, 15196843) total of 15196843 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:45,570 [Rank 0]: > Tokens per epoch: 16296942573 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,572 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:45,572 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:46,384 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.812244 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15196843 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1989372 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,573 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.188375 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,573 [Rank 0]: > building shuffle index with split [0, 1989372) and [1989372, 1989372) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,629 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.055413 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,629 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_TRAIN_indexmap_765976ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:46,638 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_TRAIN_indexmap_765976ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:46,640 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_TRAIN_indexmap_765976ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:46,642 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:46,642 [Rank 0]: total number of samples: 1989373 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,642 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,727 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,744 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,744 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,744 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,744 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: > finished creating indexed dataset in 0.017557 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: number of documents: 12866649 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: document indices in [0, 12467783) total of 12467783 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:46,898 [Rank 0]: > Tokens per epoch: 17087509450 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,899 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:46,899 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:47,537 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.637768 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 12467783 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2085877 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,700 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.163215 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,700 [Rank 0]: > building shuffle index with split [0, 2085877) and [2085877, 2085877) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,761 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.060786 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,762 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_TRAIN_indexmap_759812ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:47,788 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_TRAIN_indexmap_759812ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:47,793 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_TRAIN_indexmap_759812ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:47,795 [Rank 0]: loaded indexed file in 0.033 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:47,795 [Rank 0]: total number of samples: 2085878 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,795 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,879 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,896 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: > finished creating indexed dataset in 0.017633 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: number of documents: 10547331 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: document indices in [0, 10220364) total of 10220364 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:48,059 [Rank 0]: > Tokens per epoch: 7178711685 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,060 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:48,061 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:48,570 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.509525 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 10220364 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 876307 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,675 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.104243 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,675 [Rank 0]: > building shuffle index with split [0, 876307) and [876307, 876307) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,701 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.026336 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,702 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_TRAIN_indexmap_333613ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,727 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_TRAIN_indexmap_333613ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,730 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_TRAIN_indexmap_333613ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,732 [Rank 0]: loaded indexed file in 0.030 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:48,732 [Rank 0]: total number of samples: 876308 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,732 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,814 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: > finished creating indexed dataset in 0.000670 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: number of documents: 75 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: document indices in [0, 73) total of 73 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:48,816 [Rank 0]: > Tokens per epoch: 153326 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,817 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:48,817 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:48,820 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002213 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 73 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 18 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,825 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005342 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,825 [Rank 0]: > building shuffle index with split [0, 18) and [18, 18) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,827 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002082 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,868 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,872 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,872 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,874 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:48,874 [Rank 0]: total number of samples: 19 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,874 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,959 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,973 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,973 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,973 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,973 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: > finished creating indexed dataset in 0.014628 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: number of documents: 161239 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: document indices in [0, 156241) total of 156241 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:48,976 [Rank 0]: > Tokens per epoch: 362410000 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,978 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:48,978 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:48,986 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007948 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 156241 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 44239 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,989 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003449 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,989 [Rank 0]: > building shuffle index with split [0, 44239) and [44239, 44239) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,992 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002924 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,993 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_TRAIN_indexmap_17864ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,003 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_TRAIN_indexmap_17864ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,003 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_TRAIN_indexmap_17864ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,004 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,004 [Rank 0]: total number of samples: 44240 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,004 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,086 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,093 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,094 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,094 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,094 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: > finished creating indexed dataset in 0.008249 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: number of documents: 58208 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: document indices in [0, 56404) total of 56404 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: > Tokens per epoch: 366255320 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,097 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
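The "total number of samples" reported for each shard follows from its "Tokens per epoch" and the 8192-token sequence length: documents are packed back to back and cut into fixed windows, with one extra token reserved for the shifted labels. The exact formula below is an assumption, but it reproduces the values logged above for three shards:

def samples_per_epoch(tokens_per_epoch, seq_len=8192):
    # assumed: floor((tokens - 1) / seq_len); the -1 accounts for the label shift
    return (tokens_per_epoch - 1) // seq_len

assert samples_per_epoch(7_178_711_685) == 876_307  # typescript shard
assert samples_per_epoch(153_326) == 18              # verilog shard
assert samples_per_epoch(362_410_000) == 44_239      # visual-basic shard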
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,097 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,101 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004261 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 56404 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 44708 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,105 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004413 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,106 [Rank 0]: > building shuffle index with split [0, 44708) and [44708, 44708) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,109 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003122 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,146 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_TRAIN_indexmap_11825ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,154 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_TRAIN_indexmap_11825ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,154 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_TRAIN_indexmap_11825ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,155 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,155 [Rank 0]: total number of samples: 44709 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,155 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,239 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,246 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,247 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,247 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,247 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: > finished creating indexed dataset in 0.008633 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: number of documents: 4661 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: document indices in [0, 4517) total of 4517 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: > Tokens per epoch: 3469924 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,249 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,250 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,253 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003701 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4517 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 423 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,256 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003052 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,257 [Rank 0]: > building shuffle index with split [0, 423) and [423, 423) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,259 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002293 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,262 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,266 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,267 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,268 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,268 [Rank 0]: total number of samples: 424 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,268 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,353 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,364 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: > finished creating indexed dataset in 0.011978 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: number of documents: 93 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: document indices in [0, 90) total of 90 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: > Tokens per epoch: 74220 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,366 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
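Messages such as "building shuffle index with split [0, 423) and [423, 423)" describe a two-part shuffle: the first range covers the samples of all complete epochs and the second covers a separately shuffled final epoch. For these single-epoch TRAIN shards the second range is empty, which is why both bounds coincide. A small sketch of that construction, written as assumed behaviour rather than the Megatron-LM source:

import numpy as np

def build_shuffle_idx(num_first, total, seed=1234):
    rng = np.random.RandomState(seed)
    first = np.arange(num_first, dtype=np.int64)   # complete-epoch samples, shuffled together
    rng.shuffle(first)
    if num_first == total:                          # no separate last epoch
        return first
    last = np.arange(num_first, total, dtype=np.int64)
    rng.shuffle(last)                               # tail shuffled on its own
    return np.concatenate((first, last))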
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,366 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,369 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002468 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 90 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 9 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,372 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003386 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,372 [Rank 0]: > building shuffle index with split [0, 9) and [9, 9) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,375 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002402 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,375 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_TRAIN_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,379 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_TRAIN_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,382 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_TRAIN_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,383 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,383 [Rank 0]: total number of samples: 10 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,383 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,467 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,475 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,475 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,475 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,475 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: > finished creating indexed dataset in 0.008526 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: number of documents: 7451 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: document indices in [0, 7220) total of 7220 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,477 [Rank 0]: > Tokens per epoch: 35201031 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,478 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,478 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,482 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004081 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7220 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4297 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,486 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003545 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,486 [Rank 0]: > building shuffle index with split [0, 4297) and [4297, 4297) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,488 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002185 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,488 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_TRAIN_indexmap_1384ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,493 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_TRAIN_indexmap_1384ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,496 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_TRAIN_indexmap_1384ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,498 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,498 [Rank 0]: total number of samples: 4298 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,499 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,583 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,593 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: > finished creating indexed dataset in 0.010884 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: number of documents: 15850 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: document indices in [0, 15359) total of 15359 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,595 [Rank 0]: > Tokens per epoch: 55447717 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,596 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
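A small off-by-one recurs in every shard: the build step reports e.g. "total number of samples: 4297" for yacc, while the post-load line says 4298. The usual reading is that the sample-idx array stores sample boundaries (document position, offset), so N usable samples need N + 1 rows and the loader prints the row count. A toy illustration of boundary packing; it is illustrative only, not the exact Megatron-LM algorithm:

import numpy as np

def build_sample_boundaries(doc_lengths, seq_len):
    # Pack documents back to back and record a boundary every seq_len tokens.
    boundaries = [(0, 0)]                  # (document position, offset) of sample 0
    doc, offset, remaining = 0, 0, seq_len
    while doc < len(doc_lengths):
        available = doc_lengths[doc] - offset
        if available > remaining:          # the current sample ends inside this document
            offset += remaining
            boundaries.append((doc, offset))
            remaining = seq_len
        else:                              # consume the rest of this document
            remaining -= available
            doc, offset = doc + 1, 0
    return np.array(boundaries, dtype=np.int64)

b = build_sample_boundaries([5, 7, 4], seq_len=4)
print(len(b) - 1, "samples from", b.shape[0], "boundary rows")   # 3 samples, 4 rows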
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,596 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,604 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007724 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15359 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6768 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,608 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003874 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,608 [Rank 0]: > building shuffle index with split [0, 6768) and [6768, 6768) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,616 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.008218 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,616 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_TRAIN_indexmap_2265ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,622 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_TRAIN_indexmap_2265ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,624 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_TRAIN_indexmap_2265ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,626 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,626 [Rank 0]: total number of samples: 6769 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,626 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,711 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,717 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,717 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,717 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: > finished creating indexed dataset in 0.007133 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: number of documents: 42103 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: document indices in [0, 40798) total of 40798 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,719 [Rank 0]: > Tokens per epoch: 136106399 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,720 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,720 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,724 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004394 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 40798 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 16614 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,728 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003517 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,728 [Rank 0]: > building shuffle index with split [0, 16614) and [16614, 16614) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,731 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002808 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,771 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,779 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,779 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,780 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,780 [Rank 0]: total number of samples: 16615 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,780 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,864 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,878 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,878 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,878 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,878 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: > finished creating indexed dataset in 0.014641 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: number of documents: 4751547 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: document indices in [0, 4604249) total of 4604249 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,952 [Rank 0]: > Tokens per epoch: 2031305386 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,954 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
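The "reading sizes ... reading pointers ... creating numpy buffer of mmap" sequence is the indexed dataset being opened without pulling the token stream into RAM: per-document sizes and byte offsets come from the index, and the token payload is exposed through a NumPy memmap. A generic sketch of that access pattern; the real Megatron .idx binary layout is not reproduced here, and the three file paths plus the uint16 dtype are assumptions:

import numpy as np

class MMapTokenDataset:
    def __init__(self, bin_path, sizes_path, pointers_path, dtype=np.uint16):
        self.sizes = np.load(sizes_path, mmap_mode="r")        # tokens per document
        self.pointers = np.load(pointers_path, mmap_mode="r")  # byte offset per document
        self.buffer = np.memmap(bin_path, dtype=dtype, mode="r")
        self.itemsize = np.dtype(dtype).itemsize

    def __len__(self):
        return len(self.sizes)

    def __getitem__(self, doc):
        start = self.pointers[doc] // self.itemsize
        return self.buffer[start:start + self.sizes[doc]]       # token ids of one document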
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,954 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:50,150 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.196015 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4604249 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 247962 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,184 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.033755 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,184 [Rank 0]: > building shuffle index with split [0, 247962) and [247962, 247962) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,193 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.008546 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,193 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,213 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,214 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,215 [Rank 0]: loaded indexed file in 0.022 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:50,215 [Rank 0]: total number of samples: 247963 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,215 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,297 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: > finished creating indexed dataset in 0.009135 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: number of documents: 3995948 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,307 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:50,307 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:50,307 [Rank 0]: document indices in [0, 3872074) total of 3872074 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:50,382 [Rank 0]: > Tokens per epoch: 1165518004 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,384 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:50,384 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:50,549 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.165272 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3872074 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 142275 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,577 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.026964 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,577 [Rank 0]: > building shuffle index with split [0, 142275) and [142275, 142275) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,582 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005657 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,583 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,590 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,591 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,593 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:50,593 [Rank 0]: total number of samples: 142276 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,593 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,677 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,695 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,695 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,695 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,695 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: > finished creating indexed dataset in 0.018911 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: number of documents: 30982955 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: document indices in [0, 30022483) total of 30022483 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:51,045 [Rank 0]: > Tokens per epoch: 17478333988 +[ip-26-0-150-122:0]:2023-06-21 17:27:51,046 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:51,047 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:52,985 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 1.938788 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 30022483 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2133585 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,391 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.405525 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,391 [Rank 0]: > building shuffle index with split [0, 2133585) and [2133585, 2133585) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,451 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.060010 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,452 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_TRAIN_indexmap_684334ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:53,496 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_TRAIN_indexmap_684334ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:53,500 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_TRAIN_indexmap_684334ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:53,503 [Rank 0]: loaded indexed file in 0.051 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:53,503 [Rank 0]: total number of samples: 2133586 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,503 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,587 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,603 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,603 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: > finished creating indexed dataset in 0.016830 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: number of documents: 7634718 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,605 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:53,605 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:53,605 [Rank 0]: document indices in [0, 7398042) total of 7398042 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:53,700 [Rank 0]: > Tokens per epoch: 15747857063 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,701 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
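The doc-idx map that took 1.94 s to build for the 30M-document gh_issues shard is, in effect, a shuffled permutation of the shard's document ids, repeated once per epoch (a single epoch for all the large TRAIN shards here). A hedged sketch of that step, assuming the last epoch is not kept separate as the log states:

import numpy as np

def build_doc_idx(num_documents, num_epochs=1, seed=1234):
    # Repeat the document ids once per epoch, then shuffle globally.
    rng = np.random.RandomState(seed)
    doc_idx = np.tile(np.arange(num_documents, dtype=np.int32), num_epochs)
    rng.shuffle(doc_idx)
    return doc_idx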
+[ip-26-0-150-122:0]:2023-06-21 17:27:53,701 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:54,072 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.370953 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7398042 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1922345 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,181 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.108519 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,181 [Rank 0]: > building shuffle index with split [0, 1922345) and [1922345, 1922345) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,235 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.053852 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,235 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_TRAIN_indexmap_402550ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,258 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_TRAIN_indexmap_402550ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,260 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_TRAIN_indexmap_402550ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,262 [Rank 0]: loaded indexed file in 0.027 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,262 [Rank 0]: total number of samples: 1922346 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,262 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,348 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: > finished creating indexed dataset in 0.003456 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: number of documents: 914510 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: document indices in [0, 886160) total of 886160 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:54,358 [Rank 0]: > Tokens per epoch: 2392372458 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,359 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:54,360 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:54,396 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.036130 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 886160 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 292037 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,406 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.010301 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,406 [Rank 0]: > building shuffle index with split [0, 292037) and [292037, 292037) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,415 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.009125 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,416 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_TRAIN_indexmap_89568ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,429 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_TRAIN_indexmap_89568ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,430 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_TRAIN_indexmap_89568ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,431 [Rank 0]: loaded indexed file in 0.015 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,431 [Rank 0]: total number of samples: 292038 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,432 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,516 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: > finished creating indexed dataset in 0.011625 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,528 [Rank 0]: number of documents: 668743 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,528 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:54,528 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:54,528 [Rank 0]: document indices in [0, 648012) total of 648012 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:54,532 [Rank 0]: > Tokens per epoch: 1927094062 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,533 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:54,533 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:54,557 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.024563 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 648012 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 235240 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,568 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.010390 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,568 [Rank 0]: > building shuffle index with split [0, 235240) and [235240, 235240) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,576 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.007555 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,576 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_TRAIN_indexmap_75478ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,588 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_TRAIN_indexmap_75478ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,589 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_TRAIN_indexmap_75478ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,590 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,590 [Rank 0]: total number of samples: 235241 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,590 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:> building indices for blendable datasets ... 
+[ip-26-0-150-122:0]: > sample ratios: +[ip-26-0-150-122:0]: dataset 0, input: 0.00391159, achieved: 0.00391159 +[ip-26-0-150-122:0]: dataset 1, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 2, input: 0.0702651, achieved: 0.0702651 +[ip-26-0-150-122:0]: dataset 3, input: 0.00232087, achieved: 0.00232087 +[ip-26-0-150-122:0]: dataset 4, input: 0.00110828, achieved: 0.00110827 +[ip-26-0-150-122:0]: dataset 5, input: 0.00740594, achieved: 0.00740593 +[ip-26-0-150-122:0]: dataset 6, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 7, input: 0.00170806, achieved: 0.00170807 +[ip-26-0-150-122:0]: dataset 8, input: 0.00127778, achieved: 0.00127778 +[ip-26-0-150-122:0]: dataset 9, input: 0.000104309, achieved: 0.000104303 +[ip-26-0-150-122:0]: dataset 10, input: 3.91159e-05, achieved: 3.91303e-05 +[ip-26-0-150-122:0]: dataset 11, input: 0.000117348, achieved: 0.000117357 +[ip-26-0-150-122:0]: dataset 12, input: 0.00146033, achieved: 0.00146034 +[ip-26-0-150-122:0]: dataset 13, input: 0.0310058, achieved: 0.0310058 +[ip-26-0-150-122:0]: dataset 14, input: 0.000912704, achieved: 0.000912716 +[ip-26-0-150-122:0]: dataset 15, input: 0.000795356, achieved: 0.000795359 +[ip-26-0-150-122:0]: dataset 16, input: 0.000339004, achieved: 0.000339018 +[ip-26-0-150-122:0]: dataset 17, input: 0.00219049, achieved: 0.00219049 +[ip-26-0-150-122:0]: dataset 18, input: 0.00290761, achieved: 0.00290762 +[ip-26-0-150-122:0]: dataset 19, input: 0.000391159, achieved: 0.000391169 +[ip-26-0-150-122:0]: dataset 20, input: 0.000404197, achieved: 0.00040419 +[ip-26-0-150-122:0]: dataset 21, input: 0.000586738, achieved: 0.000586753 +[ip-26-0-150-122:0]: dataset 22, input: 0.000156463, achieved: 0.000156454 +[ip-26-0-150-122:0]: dataset 23, input: 0.0088793, achieved: 0.00887929 +[ip-26-0-150-122:0]: dataset 24, input: 0.0118782, achieved: 0.0118782 +[ip-26-0-150-122:0]: dataset 25, input: 7.82317e-05, achieved: 7.8227e-05 +[ip-26-0-150-122:0]: dataset 26, input: 0.0582305, achieved: 0.0582305 +[ip-26-0-150-122:0]: dataset 27, input: 0.00075624, achieved: 0.000756228 +[ip-26-0-150-122:0]: dataset 28, input: 0.00290761, achieved: 0.00290762 +[ip-26-0-150-122:0]: dataset 29, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 30, input: 0.00162983, achieved: 0.00162981 +[ip-26-0-150-122:0]: dataset 31, input: 0.00134298, achieved: 0.00134298 +[ip-26-0-150-122:0]: dataset 32, input: 0.00170806, achieved: 0.00170804 +[ip-26-0-150-122:0]: dataset 33, input: 0.00374208, achieved: 0.00374208 +[ip-26-0-150-122:0]: dataset 34, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 35, input: 6.51931e-05, achieved: 6.5206e-05 +[ip-26-0-150-122:0]: dataset 36, input: 0.00432882, achieved: 0.00432883 +[ip-26-0-150-122:0]: dataset 37, input: 3.91159e-05, achieved: 3.91303e-05 +[ip-26-0-150-122:0]: dataset 38, input: 0.000247734, achieved: 0.000247736 +[ip-26-0-150-122:0]: dataset 39, input: 0.000508506, achieved: 0.000508493 +[ip-26-0-150-122:0]: dataset 40, input: 0.00678008, achieved: 0.00678008 +[ip-26-0-150-122:0]: dataset 41, input: 2.60772e-05, achieved: 2.60757e-05 +[ip-26-0-150-122:0]: dataset 42, input: 0.00203403, achieved: 0.00203404 +[ip-26-0-150-122:0]: dataset 43, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 44, input: 9.12704e-05, achieved: 9.12817e-05 +[ip-26-0-150-122:0]: dataset 45, input: 0.000534584, achieved: 0.000534568 +[ip-26-0-150-122:0]: dataset 46, input: 0.00477214, achieved: 0.00477212 
+[ip-26-0-150-122:0]: dataset 47, input: 0.000730163, achieved: 0.000730153 +[ip-26-0-150-122:0]: dataset 48, input: 3.91159e-05, achieved: 3.91303e-05 +[ip-26-0-150-122:0]: dataset 49, input: 1.30386e-06, achieved: 1.3122e-06 +[ip-26-0-150-122:0]: dataset 50, input: 0.000299888, achieved: 0.000299887 +[ip-26-0-150-122:0]: dataset 51, input: 2.60772e-05, achieved: 2.60757e-05 +[ip-26-0-150-122:0]: dataset 52, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 53, input: 0.00611511, achieved: 0.0061151 +[ip-26-0-150-122:0]: dataset 54, input: 0.000456352, achieved: 0.000456341 +[ip-26-0-150-122:0]: dataset 55, input: 0.000430275, achieved: 0.000430266 +[ip-26-0-150-122:0]: dataset 56, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 57, input: 0.00402893, achieved: 0.00402895 +[ip-26-0-150-122:0]: dataset 58, input: 0.000599777, achieved: 0.000599774 +[ip-26-0-150-122:0]: dataset 59, input: 0.000260772, achieved: 0.000260757 +[ip-26-0-150-122:0]: dataset 60, input: 6.51931e-05, achieved: 6.5206e-05 +[ip-26-0-150-122:0]: dataset 61, input: 5.21545e-05, achieved: 5.21514e-05 +[ip-26-0-150-122:0]: dataset 62, input: 0.0144598, achieved: 0.0144598 +[ip-26-0-150-122:0]: dataset 63, input: 0.000521545, achieved: 0.000521547 +[ip-26-0-150-122:0]: dataset 64, input: 0.000391159, achieved: 0.000391169 +[ip-26-0-150-122:0]: dataset 65, input: 0.000547622, achieved: 0.000547623 +[ip-26-0-150-122:0]: dataset 66, input: 0.0637849, achieved: 0.0637849 +[ip-26-0-150-122:0]: dataset 67, input: 0.000834472, achieved: 0.000834455 +[ip-26-0-150-122:0]: dataset 68, input: 0.00182541, achieved: 0.0018254 +[ip-26-0-150-122:0]: dataset 69, input: 0.000925742, achieved: 0.000925737 +[ip-26-0-150-122:0]: dataset 70, input: 0.00118651, achieved: 0.00118653 +[ip-26-0-150-122:0]: dataset 71, input: 0.0382814, achieved: 0.0382814 +[ip-26-0-150-122:0]: dataset 72, input: 0.113358, achieved: 0.113358 +[ip-26-0-150-122:0]: dataset 73, input: 0.0843729, achieved: 0.0843729 +[ip-26-0-150-122:0]: dataset 74, input: 0.0976984, achieved: 0.0976984 +[ip-26-0-150-122:0]: dataset 75, input: 0.0793922, achieved: 0.0793922 +[ip-26-0-150-122:0]: dataset 76, input: 0.0787533, achieved: 0.0787533 +[ip-26-0-150-122:0]: dataset 77, input: 0.0345784, achieved: 0.0345784 +[ip-26-0-150-122:0]: dataset 78, input: 1.30386e-06, achieved: 1.3122e-06 +[ip-26-0-150-122:0]: dataset 79, input: 0.00185148, achieved: 0.00185147 +[ip-26-0-150-122:0]: dataset 80, input: 0.00122563, achieved: 0.00122562 +[ip-26-0-150-122:0]: dataset 81, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 82, input: 2.60772e-07, achieved: 2.69168e-07 +[ip-26-0-150-122:0]: dataset 83, input: 0.000143425, achieved: 0.000143433 +[ip-26-0-150-122:0]: dataset 84, input: 0.000234695, achieved: 0.000234681 +[ip-26-0-150-122:0]: dataset 85, input: 6.51931e-05, achieved: 6.5206e-05 +[ip-26-0-150-122:0]: dataset 86, input: 0.00130386, achieved: 0.00130385 +[ip-26-0-150-122:0]: dataset 87, input: 0.00130386, achieved: 0.00130385 +[ip-26-0-150-122:0]: dataset 88, input: 0.0709301, achieved: 0.0709301 +[ip-26-0-150-122:0]: dataset 89, input: 0.0417236, achieved: 0.0417236 +[ip-26-0-150-122:0]: dataset 90, input: 0.0092835, achieved: 0.00928348 +[ip-26-0-150-122:0]: dataset 91, input: 0.00782317, achieved: 0.00782318 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,965 [Rank 0]: > elapsed time for building blendable dataset indices: 3.29 (sec) +[ip-26-0-150-122:0]:2023-06-21 17:27:57,966 [Rank 0]: > building dataset index ... 
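With all 92 component datasets loaded, they are blended by weight: "input" is the requested sampling fraction and "achieved" is the fraction actually realised once every global sample index has been assigned to a concrete dataset, which is why the two columns agree to five or six significant digits. Below is a pure-Python sketch of one greedy assignment that behaves this way; Megatron-LM does this in a compiled helper (which is what finishes in 3.29 s above), so this version is only for illustration and the exact tie-breaking is an assumption:

import numpy as np

def build_blending_indices(weights, total_samples):
    weights = np.asarray(weights, dtype=np.float64)
    weights = weights / weights.sum()
    counts = np.zeros(len(weights), dtype=np.int64)
    dataset_index = np.empty(total_samples, dtype=np.int64)        # which dataset
    dataset_sample_index = np.empty(total_samples, dtype=np.int64) # index within it
    for i in range(total_samples):
        target = weights * max(i, 1)            # ideal cumulative count per dataset
        j = int(np.argmax(target - counts))     # most under-served dataset so far
        dataset_index[i] = j
        dataset_sample_index[i] = counts[j]
        counts[j] += 1
    achieved = counts / total_samples           # the "achieved" column
    return dataset_index, dataset_sample_index, achieved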
+[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: > finished creating indexed dataset in 0.003214 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: number of documents: 2721616 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: VALID_css: +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: document indices in [2637246, 2718894) total of 81648 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:57,970 [Rank 0]: > Tokens per epoch: 142752310 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,972 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,972 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:57,977 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005069 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 81648 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17425 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,980 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002764 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,980 [Rank 0]: > building shuffle index with split [0, 17425) and [17425, 17425) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,983 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002703 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,165 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_css_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,174 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_css_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,174 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_css_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,175 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,175 [Rank 0]: total number of samples: 17426 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,175 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,259 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: creating memory view of numpy buffer... 
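The same machinery now runs for the per-language validation sets: each VALID_* split is a contiguous slice carved from near the tail of its shard's document index (for css, documents [2637246, 2718894) out of 2721616). A sketch of turning split fractions into such [start, end) ranges; the actual split string is not printed in this part of the log, but a 0.969/0.030/0.001 split reproduces every TRAIN and VALID range visible in this section, so it is used in the example:

def split_ranges(fractions, num_documents):
    # Cumulative fractions -> [start, end) document ranges, last bound clamped.
    fractions = [f / sum(fractions) for f in fractions]
    bounds, acc = [0], 0.0
    for f in fractions:
        acc += f
        bounds.append(int(round(acc * num_documents)))
    bounds[-1] = num_documents
    return [(bounds[i], bounds[i + 1]) for i in range(len(fractions))]

print(split_ranges([0.969, 0.030, 0.001], 2_721_616))
# [(0, 2637246), (2637246, 2718894), (2718894, 2721616)] -> 81648 css VALID documents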
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: > finished creating indexed dataset in 0.000721 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: number of documents: 968 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: VALID_prolog: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: document indices in [938, 967) total of 29 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: > Tokens per epoch: 55028 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,263 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,263 [Rank 0]: > last epoch number of samples (6) is larger than 80% of number of samples per epoch (6), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,266 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003192 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 29 +[ip-26-0-150-122:0]: number of epochs: 305 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,270 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004219 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,270 [Rank 0]: > building shuffle index with split [0, 2048) and [2048, 2048) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,273 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002822 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,324 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_prolog_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,328 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_prolog_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,329 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_prolog_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,331 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,331 [Rank 0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,331 [Rank 0]: total number of epochs: 305 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,415 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: > finished creating indexed dataset in 0.002199 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: number of documents: 8536791 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: VALID_c: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: document indices in [8272150, 8528254) total of 256104 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,420 [Rank 0]: > Tokens per epoch: 613576495 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,423 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,423 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,434 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.011330 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 256104 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 74899 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,439 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004647 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,439 [Rank 0]: > building shuffle index with split [0, 74899) and [74899, 74899) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,446 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.006539 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,458 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_c_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,467 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_c_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,468 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_c_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,468 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,469 [Rank 0]: total number of samples: 74900 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,469 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,551 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: > finished creating indexed dataset in 0.001810 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: number of documents: 158792 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,554 [Rank 0]: VALID_fortran: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,554 [Rank 0]: document indices in [153869, 158633) total of 4764 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,554 [Rank 0]: > Tokens per epoch: 18815887 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,556 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,556 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,560 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003936 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4764 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2296 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,563 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003311 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,563 [Rank 0]: > building shuffle index with split [0, 2296) and [2296, 2296) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,566 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003001 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,576 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_fortran_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,580 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_fortran_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,582 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_fortran_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,585 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,585 [Rank 0]: total number of samples: 2297 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,585 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,667 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: > finished creating indexed dataset in 0.002075 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: number of documents: 153194 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: VALID_solidity: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: document indices in [148445, 153041) total of 4596 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: > Tokens per epoch: 8220293 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,672 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,673 [Rank 0]: > last epoch number of samples (42) is smaller than 80% of number of samples per epoch (1003), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:58,676 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003494 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4596 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3010 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,683 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006305 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,683 [Rank 0]: > building shuffle index with split [0, 2006) and [2006, 3010) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,686 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003234 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,730 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_solidity_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,735 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_solidity_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,735 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_solidity_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,736 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,736 [Rank 0]: total number of samples: 3011 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,736 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,818 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,820 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,820 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,820 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,820 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: creating memory view of numpy buffer... 
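The VALID_solidity block shows the other branch of the separate_last_epoch decision: with 3 epochs, only 42 of the 2048 requested samples fall in the final epoch, well under 80% of the ~1003 samples a full epoch provides, so the last epoch is shuffled separately (compare VALID_prolog above, where 6 of 6 kept it at False). A minimal sketch of that rule, assuming the thresholds printed in the warning messages:

    # Sketch, assuming the decision rule implied by the log messages above.
    def separate_last_epoch(tokens_per_epoch, seq_length, num_samples, num_epochs):
        samples_per_epoch = (tokens_per_epoch - 1) // seq_length
        # samples already covered by the first (num_epochs - 1) complete epochs
        from_full_epochs = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_length
        last_epoch_samples = num_samples - from_full_epochs
        return last_epoch_samples < 0.80 * samples_per_epoch

    # VALID_solidity: 42 < 0.8 * 1003 -> True, hence the split shuffle ranges below
    print(separate_last_epoch(8220293, 8192, 2048, 3))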
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: > finished creating indexed dataset in 0.002482 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: number of documents: 2239354 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: VALID_kotlin: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: document indices in [2169934, 2237115) total of 67181 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,822 [Rank 0]: > Tokens per epoch: 43085225 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,824 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,824 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,829 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004974 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 67181 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 5259 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,832 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002326 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,832 [Rank 0]: > building shuffle index with split [0, 5259) and [5259, 5259) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,837 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004907 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,886 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_kotlin_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,892 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_kotlin_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,893 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_kotlin_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,894 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,894 [Rank 0]: total number of samples: 5260 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,894 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,976 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: > finished creating indexed dataset in 0.000769 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: number of documents: 523 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,978 [Rank 0]: VALID_literate-agda: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,978 [Rank 0]: document indices in [507, 522) total of 15 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,978 [Rank 0]: > Tokens per epoch: 46791 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,980 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,980 [Rank 0]: > last epoch number of samples (4) is larger than 80% of number of samples per epoch (5), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,984 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003511 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15 +[ip-26-0-150-122:0]: number of epochs: 359 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2050 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,987 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003382 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,987 [Rank 0]: > building shuffle index with split [0, 2050) and [2050, 2050) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,990 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003127 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,993 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_literate-agda_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,000 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_literate-agda_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,001 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_literate-agda_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,001 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,001 [Rank 0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,001 [Rank 0]: total number of epochs: 359 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,084 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,086 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: > finished creating indexed dataset in 0.002306 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: number of documents: 295364 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: VALID_julia: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: document indices in [286208, 295069) total of 8861 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: > Tokens per epoch: 13589070 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,090 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,090 [Rank 0]: > last epoch number of samples (390) is smaller than 80% of number of samples per epoch (1658), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,094 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003983 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8861 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3317 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,099 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004236 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,099 [Rank 0]: > building shuffle index with split [0, 1658) and [1658, 3317) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,101 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002714 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,102 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_julia_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,107 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_julia_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,107 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_julia_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,109 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,109 [Rank 0]: total number of samples: 3318 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,109 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,192 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: > finished creating indexed dataset in 0.002203 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: number of documents: 210816 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: VALID_java-server-pages: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: document indices in [204281, 210605) total of 6324 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: > Tokens per epoch: 8481384 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,198 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,198 [Rank 0]: > last epoch number of samples (1013) is larger than 80% of number of samples per epoch (1035), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:59,201 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002964 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6324 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2070 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,205 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003585 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,205 [Rank 0]: > building shuffle index with split [0, 2070) and [2070, 2070) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,207 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002521 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,213 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_java-server-pages_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,218 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_java-server-pages_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,218 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_java-server-pages_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,220 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,220 [Rank 0]: total number of samples: 2071 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,220 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,304 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,304 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,304 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,304 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: > finished creating indexed dataset in 0.000721 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: number of documents: 5001 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: VALID_isabelle: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: document indices in [4846, 4996) total of 150 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: > Tokens per epoch: 1014769 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,308 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,308 [Rank 0]: > last epoch number of samples (67) is smaller than 80% of number of samples per epoch (123), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,311 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003356 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 150 +[ip-26-0-150-122:0]: number of epochs: 17 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2105 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,314 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002580 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,314 [Rank 0]: > building shuffle index with split [0, 1981) and [1981, 2105) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,318 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003952 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,319 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_isabelle_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,326 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_isabelle_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,326 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_isabelle_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,327 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,327 [Rank 0]: total number of samples: 2106 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,327 [Rank 0]: total number of epochs: 17 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,409 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: creating memory view of numpy buffer... 
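Every split's cached maps follow the same naming pattern, visible in the isabelle paths above: the requested sample count, sequence length, and shuffle seed are baked into the filename, so a cached index is only reused when all three match. A small sketch of that pattern (the helper is hypothetical; the suffix format is taken directly from the log):

    # Sketch of the index-map naming pattern seen in the log paths.
    def index_map_paths(data_prefix, split_name, num_samples, seq_length, seed):
        stem = f"{data_prefix}_{split_name}_indexmap_{num_samples}ns_{seq_length}sl_{seed}s"
        return {kind: f"{stem}_{kind}_idx.npy" for kind in ("doc", "sample", "shuffle")}

    paths = index_map_paths(
        "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/"
        "gpt2-preprocessed_content_document",
        "VALID_isabelle", 2048, 8192, 1234)
    # paths["doc"] ends with "_VALID_isabelle_indexmap_2048ns_8192sl_1234s_doc_idx.npy"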
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: > finished creating indexed dataset in 0.000761 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: number of documents: 8042 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,411 [Rank 0]: VALID_idris: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,411 [Rank 0]: document indices in [7793, 8034) total of 241 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,411 [Rank 0]: > Tokens per epoch: 225513 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,414 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,414 [Rank 0]: > last epoch number of samples (11) is smaller than 80% of number of samples per epoch (27), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,418 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003664 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 241 +[ip-26-0-150-122:0]: number of epochs: 75 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2064 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,420 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002453 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,420 [Rank 0]: > building shuffle index with split [0, 2037) and [2037, 2064) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,423 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003065 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,424 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_idris_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,431 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_idris_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,431 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_idris_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,432 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,432 [Rank 0]: total number of samples: 2065 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,432 [Rank 0]: total number of epochs: 75 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,514 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: > finished creating indexed dataset in 0.000802 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: number of documents: 16870 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,516 [Rank 0]: VALID_lean: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,516 [Rank 0]: document indices in [16347, 16853) total of 506 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,516 [Rank 0]: > Tokens per epoch: 1042103 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,518 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,519 [Rank 0]: > last epoch number of samples (13) is smaller than 80% of number of samples per epoch (127), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,522 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003744 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 506 +[ip-26-0-150-122:0]: number of epochs: 17 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2162 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,526 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003023 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,526 [Rank 0]: > building shuffle index with split [0, 2035) and [2035, 2162) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,529 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003336 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,529 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_lean_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,534 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_lean_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,534 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_lean_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,535 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,535 [Rank 0]: total number of samples: 2163 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,535 [Rank 0]: total number of epochs: 17 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,618 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: > finished creating indexed dataset in 0.002193 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: number of documents: 267627 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: VALID_powershell: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: document indices in [259331, 267359) total of 8028 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: > Tokens per epoch: 8559847 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,624 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,624 [Rank 0]: > last epoch number of samples (1004) is larger than 80% of number of samples per epoch (1044), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:59,627 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002673 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8028 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2089 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,630 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003070 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,630 [Rank 0]: > building shuffle index with split [0, 2089) and [2089, 2089) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,636 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005995 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,636 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_powershell_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,641 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_powershell_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,643 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_powershell_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,646 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,646 [Rank 0]: total number of samples: 2090 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,646 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,729 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: > finished creating indexed dataset in 0.002337 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: number of documents: 4700526 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: VALID_go: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,732 [Rank 0]: document indices in [4554810, 4695825) total of 141015 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,733 [Rank 0]: > Tokens per epoch: 253353715 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,735 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,735 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:59,743 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007701 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 141015 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 30926 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,748 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004221 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,748 [Rank 0]: > building shuffle index with split [0, 30926) and [30926, 30926) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,752 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004390 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,753 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_go_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,758 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_go_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,758 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_go_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,760 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,760 [Rank 0]: total number of samples: 30927 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,760 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,842 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,843 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: creating memory view of numpy buffer... 
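VALID_go is the common single-epoch case: one pass over the 253M-token split already yields far more than the 2048 requested samples, so the whole epoch is indexed (30,926 samples) and no last-epoch handling is needed. The reloaded sample-idx then reports 30,927, presumably because it stores sample boundaries rather than samples (N sequences need N + 1 offsets). The arithmetic, as a sketch:

    # Sketch of the single-epoch sample count for VALID_go.
    tokens_per_epoch = 253_353_715   # "Tokens per epoch" above
    seq_length = 8_192
    num_samples = (tokens_per_epoch - 1) // seq_length
    print(num_samples)       # -> 30926, "total number of samples" as built
    print(num_samples + 1)   # -> 30927, as reported after loading the boundaries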
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: > finished creating indexed dataset in 0.001406 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: number of documents: 98447 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: VALID_erlang: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: document indices in [95395, 98349) total of 2954 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: > Tokens per epoch: 6597590 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,846 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,847 [Rank 0]: > last epoch number of samples (438) is smaller than 80% of number of samples per epoch (805), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,850 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003486 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2954 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2416 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,854 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003487 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,854 [Rank 0]: > building shuffle index with split [0, 1610) and [1610, 2416) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,857 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003391 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,864 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_erlang_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,868 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_erlang_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,870 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_erlang_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,872 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,873 [Rank 0]: total number of samples: 2417 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,873 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,956 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: > finished creating indexed dataset in 0.001523 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: number of documents: 124066 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: VALID_f-sharp: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: document indices in [120220, 123942) total of 3722 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: > Tokens per epoch: 4694260 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,961 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,961 [Rank 0]: > last epoch number of samples (329) is smaller than 80% of number of samples per epoch (573), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,964 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003069 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3722 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2292 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,968 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003844 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,968 [Rank 0]: > building shuffle index with split [0, 1719) and [1719, 2292) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,970 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002148 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,971 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_f-sharp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,976 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_f-sharp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,978 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_f-sharp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,978 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,979 [Rank 0]: total number of samples: 2293 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,979 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,061 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: > finished creating indexed dataset in 0.001108 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: number of documents: 30934 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: VALID_ada: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: document indices in [29975, 30903) total of 928 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,063 [Rank 0]: > Tokens per epoch: 2230554 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,066 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,066 [Rank 0]: > last epoch number of samples (143) is smaller than 80% of number of samples per epoch (272), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:00,070 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004065 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 928 +[ip-26-0-150-122:0]: number of epochs: 8 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2178 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,074 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004121 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,074 [Rank 0]: > building shuffle index with split [0, 1905) and [1905, 2178) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,077 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002964 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,122 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_ada_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_ada_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_ada_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: total number of samples: 2179 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: total number of epochs: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,215 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: > finished creating indexed dataset in 0.001481 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: number of documents: 110981 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: VALID_pascal: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: document indices in [107541, 110870) total of 3329 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: > Tokens per epoch: 21526929 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,219 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,219 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,222 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003074 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3329 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2627 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,225 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002768 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,225 [Rank 0]: > building shuffle index with split [0, 2627) and [2627, 2627) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,229 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004297 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,232 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_pascal_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,236 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_pascal_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,237 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_pascal_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,237 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,237 [Rank 0]: total number of samples: 2628 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,237 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,320 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,322 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,322 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,322 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,322 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: > finished creating indexed dataset in 0.002411 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: number of documents: 365491 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: VALID_perl: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: document indices in [354161, 365126) total of 10965 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: > Tokens per epoch: 25729670 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,325 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,325 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,329 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003576 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 10965 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3140 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,331 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002792 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,332 [Rank 0]: > building shuffle index with split [0, 3140) and [3140, 3140) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,335 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003522 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,341 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_perl_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,346 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_perl_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,346 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_perl_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,347 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,347 [Rank 0]: total number of samples: 3141 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,347 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,430 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,430 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,430 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: > finished creating indexed dataset in 0.000989 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: number of documents: 39042 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: VALID_r: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: document indices in [37832, 39003) total of 1171 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: > Tokens per epoch: 2880088 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,434 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,434 [Rank 0]: > last epoch number of samples (291) is larger than 80% of number of samples per epoch (351), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,438 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003234 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1171 +[ip-26-0-150-122:0]: number of epochs: 6 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2109 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,441 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003574 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,441 [Rank 0]: > building shuffle index with split [0, 2109) and [2109, 2109) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,444 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002730 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,447 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_r_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,453 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_r_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,458 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_r_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,459 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,459 [Rank 0]: total number of samples: 2110 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,459 [Rank 0]: total number of epochs: 6 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,542 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: > finished creating indexed dataset in 0.001363 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: number of documents: 97167 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,544 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,544 [Rank 0]: VALID_protocol-buffer: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,544 [Rank 0]: document indices in [94155, 97070) total of 2915 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,544 [Rank 0]: > Tokens per epoch: 2614634 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,547 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,547 [Rank 0]: > last epoch number of samples (133) is smaller than 80% of number of samples per epoch (319), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:00,551 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003422 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2915 +[ip-26-0-150-122:0]: number of epochs: 7 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2234 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,554 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003362 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,554 [Rank 0]: > building shuffle index with split [0, 1915) and [1915, 2234) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,557 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002918 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,557 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_protocol-buffer_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,562 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_protocol-buffer_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,562 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_protocol-buffer_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,563 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,563 [Rank 0]: total number of samples: 2235 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,563 [Rank 0]: total number of epochs: 7 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,646 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,647 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: creating memory view of numpy buffer... 
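When separate_last_epoch is True, the shuffle index itself is built in two pieces, as the "[0, 1915) and [1915, 2234)" message for VALID_protocol-buffer shows: the samples from the complete epochs and the samples from the final epoch are each permuted within their own range, so truncating to the requested count never mixes part of a partially used last epoch into the rest (when it is False, the second range is empty, e.g. "[2089, 2089)" for powershell). A minimal sketch of that construction; names and seed handling are assumptions:

    # Sketch: two independently shuffled ranges, concatenated.
    import numpy as np

    def build_shuffle_idx(num_first, total, seed=1234):
        rng = np.random.RandomState(seed)
        first = np.arange(num_first, dtype=np.int64)         # samples from full epochs
        rng.shuffle(first)
        last = np.arange(num_first, total, dtype=np.int64)   # last-epoch samples
        rng.shuffle(last)
        return np.concatenate((first, last))

    shuffle_idx = build_shuffle_idx(1915, 2234)  # ranges from the log line above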
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: > finished creating indexed dataset in 0.002081 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: number of documents: 186375 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: VALID_cmake: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: document indices in [180597, 186189) total of 5592 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: > Tokens per epoch: 4338734 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,651 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,651 [Rank 0]: > last epoch number of samples (460) is larger than 80% of number of samples per epoch (529), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,655 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003150 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5592 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2118 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,658 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003588 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,658 [Rank 0]: > building shuffle index with split [0, 2118) and [2118, 2118) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,661 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002840 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,666 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_cmake_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,671 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_cmake_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,672 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_cmake_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,673 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,673 [Rank 0]: total number of samples: 2119 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,673 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: > finished creating indexed dataset in 0.000735 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: number of documents: 9226 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: VALID_sas: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: document indices in [8940, 9217) total of 277 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: > Tokens per epoch: 1021218 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,761 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,761 [Rank 0]: > last epoch number of samples (54) is smaller than 80% of number of samples per epoch (124), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:00,764 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002904 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 277 +[ip-26-0-150-122:0]: number of epochs: 17 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2119 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,768 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003730 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,768 [Rank 0]: > building shuffle index with split [0, 1994) and [1994, 2119) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,771 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002562 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,771 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_sas_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,775 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_sas_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,776 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_sas_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,777 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,777 [Rank 0]: total number of samples: 2120 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,777 [Rank 0]: total number of epochs: 17 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,860 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,862 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,862 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,862 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: creating memory view of numpy buffer... 
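Every split writes and immediately reloads three cached arrays whose paths follow the same pattern. The sketch below spells out how the suffix seen in the load messages decodes, under the assumption (consistent with Megatron-LM's index-map naming, but not stated in this log itself) that 2048ns / 8192sl / 1234s stand for the requested number of samples, the sequence length and the shuffle seed; the helper name indexmap_prefix is illustrative only.

# Hypothetical helper reproducing the cache-file names visible above.
def indexmap_prefix(data_prefix, name, num_samples, seq_length, seed):
    return "{}_{}_indexmap_{}ns_{}sl_{}s".format(
        data_prefix, name, num_samples, seq_length, seed)

prefix = indexmap_prefix(
    "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/"
    "gpt2-preprocessed_content_document",
    "VALID_sas", 2048, 8192, 1234)
doc_idx_file = prefix + "_doc_idx.npy"        # document order for each epoch
sample_idx_file = prefix + "_sample_idx.npy"  # (document, offset) boundary per sample
shuffle_idx_file = prefix + "_shuffle_idx.npy"  # shuffled sample order

Because the three .npy files are built once on rank 0 and then reloaded, the "WARNING: could not find index map files" lines should only appear on the first run for a given combination of sample count, sequence length and seed; later runs with the same settings reuse the cached arrays.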
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: > finished creating indexed dataset in 0.002244 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: number of documents: 3390320 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: VALID_ruby: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: document indices in [3285220, 3386930) total of 101710 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,864 [Rank 0]: > Tokens per epoch: 61345928 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,867 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,867 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,872 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005407 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 101710 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7488 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,876 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003428 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,876 [Rank 0]: > building shuffle index with split [0, 7488) and [7488, 7488) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,879 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002558 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,879 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_ruby_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,888 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_ruby_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,888 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_ruby_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,889 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,889 [Rank 0]: total number of samples: 7489 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,889 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,971 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: > finished creating indexed dataset in 0.002114 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: number of documents: 1380468 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: VALID_rust: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: document indices in [1337673, 1379088) total of 41415 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,974 [Rank 0]: > Tokens per epoch: 81845020 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,976 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,977 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,981 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004022 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 41415 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 9990 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,984 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003029 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,984 [Rank 0]: > building shuffle index with split [0, 9990) and [9990, 9990) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,988 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004048 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,988 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_rust_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,996 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_rust_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,997 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_rust_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,997 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,997 [Rank 0]: total number of samples: 9991 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,997 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,079 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: > finished creating indexed dataset in 0.000731 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: number of documents: 5386 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: VALID_rmarkdown: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,081 [Rank 0]: document indices in [5219, 5381) total of 162 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,081 [Rank 0]: > Tokens per epoch: 626200 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,083 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,083 [Rank 0]: > last epoch number of samples (61) is larger than 80% of number of samples per epoch (76), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,087 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003994 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 162 +[ip-26-0-150-122:0]: number of epochs: 27 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2063 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,091 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003976 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,091 [Rank 0]: > building shuffle index with split [0, 2063) and [2063, 2063) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,094 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002503 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,094 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_rmarkdown_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,099 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_rmarkdown_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,100 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_rmarkdown_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,100 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,100 [Rank 0]: total number of samples: 2064 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,100 [Rank 0]: total number of epochs: 27 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,182 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,184 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,184 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: > finished creating indexed dataset in 0.002401 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: number of documents: 10801285 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: VALID_c-sharp: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: document indices in [10466445, 10790484) total of 324039 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,187 [Rank 0]: > Tokens per epoch: 318261515 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,190 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,190 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,204 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.013754 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 324039 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 38850 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,208 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004621 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,209 [Rank 0]: > building shuffle index with split [0, 38850) and [38850, 38850) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,213 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004475 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,213 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_c-sharp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,223 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_c-sharp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,224 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_c-sharp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,224 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,224 [Rank 0]: total number of samples: 38851 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,224 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,307 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: > finished creating indexed dataset in 0.002165 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: number of documents: 587748 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,310 [Rank 0]: VALID_smalltalk: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,310 [Rank 0]: document indices in [569528, 587160) total of 17632 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,310 [Rank 0]: > Tokens per epoch: 6393705 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,313 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,313 [Rank 0]: > last epoch number of samples (488) is smaller than 80% of number of samples per epoch (780), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:01,318 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005171 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17632 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2341 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,324 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005542 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,324 [Rank 0]: > building shuffle index with split [0, 1560) and [1560, 2341) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,328 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004037 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,329 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_smalltalk_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,334 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_smalltalk_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,335 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_smalltalk_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,335 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,335 [Rank 0]: total number of samples: 2342 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,335 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,418 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: > finished creating indexed dataset in 0.002215 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: number of documents: 541454 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: VALID_haskell: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: document indices in [524669, 540913) total of 16244 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,421 [Rank 0]: > Tokens per epoch: 19105324 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,423 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,423 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,427 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004004 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16244 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2332 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,431 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003378 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,431 [Rank 0]: > building shuffle index with split [0, 2332) and [2332, 2332) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,434 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002811 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,436 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_haskell_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,440 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_haskell_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,441 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_haskell_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,442 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,442 [Rank 0]: total number of samples: 2333 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,442 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: > finished creating indexed dataset in 0.000686 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: number of documents: 1152 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: VALID_maple: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: document indices in [1116, 1151) total of 35 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: > Tokens per epoch: 30587 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,529 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,529 [Rank 0]: > last epoch number of samples (2) is larger than 80% of number of samples per epoch (3), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,532 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003190 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 35 +[ip-26-0-150-122:0]: number of epochs: 549 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,535 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002869 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,536 [Rank 0]: > building shuffle index with split [0, 2049) and [2049, 2049) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,537 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001853 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,540 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_maple_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,545 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_maple_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,546 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_maple_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,548 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,548 [Rank 0]: total number of samples: 2050 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,548 [Rank 0]: total number of epochs: 549 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,631 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: creating memory view of numpy buffer... 
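The very large epoch counts on the smallest splits are the same arithmetic applied to tiny corpora: VALID_maple above holds only 30,587 tokens per epoch, and filling 2048 samples of 8,192 tokens takes roughly 2048 x 8192, about 16.8M tokens, so about 16.8M / 30,587, about 549 passes over the split, matching the logged "number of epochs: 549"; VALID_literate-coffeescript further down needs 426 passes for the same reason.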
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: > finished creating indexed dataset in 0.000886 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: number of documents: 22653 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: VALID_mathematica: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: document indices in [21951, 22630) total of 679 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: > Tokens per epoch: 16838913 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,635 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,635 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,637 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002029 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 679 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,639 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001919 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,639 [Rank 0]: > building shuffle index with split [0, 2055) and [2055, 2055) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,641 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001841 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,644 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_mathematica_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,648 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_mathematica_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,652 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_mathematica_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,654 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,654 [Rank 0]: total number of samples: 2056 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,654 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,737 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,738 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: > finished creating indexed dataset in 0.001762 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: number of documents: 158356 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: VALID_ocaml: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: document indices in [153447, 158198) total of 4751 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: > Tokens per epoch: 9867998 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,742 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,742 [Rank 0]: > last epoch number of samples (844) is smaller than 80% of number of samples per epoch (1204), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:01,745 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002709 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4751 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2409 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,748 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003420 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,749 [Rank 0]: > building shuffle index with split [0, 1204) and [1204, 2409) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,752 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003230 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,793 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_ocaml_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,798 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_ocaml_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,800 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_ocaml_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,802 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,802 [Rank 0]: total number of samples: 2410 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,802 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,885 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: > finished creating indexed dataset in 0.002210 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: number of documents: 657349 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,888 [Rank 0]: VALID_makefile: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,888 [Rank 0]: document indices in [636971, 656692) total of 19721 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,888 [Rank 0]: > Tokens per epoch: 14806733 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,890 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,890 [Rank 0]: > last epoch number of samples (241) is smaller than 80% of number of samples per epoch (1807), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:01,894 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003898 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 19721 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3614 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,897 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002912 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,897 [Rank 0]: > building shuffle index with split [0, 1807) and [1807, 3614) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,899 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002130 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,945 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_makefile_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,953 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_makefile_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,953 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_makefile_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,954 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,954 [Rank 0]: total number of samples: 3615 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,954 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,037 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,038 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,038 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,038 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,038 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: > finished creating indexed dataset in 0.001940 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: number of documents: 549459 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: VALID_lua: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: document indices in [532426, 548910) total of 16484 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: > Tokens per epoch: 29891276 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,042 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,042 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,046 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003458 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16484 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3648 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,049 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002739 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,049 [Rank 0]: > building shuffle index with split [0, 3648) and [3648, 3648) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,052 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002691 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,052 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_lua_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,059 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_lua_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,065 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_lua_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,068 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,068 [Rank 0]: total number of samples: 3649 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,068 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,151 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: > finished creating indexed dataset in 0.000729 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: number of documents: 1133 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: VALID_literate-coffeescript: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: document indices in [1098, 1132) total of 34 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: > Tokens per epoch: 39416 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,155 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,155 [Rank 0]: > last epoch number of samples (4) is larger than 80% of number of samples per epoch (4), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,158 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002805 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 34 +[ip-26-0-150-122:0]: number of epochs: 426 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,162 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003502 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,162 [Rank 0]: > building shuffle index with split [0, 2049) and [2049, 2049) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,165 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002926 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,165 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_literate-coffeescript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,172 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_literate-coffeescript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,172 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_literate-coffeescript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,172 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,173 [Rank 0]: total number of samples: 2050 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,173 [Rank 0]: total number of epochs: 426 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: > finished creating indexed dataset in 0.000713 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: number of documents: 6104 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: VALID_literate-haskell: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: document indices in [5915, 6098) total of 183 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: > Tokens per epoch: 518557 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,259 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,260 [Rank 0]: > last epoch number of samples (23) is smaller than 80% of number of samples per epoch (63), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:02,262 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002487 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 183 +[ip-26-0-150-122:0]: number of epochs: 33 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2088 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,265 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002833 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,265 [Rank 0]: > building shuffle index with split [0, 2025) and [2025, 2088) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,268 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002635 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,271 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_literate-haskell_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,277 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_literate-haskell_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,282 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_literate-haskell_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,283 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,283 [Rank 0]: total number of samples: 2089 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,283 [Rank 0]: total number of epochs: 33 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,366 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,367 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,367 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,367 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: > finished creating indexed dataset in 0.002059 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: number of documents: 896880 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: VALID_restructuredtext: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: document indices in [869077, 895983) total of 26906 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: > Tokens per epoch: 31882370 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,371 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,371 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,374 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003130 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 26906 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3891 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,378 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003741 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,378 [Rank 0]: > building shuffle index with split [0, 3891) and [3891, 3891) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,380 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001913 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,380 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_restructuredtext_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,384 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_restructuredtext_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,385 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_restructuredtext_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,387 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,387 [Rank 0]: total number of samples: 3892 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,387 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,470 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: > finished creating indexed dataset in 0.000714 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: number of documents: 3688 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: VALID_racket: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: document indices in [3574, 3684) total of 110 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: > Tokens per epoch: 233387 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,474 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,474 [Rank 0]: > last epoch number of samples (26) is larger than 80% of number of samples per epoch (28), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,477 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003119 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 110 +[ip-26-0-150-122:0]: number of epochs: 72 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,481 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003143 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,481 [Rank 0]: > building shuffle index with split [0, 2051) and [2051, 2051) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,484 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002894 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,485 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_racket_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,492 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_racket_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,492 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_racket_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,493 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,493 [Rank 0]: total number of samples: 2052 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,493 [Rank 0]: total number of epochs: 72 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,576 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,576 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,576 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,576 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: > finished creating indexed dataset in 0.000879 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: number of documents: 19630 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: VALID_standard-ml: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: document indices in [19021, 19610) total of 589 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: > Tokens per epoch: 2060914 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,580 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,580 [Rank 0]: > last epoch number of samples (36) is smaller than 80% of number of samples per epoch (251), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:02,583 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002579 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 589 +[ip-26-0-150-122:0]: number of epochs: 9 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2264 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,585 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002657 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,586 [Rank 0]: > building shuffle index with split [0, 2012) and [2012, 2264) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,588 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002530 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,589 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_standard-ml_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,595 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_standard-ml_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,595 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_standard-ml_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,596 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,596 [Rank 0]: total number of samples: 2265 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,596 [Rank 0]: total number of epochs: 9 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,679 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: > finished creating indexed dataset in 0.001124 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: number of documents: 46270 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: VALID_systemverilog: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: document indices in [44836, 46224) total of 1388 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: > Tokens per epoch: 4206961 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,682 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,682 [Rank 0]: > last epoch number of samples (508) is larger than 80% of number of samples per epoch (513), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,685 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002397 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1388 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,687 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002331 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,688 [Rank 0]: > building shuffle index with split [0, 2054) and [2054, 2054) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,689 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001858 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,693 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_systemverilog_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,698 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_systemverilog_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,698 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_systemverilog_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,699 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,699 [Rank 0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,699 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,781 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,783 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,783 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,783 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,783 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: > finished creating indexed dataset in 0.002254 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: number of documents: 522778 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: VALID_tex: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: document indices in [506572, 522255) total of 15683 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: > Tokens per epoch: 56256264 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,786 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,786 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,789 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002800 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15683 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6867 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,791 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002120 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,791 [Rank 0]: > building shuffle index with split [0, 6867) and [6867, 6867) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,793 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002073 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,798 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_tex_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,802 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_tex_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,803 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_tex_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,803 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,803 [Rank 0]: total number of samples: 6868 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,803 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: > finished creating indexed dataset in 0.000829 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: number of documents: 10289 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: VALID_awk: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: document indices in [9970, 10279) total of 309 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: > Tokens per epoch: 224077 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,889 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,889 [Rank 0]: > last epoch number of samples (24) is larger than 80% of number of samples per epoch (27), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,892 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002709 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 309 +[ip-26-0-150-122:0]: number of epochs: 75 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,894 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002155 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,895 [Rank 0]: > building shuffle index with split [0, 2051) and [2051, 2051) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,898 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003301 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,901 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_awk_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,908 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_awk_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,908 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_awk_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,909 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,909 [Rank 0]: total number of samples: 2052 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,909 [Rank 0]: total number of epochs: 75 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,991 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: > finished creating indexed dataset in 0.002246 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: number of documents: 247919 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: VALID_assembly: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: document indices in [240234, 247671) total of 7437 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,994 [Rank 0]: > Tokens per epoch: 23244839 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,996 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,996 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,999 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002417 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7437 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2837 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,002 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003104 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,002 [Rank 0]: > building shuffle index with split [0, 2837) and [2837, 2837) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,005 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003012 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,008 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_assembly_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,013 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_assembly_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,013 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_assembly_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,013 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,014 [Rank 0]: total number of samples: 2838 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,014 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,096 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,096 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,096 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: > finished creating indexed dataset in 0.000723 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: number of documents: 5368 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: VALID_alloy: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: document indices in [5202, 5363) total of 161 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: > Tokens per epoch: 60505 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,100 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,100 [Rank 0]: > last epoch number of samples (3) is smaller than 80% of number of samples per epoch (7), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,104 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003553 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 161 +[ip-26-0-150-122:0]: number of epochs: 278 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2053 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,106 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002137 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,106 [Rank 0]: > building shuffle index with split [0, 2045) and [2045, 2053) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,109 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003112 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,116 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_alloy_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,123 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_alloy_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,124 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_alloy_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,124 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,124 [Rank 0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,124 [Rank 0]: total number of epochs: 278 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,207 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,207 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: > finished creating indexed dataset in 0.000798 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: number of documents: 17554 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: VALID_agda: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: document indices in [17010, 17536) total of 526 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: > Tokens per epoch: 791611 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,210 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,210 [Rank 0]: > last epoch number of samples (19) is smaller than 80% of number of samples per epoch (96), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,214 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003379 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 526 +[ip-26-0-150-122:0]: number of epochs: 22 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2125 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,217 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003273 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,217 [Rank 0]: > building shuffle index with split [0, 2029) and [2029, 2125) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,221 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003384 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,221 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_agda_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,227 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_agda_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,228 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_agda_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,231 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,231 [Rank 0]: total number of samples: 2126 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,231 [Rank 0]: total number of epochs: 22 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,313 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: > finished creating indexed dataset in 0.001096 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: number of documents: 52838 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,315 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,315 [Rank 0]: VALID_emacs-lisp: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,315 [Rank 0]: document indices in [51200, 52785) total of 1585 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,315 [Rank 0]: > Tokens per epoch: 3599819 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,317 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,317 [Rank 0]: > last epoch number of samples (291) is smaller than 80% of number of samples per epoch (439), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,320 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002774 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1585 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2197 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,323 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002762 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,323 [Rank 0]: > building shuffle index with split [0, 1757) and [1757, 2197) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,326 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002636 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,326 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_emacs-lisp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,331 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_emacs-lisp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,331 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_emacs-lisp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,332 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,332 [Rank 0]: total number of samples: 2198 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,332 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,414 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: > finished creating indexed dataset in 0.002377 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: number of documents: 928415 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: VALID_dart: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: document indices in [899634, 927487) total of 27853 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,418 [Rank 0]: > Tokens per epoch: 27319085 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,420 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,420 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:03,423 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003215 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 27853 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3334 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,426 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002939 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,427 [Rank 0]: > building shuffle index with split [0, 3334) and [3334, 3334) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,429 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002031 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,433 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_dart_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,438 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_dart_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,439 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_dart_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,440 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,440 [Rank 0]: total number of samples: 3335 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,440 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,523 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: > finished creating indexed dataset in 0.001148 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: number of documents: 58151 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: VALID_cuda: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: document indices in [56348, 58093) total of 1745 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: > Tokens per epoch: 5481832 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,527 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,527 [Rank 0]: > last epoch number of samples (41) is smaller than 80% of number of samples per epoch (669), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,530 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002944 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1745 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2676 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,533 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002695 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,533 [Rank 0]: > building shuffle index with split [0, 2007) and [2007, 2676) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,535 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002061 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,537 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_cuda_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,545 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_cuda_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,545 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_cuda_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,545 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,546 [Rank 0]: total number of samples: 2677 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,546 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,628 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,628 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,628 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: > finished creating indexed dataset in 0.000731 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: number of documents: 5928 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: VALID_bluespec: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: document indices in [5744, 5922) total of 178 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: > Tokens per epoch: 389178 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,631 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,631 [Rank 0]: > last epoch number of samples (6) is smaller than 80% of number of samples per epoch (47), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,634 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002778 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 178 +[ip-26-0-150-122:0]: number of epochs: 44 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2090 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,636 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002164 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,636 [Rank 0]: > building shuffle index with split [0, 2042) and [2042, 2090) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,639 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003067 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,642 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_bluespec_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,646 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_bluespec_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,647 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_bluespec_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,647 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,647 [Rank 0]: total number of samples: 2091 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,647 [Rank 0]: total number of epochs: 44 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,730 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,730 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,730 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,730 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: > finished creating indexed dataset in 0.000697 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: number of documents: 180 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: VALID_augeas: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: document indices in [174, 180) total of 6 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: > Tokens per epoch: 7815 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,735 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,735 [Rank 0]: > last epoch number of samples (1) is larger than 80% of number of samples per epoch (0), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:03,738 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003106 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6 +[ip-26-0-150-122:0]: number of epochs: 2147 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,742 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003817 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,742 [Rank 0]: > building shuffle index with split [0, 2048) and [2048, 2048) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,744 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001868 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,745 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_augeas_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,752 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_augeas_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,756 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_augeas_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,757 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,757 [Rank 0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,757 [Rank 0]: total number of epochs: 2147 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,840 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: > finished creating indexed dataset in 0.002311 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: number of documents: 239568 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: VALID_batchfile: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,843 [Rank 0]: document indices in [232141, 239328) total of 7187 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,843 [Rank 0]: > Tokens per epoch: 3729565 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,845 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,846 [Rank 0]: > last epoch number of samples (227) is smaller than 80% of number of samples per epoch (455), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,849 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003623 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7187 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2276 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,852 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003036 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,853 [Rank 0]: > building shuffle index with split [0, 1821) and [1821, 2276) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,855 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002944 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,856 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_batchfile_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,863 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_batchfile_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,868 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_batchfile_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,868 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,868 [Rank 0]: total number of samples: 2277 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,868 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,951 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: > finished creating indexed dataset in 0.000754 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: number of documents: 4806 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: VALID_tcsh: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: document indices in [4657, 4801) total of 144 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: > Tokens per epoch: 118601 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,955 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,955 [Rank 0]: > last epoch number of samples (7) is smaller than 80% of number of samples per epoch (14), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,959 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003267 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 144 +[ip-26-0-150-122:0]: number of epochs: 142 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,962 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003060 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,962 [Rank 0]: > building shuffle index with split [0, 2041) and [2041, 2055) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,965 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003306 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,966 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_tcsh_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,973 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_tcsh_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,979 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_tcsh_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,985 [Rank 0]: loaded indexed file in 0.019 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,985 [Rank 0]: total number of samples: 2056 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,985 [Rank 0]: total number of epochs: 142 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,068 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,068 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: > finished creating indexed dataset in 0.000722 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: number of documents: 5429 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: VALID_stan: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: document indices in [5261, 5424) total of 163 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: > Tokens per epoch: 146349 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,071 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,071 [Rank 0]: > last epoch number of samples (12) is smaller than 80% of number of samples per epoch (17), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,075 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003399 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 163 +[ip-26-0-150-122:0]: number of epochs: 115 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,079 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003582 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,079 [Rank 0]: > building shuffle index with split [0, 2036) and [2036, 2054) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,081 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001931 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,081 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_stan_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,086 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_stan_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,086 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_stan_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,088 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,088 [Rank 0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,088 [Rank 0]: total number of epochs: 115 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,171 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,173 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: > finished creating indexed dataset in 0.002231 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: number of documents: 1355788 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: VALID_scala: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: document indices in [1313759, 1354432) total of 40673 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,175 [Rank 0]: > Tokens per epoch: 38836780 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,176 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,177 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:04,180 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003434 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 40673 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4740 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,184 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003736 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,184 [Rank 0]: > building shuffle index with split [0, 4740) and [4740, 4740) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,187 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003390 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,188 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_scala_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,196 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_scala_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,197 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_scala_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,197 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,197 [Rank 0]: total number of samples: 4741 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,197 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,280 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: > finished creating indexed dataset in 0.001043 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: number of documents: 49335 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: VALID_tcl: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: document indices in [47806, 49286) total of 1480 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: > Tokens per epoch: 3611088 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,283 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,284 [Rank 0]: > last epoch number of samples (285) is smaller than 80% of number of samples per epoch (440), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,287 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002987 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1480 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2204 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,290 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003655 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,291 [Rank 0]: > building shuffle index with split [0, 1763) and [1763, 2204) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,294 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003071 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,296 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_tcl_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,301 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_tcl_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,301 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_tcl_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,302 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,302 [Rank 0]: total number of samples: 2205 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,302 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,384 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: > finished creating indexed dataset in 0.000846 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: number of documents: 24208 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: VALID_stata: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: document indices in [23458, 24184) total of 726 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,386 [Rank 0]: > Tokens per epoch: 5577566 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,388 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,388 [Rank 0]: > last epoch number of samples (6) is smaller than 80% of number of samples per epoch (680), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,390 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002375 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 726 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2723 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,394 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003521 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,394 [Rank 0]: > building shuffle index with split [0, 2042) and [2042, 2723) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,398 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003567 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,403 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_stata_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,410 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_stata_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,415 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_stata_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,415 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,415 [Rank 0]: total number of samples: 2724 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,415 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,498 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,498 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: > finished creating indexed dataset in 0.000723 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: number of documents: 4737 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: VALID_applescript: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: document indices in [4590, 4732) total of 142 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: > Tokens per epoch: 63420 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,502 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,502 [Rank 0]: > last epoch number of samples (5) is larger than 80% of number of samples per epoch (7), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:04,506 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003609 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 142 +[ip-26-0-150-122:0]: number of epochs: 265 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,509 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002748 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,509 [Rank 0]: > building shuffle index with split [0, 2051) and [2051, 2051) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,511 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002564 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,512 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_applescript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,519 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_applescript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,519 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_applescript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,520 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,520 [Rank 0]: total number of samples: 2052 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,520 [Rank 0]: total number of epochs: 265 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,603 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: > finished creating indexed dataset in 0.002281 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: number of documents: 2206327 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: VALID_shell: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: document indices in [2137931, 2204121) total of 66190 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,606 [Rank 0]: > Tokens per epoch: 31891052 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,608 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,608 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:04,613 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004554 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 66190 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3892 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,616 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003262 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,616 [Rank 0]: > building shuffle index with split [0, 3892) and [3892, 3892) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,620 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003659 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,620 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_shell_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,626 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_shell_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,626 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_shell_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,627 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,627 [Rank 0]: total number of samples: 3893 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,627 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,709 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: > finished creating indexed dataset in 0.001496 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: number of documents: 125163 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: VALID_clojure: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: document indices in [121283, 125038) total of 3755 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: > Tokens per epoch: 3837021 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,714 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,714 [Rank 0]: > last epoch number of samples (175) is smaller than 80% of number of samples per epoch (468), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,718 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003604 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3755 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2341 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,721 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002620 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,721 [Rank 0]: > building shuffle index with split [0, 1873) and [1873, 2341) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,723 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002258 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,775 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_clojure_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,782 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_clojure_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,783 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_clojure_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,783 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,783 [Rank 0]: total number of samples: 2342 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,783 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,866 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: > finished creating indexed dataset in 0.000991 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: number of documents: 41890 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: VALID_scheme: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: document indices in [40591, 41848) total of 1257 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: > Tokens per epoch: 2017219 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,870 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,870 [Rank 0]: > last epoch number of samples (79) is smaller than 80% of number of samples per epoch (246), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,873 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003488 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1257 +[ip-26-0-150-122:0]: number of epochs: 9 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2216 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,876 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002666 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,876 [Rank 0]: > building shuffle index with split [0, 1969) and [1969, 2216) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,879 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002155 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,927 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_scheme_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,932 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_scheme_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,936 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_scheme_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,939 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,939 [Rank 0]: total number of samples: 2217 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,939 [Rank 0]: total number of epochs: 9 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,022 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: > finished creating indexed dataset in 0.000759 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: number of documents: 7917 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: VALID_antlr: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: document indices in [7672, 7909) total of 237 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: > Tokens per epoch: 1102148 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,026 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,026 [Rank 0]: > last epoch number of samples (30) is smaller than 80% of number of samples per epoch (134), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,030 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003216 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 237 +[ip-26-0-150-122:0]: number of epochs: 16 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2152 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,032 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002051 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,032 [Rank 0]: > building shuffle index with split [0, 2018) and [2018, 2152) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,036 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003610 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,067 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_antlr_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,075 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_antlr_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,075 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_antlr_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,076 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,076 [Rank 0]: total number of samples: 2153 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,076 [Rank 0]: total number of epochs: 16 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,159 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,159 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,159 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,159 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: > finished creating indexed dataset in 0.000803 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: number of documents: 13716 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: VALID_sparql: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: document indices in [13291, 13702) total of 411 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: > Tokens per epoch: 465467 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,162 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,163 [Rank 0]: > last epoch number of samples (3) is smaller than 80% of number of samples per epoch (56), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,166 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003404 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 411 +[ip-26-0-150-122:0]: number of epochs: 37 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2102 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,169 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002594 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,169 [Rank 0]: > building shuffle index with split [0, 2045) and [2045, 2102) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,172 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002611 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,172 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_sparql_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,177 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_sparql_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,220 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_sparql_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,225 [Rank 0]: loaded indexed file in 0.053 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,225 [Rank 0]: total number of samples: 2103 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,225 [Rank 0]: total number of epochs: 37 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,300 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: > finished creating indexed dataset in 0.001526 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: number of documents: 975420 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: VALID_sql: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: document indices in [945182, 974445) total of 29263 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: > Tokens per epoch: 164859090 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,305 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,305 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:05,308 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003180 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 29263 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 20124 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,311 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002947 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,311 [Rank 0]: > building shuffle index with split [0, 20124) and [20124, 20124) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,315 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003353 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,315 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_sql_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,322 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_sql_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,323 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_sql_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,323 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,323 [Rank 0]: total number of samples: 20125 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,323 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,405 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: creating memory view of numpy buffer... 
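Note: VALID_sql is the opposite case: one pass over the split already contains more full 8192-token sequences than the 2048 requested, so a single epoch suffices and no separate last-epoch shuffling is needed. A quick check of the numbers reported above, using the same simplified arithmetic as before:

```python
# VALID_sql numbers from the log.
tokens_per_epoch, seq_length, requested = 164_859_090, 8192, 2048

samples_in_one_pass = (tokens_per_epoch - 1) // seq_length
print(samples_in_one_pass)               # 20124 -> matches "total number of samples: 20124"
print(samples_in_one_pass >= requested)  # True  -> "only one epoch required"
```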
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: > finished creating indexed dataset in 0.001858 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: number of documents: 167701 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: VALID_glsl: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: document indices in [162502, 167533) total of 5031 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: > Tokens per epoch: 5272081 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,410 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,410 [Rank 0]: > last epoch number of samples (118) is smaller than 80% of number of samples per epoch (643), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,413 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003568 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5031 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2574 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,418 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004110 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,418 [Rank 0]: > building shuffle index with split [0, 1930) and [1930, 2574) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,423 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004795 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,423 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_glsl_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,428 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_glsl_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,429 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_glsl_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,429 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,429 [Rank 0]: total number of samples: 2575 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,429 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,512 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: > finished creating indexed dataset in 0.001139 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: number of documents: 62033 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: VALID_elm: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: document indices in [60110, 61971) total of 1861 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,514 [Rank 0]: > Tokens per epoch: 2205938 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,516 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,516 [Rank 0]: > last epoch number of samples (164) is smaller than 80% of number of samples per epoch (269), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,519 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003202 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1861 +[ip-26-0-150-122:0]: number of epochs: 8 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2154 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,523 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003124 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,523 [Rank 0]: > building shuffle index with split [0, 1884) and [1884, 2154) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,525 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002546 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,529 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_elm_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,537 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_elm_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,537 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_elm_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,538 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,538 [Rank 0]: total number of samples: 2155 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,538 [Rank 0]: total number of epochs: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,620 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: > finished creating indexed dataset in 0.001971 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: number of documents: 571506 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: VALID_dockerfile: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: document indices in [553789, 570934) total of 17145 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: > Tokens per epoch: 4375164 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,625 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,625 [Rank 0]: > last epoch number of samples (446) is larger than 80% of number of samples per epoch (534), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:05,630 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004376 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17145 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2136 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,632 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002386 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,633 [Rank 0]: > building shuffle index with split [0, 2136) and [2136, 2136) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,636 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003057 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,636 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_dockerfile_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,641 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_dockerfile_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,642 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_dockerfile_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,642 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,642 [Rank 0]: total number of samples: 2137 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,642 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,726 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: creating memory view of numpy buffer... 
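Note: VALID_dockerfile shows the third variant of the heuristic: four passes are needed, but the final pass still contributes 446 of the 534 samples a full pass would give, which is above the 80% threshold, so separate_last_epoch stays False and the shuffle index covers a single range [0, 2136). Re-deriving those figures with the same simplified sketch as above:

```python
# VALID_dockerfile numbers from the log.
tokens_per_epoch, seq_length, requested = 4_375_164, 8192, 2048

epochs = -(-(requested * seq_length + 1) // tokens_per_epoch)                    # ceil division -> 4
samples_per_epoch = (tokens_per_epoch - 1) // seq_length                         # 534
samples_from_first_epochs = ((epochs - 1) * tokens_per_epoch - 1) // seq_length  # 1602
last_epoch_samples = requested - samples_from_first_epochs                       # 446
print(last_epoch_samples < int(0.80 * samples_per_epoch))                        # False -> one shuffle range
```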
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: > finished creating indexed dataset in 0.002331 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: number of documents: 6353527 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: VALID_cpp: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,729 [Rank 0]: document indices in [6156568, 6347173) total of 190605 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,730 [Rank 0]: > Tokens per epoch: 476705041 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,732 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,733 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:05,741 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007958 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 190605 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 58191 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,745 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004300 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,745 [Rank 0]: > building shuffle index with split [0, 58191) and [58191, 58191) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,750 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005185 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,751 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_cpp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,759 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_cpp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,760 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_cpp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,760 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,761 [Rank 0]: total number of samples: 58192 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,761 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,844 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,845 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,845 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: > finished creating indexed dataset in 0.001933 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: number of documents: 226209 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: VALID_coffeescript: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: document indices in [219197, 225983) total of 6786 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: > Tokens per epoch: 5560129 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,849 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,849 [Rank 0]: > last epoch number of samples (12) is smaller than 80% of number of samples per epoch (678), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,853 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003625 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6786 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2714 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,856 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002911 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,856 [Rank 0]: > building shuffle index with split [0, 2036) and [2036, 2714) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,858 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002097 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,861 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_coffeescript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,870 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_coffeescript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,871 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_coffeescript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,871 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,871 [Rank 0]: total number of samples: 2715 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,871 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,954 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,955 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,955 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,955 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: > finished creating indexed dataset in 0.001300 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: number of documents: 98733 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: VALID_common-lisp: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: document indices in [95672, 98634) total of 2962 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: > Tokens per epoch: 16829467 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,958 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,958 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:05,960 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002123 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2962 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,962 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001976 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,963 [Rank 0]: > building shuffle index with split [0, 2054) and [2054, 2054) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,965 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002257 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,014 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_common-lisp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,019 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_common-lisp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,019 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_common-lisp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,021 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,021 [Rank 0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,021 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,104 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,106 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,106 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: > finished creating indexed dataset in 0.002294 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: number of documents: 281016 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: VALID_elixir: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: document indices in [272305, 280735) total of 8430 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: > Tokens per epoch: 7046176 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,110 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,110 [Rank 0]: > last epoch number of samples (328) is smaller than 80% of number of samples per epoch (860), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:06,114 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003461 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8430 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2580 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,117 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003002 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,117 [Rank 0]: > building shuffle index with split [0, 1720) and [1720, 2580) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,119 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002223 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,120 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_elixir_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,125 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_elixir_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,126 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_elixir_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,126 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,126 [Rank 0]: total number of samples: 2581 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,126 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,210 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,211 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: > finished creating indexed dataset in 0.002243 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: number of documents: 250834 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: VALID_groovy: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: document indices in [243058, 250583) total of 7525 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: > Tokens per epoch: 7066083 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,215 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,215 [Rank 0]: > last epoch number of samples (323) is smaller than 80% of number of samples per epoch (862), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:06,219 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003078 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7525 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2587 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,222 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002899 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,222 [Rank 0]: > building shuffle index with split [0, 1725) and [1725, 2587) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,225 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003019 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,227 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_groovy_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,232 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_groovy_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,232 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_groovy_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,232 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,233 [Rank 0]: total number of samples: 2588 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,233 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,315 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,317 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,317 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,317 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,317 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: > finished creating indexed dataset in 0.002011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: number of documents: 3299965 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: VALID_html: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: document indices in [3197666, 3296665) total of 98999 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,319 [Rank 0]: > Tokens per epoch: 293479485 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,322 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,322 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,328 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005949 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 98999 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 35825 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,331 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003183 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,331 [Rank 0]: > building shuffle index with split [0, 35825) and [35825, 35825) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,335 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003513 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,380 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_html_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,390 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_html_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,390 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_html_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,391 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,391 [Rank 0]: total number of samples: 35826 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,391 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,474 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,476 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,476 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,476 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,476 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: creating memory view of numpy buffer... 
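Note: each split caches its three index maps as .npy files whose names encode the request, e.g. ..._VALID_html_indexmap_2048ns_8192sl_1234s_doc_idx.npy means 2048 samples, sequence length 8192, shuffle seed 1234. Below is a small helper reconstructing that naming scheme from the paths observed in this log; it is an inferred convention, not code taken from the training repository:

```python
def index_map_paths(data_prefix: str, split_name: str, num_samples: int,
                    seq_length: int, seed: int) -> dict:
    # <data_prefix>_<split>_indexmap_<N>ns_<L>sl_<seed>s_{doc,sample,shuffle}_idx.npy
    stem = f"{data_prefix}_{split_name}_indexmap_{num_samples}ns_{seq_length}sl_{seed}s"
    return {kind: f"{stem}_{kind}_idx.npy" for kind in ("doc", "sample", "shuffle")}


paths = index_map_paths(
    "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document",
    "VALID_html", 2048, 8192, 1234)
print(paths["doc"])  # matches the doc-idx path loaded in the VALID_html block above
```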
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: > finished creating indexed dataset in 0.002271 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: number of documents: 20071773 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: VALID_java: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: document indices in [19449548, 20051701) total of 602153 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,481 [Rank 0]: > Tokens per epoch: 679829501 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,483 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,483 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,508 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.024745 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 602153 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 82986 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,514 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005701 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,514 [Rank 0]: > building shuffle index with split [0, 82986) and [82986, 82986) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,518 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004076 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,543 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_java_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,555 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_java_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,559 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_java_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,559 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,559 [Rank 0]: total number of samples: 82987 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,559 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,642 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,644 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: > finished creating indexed dataset in 0.002342 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: number of documents: 19544285 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: VALID_javascript: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: document indices in [18938412, 19524741) total of 586329 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,649 [Rank 0]: > Tokens per epoch: 565628573 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,652 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,652 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,674 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.021904 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 586329 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 69046 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,680 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005604 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,680 [Rank 0]: > building shuffle index with split [0, 69046) and [69046, 69046) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,683 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003501 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,710 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_javascript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,722 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_javascript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,722 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_javascript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,723 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,723 [Rank 0]: total number of samples: 69047 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,723 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,805 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: > finished creating indexed dataset in 0.002514 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: number of documents: 21029287 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: VALID_markdown: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: document indices in [20377379, 21008258) total of 630879 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,812 [Rank 0]: > Tokens per epoch: 765105610 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,815 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,815 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,838 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.022965 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 630879 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 93396 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,845 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006653 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,845 [Rank 0]: > building shuffle index with split [0, 93396) and [93396, 93396) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,850 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005421 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,870 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_markdown_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,883 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_markdown_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,883 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_markdown_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,884 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,884 [Rank 0]: total number of samples: 93397 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,884 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,967 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: > finished creating indexed dataset in 0.002205 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: number of documents: 15683017 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,970 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,970 [Rank 0]: VALID_php: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,970 [Rank 0]: document indices in [15196843, 15667334) total of 470491 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,973 [Rank 0]: > Tokens per epoch: 512566580 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,976 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,976 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,994 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.017977 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 470491 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 62569 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,998 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004299 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,998 [Rank 0]: > building shuffle index with split [0, 62569) and [62569, 62569) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,002 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004163 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,006 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_php_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,018 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_php_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,019 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_php_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,020 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,020 [Rank 0]: total number of samples: 62570 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,020 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,103 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: > finished creating indexed dataset in 0.002205 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: number of documents: 12866649 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: VALID_python: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: document indices in [12467783, 12853782) total of 385999 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,108 [Rank 0]: > Tokens per epoch: 529606827 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,111 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,111 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,126 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.015276 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 385999 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 64649 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,131 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004516 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,131 [Rank 0]: > building shuffle index with split [0, 64649) and [64649, 64649) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,134 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003384 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,135 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_python_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,145 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_python_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,146 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_python_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,146 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,146 [Rank 0]: total number of samples: 64650 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,146 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,230 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: > finished creating indexed dataset in 0.002286 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: number of documents: 10547331 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: VALID_typescript: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: document indices in [10220364, 10536784) total of 316420 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,234 [Rank 0]: > Tokens per epoch: 222078157 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,237 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,237 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,250 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.012563 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 316420 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 27109 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,254 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003621 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,254 [Rank 0]: > building shuffle index with split [0, 27109) and [27109, 27109) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,258 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003985 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,258 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_typescript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,268 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_typescript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,273 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_typescript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,278 [Rank 0]: loaded indexed file in 0.020 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,278 [Rank 0]: total number of samples: 27110 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,278 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,361 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: > finished creating indexed dataset in 0.000719 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: number of documents: 75 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: VALID_verilog: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: document indices in [73, 75) total of 2 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,363 [Rank 0]: > Tokens per epoch: 5184 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,365 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,366 [Rank 0]: > last epoch number of samples (1) is larger than 80% of number of samples per epoch (0), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,368 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002391 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2 +[ip-26-0-150-122:0]: number of epochs: 3237 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,370 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001837 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,370 [Rank 0]: > building shuffle index with split [0, 2048) and [2048, 2048) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,373 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002986 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,373 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_verilog_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,378 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_verilog_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,378 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_verilog_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,379 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,379 [Rank 0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,379 [Rank 0]: total number of epochs: 3237 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,462 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,463 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: creating memory view of numpy buffer... 
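Note: VALID_verilog is the degenerate end of the spectrum: the split holds only 2 documents (5,184 tokens), so a single pass yields zero full 8192-token samples and 3,237 passes are concatenated before 2048 samples exist; since 80% of 0 is 0, the last epoch is never treated separately. The same simplified arithmetic applied to these numbers:

```python
# VALID_verilog numbers from the log.
tokens_per_epoch, seq_length, requested = 5_184, 8192, 2048

epochs = -(-(requested * seq_length + 1) // tokens_per_epoch)  # ceil -> 3237
samples_per_epoch = (tokens_per_epoch - 1) // seq_length       # 0 full samples per pass
print(epochs, samples_per_epoch)                               # 3237 0
```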
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: > finished creating indexed dataset in 0.001833 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: number of documents: 161239 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: VALID_visual-basic: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: document indices in [156241, 161078) total of 4837 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: > Tokens per epoch: 11401469 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,467 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,467 [Rank 0]: > last epoch number of samples (657) is smaller than 80% of number of samples per epoch (1391), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:07,471 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002988 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4837 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2783 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,473 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002198 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,473 [Rank 0]: > building shuffle index with split [0, 1391) and [1391, 2783) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,476 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002656 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,526 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_visual-basic_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,530 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_visual-basic_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,531 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_visual-basic_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,531 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,532 [Rank 0]: total number of samples: 2784 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,532 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,615 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: > finished creating indexed dataset in 0.001218 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: number of documents: 58208 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: VALID_vhdl: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: document indices in [56404, 58150) total of 1746 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,617 [Rank 0]: > Tokens per epoch: 12008501 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,619 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,619 [Rank 0]: > last epoch number of samples (583) is smaller than 80% of number of samples per epoch (1465), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:07,622 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002933 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1746 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2931 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,625 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002862 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,625 [Rank 0]: > building shuffle index with split [0, 1465) and [1465, 2931) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,627 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001572 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,632 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_vhdl_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,638 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_vhdl_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,639 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_vhdl_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,639 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,639 [Rank 0]: total number of samples: 2932 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,639 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,722 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: creating memory view of numpy buffer... 
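The "larger/smaller than 80% of number of samples per epoch" messages decide whether the final epoch's samples are shuffled separately from the full epochs, presumably so that a small tail is not over-represented. The sketch below reproduces the logged decisions; the 80% threshold and the quantities are read off the log messages, while the function itself is my reconstruction:

    def separate_last_epoch(tokens_per_epoch: int, num_epochs: int,
                            num_samples: int, seq_len: int = 8192) -> bool:
        if num_epochs == 1:
            return False  # "only one epoch required" in the log
        samples_per_epoch = (tokens_per_epoch - 1) // seq_len
        samples_from_full_epochs = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_len
        last_epoch_samples = num_samples - samples_from_full_epochs
        # Shuffle the tail separately only when it is well under a full epoch.
        return last_epoch_samples < 0.80 * samples_per_epoch

    # VALID_visual-basic above: 657 < 80% of 1391 -> True;  VALID_vhdl above: 583 < 80% of 1465 -> True.
    assert separate_last_epoch(11_401_469, 2, 2_048) is True
    assert separate_last_epoch(12_008_501, 2, 2_048) is True
    # VALID_yacc further down: 120 >= 80% of 137 -> False.
    assert separate_last_epoch(1_128_407, 15, 2_048) is False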
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: > finished creating indexed dataset in 0.000704 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: number of documents: 4661 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: VALID_thrift: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: document indices in [4517, 4656) total of 139 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,724 [Rank 0]: > Tokens per epoch: 98302 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,726 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,727 [Rank 0]: > last epoch number of samples (9) is larger than 80% of number of samples per epoch (11), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,730 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003812 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 139 +[ip-26-0-150-122:0]: number of epochs: 171 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,733 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002584 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,733 [Rank 0]: > building shuffle index with split [0, 2051) and [2051, 2051) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,735 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002088 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,784 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_thrift_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,790 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_thrift_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,790 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_thrift_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,791 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,791 [Rank 0]: total number of samples: 2052 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,791 [Rank 0]: total number of epochs: 171 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: > finished creating indexed dataset in 0.000680 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: number of documents: 93 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: VALID_matlab: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: document indices in [90, 93) total of 3 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: > Tokens per epoch: 4277 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,879 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,880 [Rank 0]: > last epoch number of samples (1) is larger than 80% of number of samples per epoch (0), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,883 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003370 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3 +[ip-26-0-150-122:0]: number of epochs: 3923 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,886 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003336 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,887 [Rank 0]: > building shuffle index with split [0, 2048) and [2048, 2048) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,889 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002569 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,890 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_matlab_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,895 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_matlab_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,895 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_matlab_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,896 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,896 [Rank 0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,896 [Rank 0]: total number of epochs: 3923 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,979 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,979 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: > finished creating indexed dataset in 0.000772 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: number of documents: 7451 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: VALID_yacc: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: document indices in [7220, 7444) total of 224 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: > Tokens per epoch: 1128407 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,982 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,982 [Rank 0]: > last epoch number of samples (120) is larger than 80% of number of samples per epoch (137), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,985 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002760 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 224 +[ip-26-0-150-122:0]: number of epochs: 15 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2066 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,989 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003140 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,989 [Rank 0]: > building shuffle index with split [0, 2066) and [2066, 2066) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,991 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002217 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,995 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_yacc_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,002 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_yacc_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,005 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_yacc_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,006 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,006 [Rank 0]: total number of samples: 2067 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,006 [Rank 0]: total number of epochs: 15 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,089 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: > finished creating indexed dataset in 0.000813 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: number of documents: 15850 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: VALID_zig: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,091 [Rank 0]: document indices in [15359, 15834) total of 475 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,091 [Rank 0]: > Tokens per epoch: 2144189 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,093 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,093 [Rank 0]: > last epoch number of samples (216) is larger than 80% of number of samples per epoch (261), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,095 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002125 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 475 +[ip-26-0-150-122:0]: number of epochs: 8 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2093 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,099 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003680 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,099 [Rank 0]: > building shuffle index with split [0, 2093) and [2093, 2093) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,103 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003473 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,103 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_zig_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,110 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_zig_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,110 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_zig_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,111 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,111 [Rank 0]: total number of samples: 2094 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,111 [Rank 0]: total number of epochs: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,194 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: > finished creating indexed dataset in 0.000978 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: number of documents: 42103 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,196 [Rank 0]: VALID_xslt: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,196 [Rank 0]: document indices in [40798, 42061) total of 1263 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,196 [Rank 0]: > Tokens per epoch: 4166294 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,198 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,199 [Rank 0]: > last epoch number of samples (14) is smaller than 80% of number of samples per epoch (508), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:08,201 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002378 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1263 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2542 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,204 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002669 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,204 [Rank 0]: > building shuffle index with split [0, 2034) and [2034, 2542) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,207 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002728 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,208 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_xslt_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,213 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_xslt_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,213 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_xslt_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,214 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,214 [Rank 0]: total number of samples: 2543 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,214 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,297 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,299 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,299 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,299 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,299 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: creating memory view of numpy buffer... 
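The "building shuffle index with split [a, b) and [b, c)" lines reflect the same decision: when separate_last_epoch is False the second range is empty (e.g. [0, 2066) and [2066, 2066) for yacc), and when it is True the samples from the full epochs and the 508 samples of the final epoch are shuffled as two independent ranges (the [0, 2034) and [2034, 2542) split for VALID_xslt above, of which only 14 last-epoch samples are actually needed to reach 2048). A minimal sketch of such a two-range shuffle, written from the logged ranges rather than copied from the training code (names and the seed default are mine; the cached file names do record seed 1234):

    import numpy as np

    def build_shuffle_idx(first_range_size: int, total_size: int, seed: int = 1234) -> np.ndarray:
        """Shuffle [0, first_range_size) and [first_range_size, total_size) independently."""
        rng = np.random.RandomState(seed)
        first = np.arange(first_range_size, dtype=np.uint32)
        rng.shuffle(first)
        if first_range_size == total_size:  # no separate last epoch
            return first
        last = np.arange(first_range_size, total_size, dtype=np.uint32)
        rng.shuffle(last)
        return np.concatenate((first, last))

    # VALID_xslt: full-epoch samples stay within [0, 2034), last-epoch samples within [2034, 2542).
    shuffle_idx = build_shuffle_idx(2_034, 2_542)
    assert sorted(shuffle_idx[:2_034]) == list(range(2_034))
    assert sorted(shuffle_idx[2_034:]) == list(range(2_034, 2_542))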
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: > finished creating indexed dataset in 0.002198 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: number of documents: 4751547 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: VALID_json: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: document indices in [4604249, 4746795) total of 142546 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,301 [Rank 0]: > Tokens per epoch: 62884447 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,303 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,304 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,311 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007176 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 142546 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7676 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,315 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003801 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,315 [Rank 0]: > building shuffle index with split [0, 7676) and [7676, 7676) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,318 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003295 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,366 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_json_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,375 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_json_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,376 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_json_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,376 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,376 [Rank 0]: total number of samples: 7677 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,376 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,460 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,461 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: > finished creating indexed dataset in 0.002057 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: number of documents: 3995948 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: VALID_yaml: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: document indices in [3872074, 3991952) total of 119878 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,463 [Rank 0]: > Tokens per epoch: 35974762 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,466 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,466 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,472 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005821 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 119878 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4391 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,475 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003133 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,475 [Rank 0]: > building shuffle index with split [0, 4391) and [4391, 4391) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,478 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002653 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,527 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_yaml_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,536 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_yaml_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,536 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_yaml_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,537 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,537 [Rank 0]: total number of samples: 4392 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,537 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,619 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: > finished creating indexed dataset in 0.002074 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: number of documents: 30982955 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: VALID_gh_issues: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: document indices in [30022483, 30951972) total of 929489 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,627 [Rank 0]: > Tokens per epoch: 538755961 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,630 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,630 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,666 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.035955 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 929489 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 65766 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,673 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.007118 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,673 [Rank 0]: > building shuffle index with split [0, 65766) and [65766, 65766) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,678 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004275 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,736 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_gh_issues_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,751 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_gh_issues_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,752 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_gh_issues_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,752 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,752 [Rank 0]: total number of samples: 65767 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,752 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,836 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,837 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: > finished creating indexed dataset in 0.002246 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: number of documents: 7634718 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: VALID_gh_commits: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: document indices in [7398042, 7627083) total of 229041 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,840 [Rank 0]: > Tokens per epoch: 483498380 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,842 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,842 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,852 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.009787 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 229041 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 59020 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,857 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004542 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,857 [Rank 0]: > building shuffle index with split [0, 59020) and [59020, 59020) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,862 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004894 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,862 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_gh_commits_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,874 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_gh_commits_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,875 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_gh_commits_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,875 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,875 [Rank 0]: total number of samples: 59021 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,875 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,959 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,960 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,960 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,960 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: > finished creating indexed dataset in 0.001894 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: number of documents: 914510 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: VALID_notebook_scripts: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: document indices in [886160, 913595) total of 27435 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: > Tokens per epoch: 73709652 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,964 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,964 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,968 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003803 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 27435 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 8997 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,971 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003204 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,971 [Rank 0]: > building shuffle index with split [0, 8997) and [8997, 8997) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,974 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002797 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,977 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_notebook_scripts_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,985 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_notebook_scripts_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,985 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_notebook_scripts_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,986 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,986 [Rank 0]: total number of samples: 8998 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,986 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,067 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,068 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,068 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,068 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,068 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: > finished creating indexed dataset in 0.001880 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: number of documents: 668743 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: VALID_notebook_structured: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: document indices in [648012, 668074) total of 20062 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: > Tokens per epoch: 56156688 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,071 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,072 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,075 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003791 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 20062 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6855 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,078 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002285 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,078 [Rank 0]: > building shuffle index with split [0, 6855) and [6855, 6855) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,080 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002088 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,139 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_notebook_structured_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,146 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_notebook_structured_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,148 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_notebook_structured_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,150 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,150 [Rank 0]: total number of samples: 6856 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,150 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,235 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: > finished creating indexed dataset in 0.001864 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: number of documents: 2721616 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: document indices in [2637246, 2718894) total of 81648 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,238 [Rank 0]: > Tokens per epoch: 142752310 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,241 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,241 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,246 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005119 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 81648 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17425 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,250 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003681 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,250 [Rank 0]: > building shuffle index with split [0, 17425) and [17425, 17425) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,252 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002301 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,253 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,260 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,261 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,261 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,261 [Rank 0]: total number of samples: 17426 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,261 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,345 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,345 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,345 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,345 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: > finished creating indexed dataset in 0.000685 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: number of documents: 968 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: document indices in [938, 967) total of 29 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: > Tokens per epoch: 55028 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,358 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,362 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,364 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,367 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,367 [Rank 0]: total number of samples: 7 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,367 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,450 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,452 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,452 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,452 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,452 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: > finished creating indexed dataset in 0.002246 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: number of documents: 8536791 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: document indices in [8272150, 8528254) total of 256104 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,455 [Rank 0]: > Tokens per epoch: 613576495 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,456 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,456 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,467 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.010761 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 256104 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 74899 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,471 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003652 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,471 [Rank 0]: > building shuffle index with split [0, 74899) and [74899, 74899) ... 
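All of the .npy paths in this log follow one naming pattern: <tokenized-data prefix>_<split name>_indexmap_<requested samples>ns_<sequence length>sl_<seed>s_{doc,sample,shuffle}_idx.npy. The per-language VALID_* splits each request 2048 samples, while the VALID_all_sources_weighted blocks request much smaller counts that presumably reflect each source's weight in the blended validation set (9 for css and 1 for prolog above, 145 for c and 16 for kotlin further down). When files with the matching name already exist, a block skips the "WARNING: could not find index map files" build step and goes straight to loading, as the prolog entry above does. A small helper that reproduces the naming convention as inferred from the logged paths (the function is illustrative, not the training code):

    def index_map_path(prefix: str, split_name: str, num_samples: int,
                       seq_len: int, seed: int, kind: str) -> str:
        """kind is one of "doc", "sample", "shuffle"."""
        return f"{prefix}_{split_name}_indexmap_{num_samples}ns_{seq_len}sl_{seed}s_{kind}_idx.npy"

    # Reconstructs the prolog path loaded above.
    assert index_map_path(
        "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document",
        "VALID_all_sources_weighted", 1, 8192, 1234, "doc",
    ) == ("/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/"
          "gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy")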
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,475 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003861 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,475 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_145ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,487 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_145ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,488 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_145ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,489 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,489 [Rank 0]: total number of samples: 74900 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,489 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,572 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,573 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: > finished creating indexed dataset in 0.001657 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: number of documents: 158792 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: document indices in [153869, 158633) total of 4764 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: > Tokens per epoch: 18815887 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,577 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,577 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,580 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002379 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4764 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2296 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,583 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002757 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,583 [Rank 0]: > building shuffle index with split [0, 2296) and [2296, 2296) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,586 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003100 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,589 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,594 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,594 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,594 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,594 [Rank 0]: total number of samples: 2297 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,595 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,678 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,679 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,679 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: > finished creating indexed dataset in 0.001779 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: number of documents: 153194 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: document indices in [148445, 153041) total of 4596 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: > Tokens per epoch: 8220293 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,682 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,682 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,684 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002250 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4596 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1003 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,688 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003712 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,688 [Rank 0]: > building shuffle index with split [0, 1003) and [1003, 1003) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,691 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002539 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,697 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,701 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,703 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,705 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,705 [Rank 0]: total number of samples: 1004 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,705 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,788 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,790 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,790 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,790 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,790 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: > finished creating indexed dataset in 0.002346 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: number of documents: 2239354 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: document indices in [2169934, 2237115) total of 67181 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,792 [Rank 0]: > Tokens per epoch: 43085225 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,793 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,793 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,798 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004282 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 67181 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 5259 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,801 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002722 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,801 [Rank 0]: > building shuffle index with split [0, 5259) and [5259, 5259) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,804 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003165 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,805 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_16ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,811 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_16ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,811 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_16ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,812 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,812 [Rank 0]: total number of samples: 5260 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,812 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,895 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: > finished creating indexed dataset in 0.000727 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: number of documents: 523 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: document indices in [507, 522) total of 15 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,897 [Rank 0]: > Tokens per epoch: 46791 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,912 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,916 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,919 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,923 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,923 [Rank 0]: total number of samples: 6 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,923 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,007 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: reading document index... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: > finished creating indexed dataset in 0.002274 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: number of documents: 295364 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: document indices in [286208, 295069) total of 8861 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,010 [Rank 0]: > Tokens per epoch: 13589070 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,011 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,012 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,015 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002999 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8861 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1658 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,017 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002250 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,017 [Rank 0]: > building shuffle index with split [0, 1658) and [1658, 1658) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,019 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002191 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,020 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,027 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,032 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,033 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,033 [Rank 0]: total number of samples: 1659 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,033 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,116 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: > finished creating indexed dataset in 0.001998 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: number of documents: 210816 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,119 [Rank 0]: document indices in [204281, 210605) total of 6324 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,119 [Rank 0]: > Tokens per epoch: 8481384 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,122 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,122 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,125 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003184 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6324 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1035 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,127 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002203 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,127 [Rank 0]: > building shuffle index with split [0, 1035) and [1035, 1035) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,130 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002909 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,131 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,136 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,136 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,137 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,137 [Rank 0]: total number of samples: 1036 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,137 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,220 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,220 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,220 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: > finished creating indexed dataset in 0.000698 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: number of documents: 5001 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: document indices in [4846, 4996) total of 150 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: > Tokens per epoch: 1014769 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,222 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,223 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,225 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002061 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 150 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 123 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,227 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002050 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,227 [Rank 0]: > building shuffle index with split [0, 123) and [123, 123) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,229 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002345 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,235 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,239 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,239 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,241 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,242 [Rank 0]: total number of samples: 124 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,242 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,325 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: > finished creating indexed dataset in 0.000789 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: number of documents: 8042 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: document indices in [7793, 8034) total of 241 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: > Tokens per epoch: 225513 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,329 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,329 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,332 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002602 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 241 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 27 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,336 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004295 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,336 [Rank 0]: > building shuffle index with split [0, 27) and [27, 27) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,338 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001874 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,338 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,343 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,343 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,346 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,346 [Rank 0]: total number of samples: 28 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,346 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,430 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,430 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,430 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,430 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: creating memory view of numpy buffer... 
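Each "> building shuffle index with split [0, N) and [N, N) ..." entry above (e.g. [0, 27) and [27, 27) for the 27-sample split) has an empty second range, which matches the preceding "only one epoch required, setting separate_last_epoch to False" message: with a single epoch there is no separately shuffled last epoch. A rough sketch of what such a shuffle index amounts to, assuming a seeded permutation of the sample range rather than quoting the actual implementation:

import numpy as np

def build_shuffle_idx(num_samples, seed=1234):
    # Deterministic permutation of [0, num_samples); sample i of the epoch
    # is then read through shuffle_idx[i].
    rng = np.random.RandomState(seed)
    shuffle_idx = np.arange(num_samples, dtype=np.int64)
    rng.shuffle(shuffle_idx)
    return shuffle_idx

print(build_shuffle_idx(27))  # e.g. the 27-sample split from the entry above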
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: > finished creating indexed dataset in 0.000812 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: number of documents: 16870 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: document indices in [16347, 16853) total of 506 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: > Tokens per epoch: 1042103 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,433 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,433 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,435 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002311 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 506 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 127 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,438 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002538 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,438 [Rank 0]: > building shuffle index with split [0, 127) and [127, 127) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,441 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003080 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,441 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,446 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,446 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,448 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,448 [Rank 0]: total number of samples: 128 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,448 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,532 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: > finished creating indexed dataset in 0.002170 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: number of documents: 267627 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: document indices in [259331, 267359) total of 8028 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,535 [Rank 0]: > Tokens per epoch: 8559847 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,537 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,537 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,539 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002263 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8028 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1044 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,542 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002454 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,542 [Rank 0]: > building shuffle index with split [0, 1044) and [1044, 1044) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,544 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001797 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,544 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,551 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,552 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,552 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,552 [Rank 0]: total number of samples: 1045 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,552 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,636 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,637 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: > finished creating indexed dataset in 0.002060 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: number of documents: 4700526 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: document indices in [4554810, 4695825) total of 141015 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,639 [Rank 0]: > Tokens per epoch: 253353715 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,642 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,642 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,649 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007241 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 141015 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 30926 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,653 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003915 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,653 [Rank 0]: > building shuffle index with split [0, 30926) and [30926, 30926) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,656 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002367 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,656 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_64ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,667 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_64ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,667 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_64ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,668 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,668 [Rank 0]: total number of samples: 30927 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,668 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,751 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,752 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: > finished creating indexed dataset in 0.001324 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: number of documents: 98447 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: document indices in [95395, 98349) total of 2954 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: > Tokens per epoch: 6597590 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,756 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,756 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,759 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002831 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2954 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 805 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,762 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003247 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,762 [Rank 0]: > building shuffle index with split [0, 805) and [805, 805) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,765 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002427 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,765 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,772 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,772 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,773 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,773 [Rank 0]: total number of samples: 806 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,773 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,856 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: > finished creating indexed dataset in 0.001475 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: number of documents: 124066 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: document indices in [120220, 123942) total of 3722 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,859 [Rank 0]: > Tokens per epoch: 4694260 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,861 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,861 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,864 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002899 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3722 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 573 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,868 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003971 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,868 [Rank 0]: > building shuffle index with split [0, 573) and [573, 573) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,871 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003098 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,872 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,879 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,880 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,880 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,880 [Rank 0]: total number of samples: 574 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,880 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,964 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,964 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,964 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: > finished creating indexed dataset in 0.000930 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: number of documents: 30934 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: document indices in [29975, 30903) total of 928 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: > Tokens per epoch: 2230554 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,967 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,967 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,969 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002251 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 928 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 272 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,971 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002059 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,972 [Rank 0]: > building shuffle index with split [0, 272) and [272, 272) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,975 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003396 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,975 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,980 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,980 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,981 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,981 [Rank 0]: total number of samples: 273 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,981 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,065 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: > finished creating indexed dataset in 0.001506 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: number of documents: 110981 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: document indices in [107541, 110870) total of 3329 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,067 [Rank 0]: > Tokens per epoch: 21526929 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,070 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,070 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,072 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002216 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3329 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2627 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,076 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003878 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,076 [Rank 0]: > building shuffle index with split [0, 2627) and [2627, 2627) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,078 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002243 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,079 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,086 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,086 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,087 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,087 [Rank 0]: total number of samples: 2628 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,087 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,170 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: > finished creating indexed dataset in 0.002155 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: number of documents: 365491 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,173 [Rank 0]: document indices in [354161, 365126) total of 10965 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,173 [Rank 0]: > Tokens per epoch: 25729670 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,175 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,175 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,178 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003171 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 10965 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3140 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,181 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002608 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,181 [Rank 0]: > building shuffle index with split [0, 3140) and [3140, 3140) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,183 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002510 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,185 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,192 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,193 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,193 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,193 [Rank 0]: total number of samples: 3141 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,193 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,277 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: > finished creating indexed dataset in 0.001016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: number of documents: 39042 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: document indices in [37832, 39003) total of 1171 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: > Tokens per epoch: 2880088 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,281 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,281 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,285 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003717 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1171 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 351 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,288 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003302 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,288 [Rank 0]: > building shuffle index with split [0, 351) and [351, 351) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,291 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002865 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,291 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,299 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,299 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,300 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,300 [Rank 0]: total number of samples: 352 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,300 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,383 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: > finished creating indexed dataset in 0.001383 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: number of documents: 97167 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: document indices in [94155, 97070) total of 2915 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: > Tokens per epoch: 2614634 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,388 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,388 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,391 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002748 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2915 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 319 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,394 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003080 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,394 [Rank 0]: > building shuffle index with split [0, 319) and [319, 319) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,396 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002230 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,397 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,404 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,405 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,405 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,405 [Rank 0]: total number of samples: 320 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,405 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,489 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,490 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,490 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: > finished creating indexed dataset in 0.001998 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: number of documents: 186375 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: document indices in [180597, 186189) total of 5592 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: > Tokens per epoch: 4338734 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,494 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,494 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,497 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003543 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5592 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 529 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,500 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002967 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,501 [Rank 0]: > building shuffle index with split [0, 529) and [529, 529) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,502 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001862 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,503 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,508 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,510 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,512 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,513 [Rank 0]: total number of samples: 530 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,513 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: > finished creating indexed dataset in 0.000777 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: number of documents: 9226 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: document indices in [8940, 9217) total of 277 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: > Tokens per epoch: 1021218 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,600 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,600 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,603 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002658 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 277 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 124 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,606 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003479 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,606 [Rank 0]: > building shuffle index with split [0, 124) and [124, 124) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,608 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001835 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,611 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,615 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,619 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,623 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,623 [Rank 0]: total number of samples: 125 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,623 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,707 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: > finished creating indexed dataset in 0.002208 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: number of documents: 3390320 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: document indices in [3285220, 3386930) total of 101710 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,710 [Rank 0]: > Tokens per epoch: 61345928 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,712 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,712 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,718 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005851 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 101710 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7488 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,721 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002472 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,721 [Rank 0]: > building shuffle index with split [0, 7488) and [7488, 7488) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,723 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002289 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,724 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_19ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,733 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_19ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,733 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_19ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,734 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,734 [Rank 0]: total number of samples: 7489 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,734 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,817 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: > finished creating indexed dataset in 0.002295 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: number of documents: 1380468 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,820 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,820 [Rank 0]: document indices in [1337673, 1379088) total of 41415 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,820 [Rank 0]: > Tokens per epoch: 81845020 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,823 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,823 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,826 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003589 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 41415 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 9990 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,829 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002887 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,829 [Rank 0]: > building shuffle index with split [0, 9990) and [9990, 9990) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,832 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002872 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,833 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_25ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,838 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_25ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,838 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_25ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,841 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,841 [Rank 0]: total number of samples: 9991 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,841 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,925 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: > finished creating indexed dataset in 0.000769 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: number of documents: 5386 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: document indices in [5219, 5381) total of 162 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: > Tokens per epoch: 626200 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,928 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,928 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,931 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002702 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 162 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 76 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,934 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003063 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,935 [Rank 0]: > building shuffle index with split [0, 76) and [76, 76) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,937 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002100 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,940 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,944 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,945 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,947 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,947 [Rank 0]: total number of samples: 77 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,947 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,031 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: > finished creating indexed dataset in 0.002331 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: number of documents: 10801285 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: document indices in [10466445, 10790484) total of 324039 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,036 [Rank 0]: > Tokens per epoch: 318261515 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,037 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,037 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,051 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.013492 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 324039 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 38850 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,055 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003927 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,055 [Rank 0]: > building shuffle index with split [0, 38850) and [38850, 38850) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,059 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004215 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,060 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_120ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,070 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_120ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,071 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_120ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,071 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,071 [Rank 0]: total number of samples: 38851 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,071 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,155 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: creating memory view of numpy buffer... 
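[editor's note] The contiguous document ranges printed for each VALID split (for c-sharp above, [10466445, 10790484) out of 10801285 documents) are consistent with fractional train/valid/test weights of roughly 96.9/3/0.1. The sketch below is written in the spirit of Megatron's split helper but is not copied from it, and the 0.969/0.03/0.001 split string is an assumption (the actual --split value is not visible in this part of the log):

# Hedged sketch: derive contiguous split ranges from assumed fractional weights.
def split_ranges(num_docs, weights):
    total = sum(weights)
    bounds = [0]
    for w in weights:
        bounds.append(bounds[-1] + int(round(w / total * num_docs)))
    bounds[-1] = num_docs  # absorb rounding drift in the last boundary
    return list(zip(bounds[:-1], bounds[1:]))

train, valid, test = split_ranges(10_801_285, [0.969, 0.03, 0.001])
print(valid)  # -> (10466445, 10790484), matching the c-sharp VALID range above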
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: > finished creating indexed dataset in 0.001766 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: number of documents: 587748 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: document indices in [569528, 587160) total of 17632 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,158 [Rank 0]: > Tokens per epoch: 6393705 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,160 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,160 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,163 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003214 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17632 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 780 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,167 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003279 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,167 [Rank 0]: > building shuffle index with split [0, 780) and [780, 780) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,169 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002647 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,173 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,178 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,178 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,178 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,179 [Rank 0]: total number of samples: 781 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,179 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,263 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: > finished creating indexed dataset in 0.002192 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: number of documents: 541454 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: document indices in [524669, 540913) total of 16244 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,266 [Rank 0]: > Tokens per epoch: 19105324 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,266 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,267 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,270 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003258 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16244 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2332 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,273 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002468 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,273 [Rank 0]: > building shuffle index with split [0, 2332) and [2332, 2332) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,275 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002231 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,278 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,283 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,284 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,284 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,284 [Rank 0]: total number of samples: 2333 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,284 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,368 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: creating memory view of numpy buffer... 
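[editor's note] Each split's three mapping files are looked up under a name that encodes the requested sample count, sequence length and seed (e.g. "..._indexmap_6ns_8192sl_1234s_doc_idx.npy" for the haskell split above). A small helper that rebuilds such a path from its parts, as an illustration of the naming pattern visible in the log rather than a function taken from Megatron:

def indexmap_filename(data_prefix, split_name, num_samples, seq_length, seed, kind):
    # kind is one of "doc", "sample", "shuffle"
    return (f"{data_prefix}_{split_name}_indexmap_"
            f"{num_samples}ns_{seq_length}sl_{seed}s_{kind}_idx.npy")

prefix = ("/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/"
          "gpt2-preprocessed_content_document")
print(indexmap_filename(prefix, "VALID_all_sources_weighted", 6, 8192, 1234, "doc"))
# -> the doc-idx path loaded for the haskell split above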
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: > finished creating indexed dataset in 0.000680 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: number of documents: 1152 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: document indices in [1116, 1151) total of 35 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: > Tokens per epoch: 30587 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,385 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,388 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,391 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,395 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,395 [Rank 0]: total number of samples: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,395 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,482 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: > finished creating indexed dataset in 0.000777 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: number of documents: 22653 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,484 [Rank 0]: document indices in [21951, 22630) total of 679 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,484 [Rank 0]: > Tokens per epoch: 16838913 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,485 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,485 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,487 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002180 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 679 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,490 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003326 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,490 [Rank 0]: > building shuffle index with split [0, 2055) and [2055, 2055) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,493 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002627 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,493 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,498 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,498 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,500 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,500 [Rank 0]: total number of samples: 2056 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,500 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,584 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,585 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: > finished creating indexed dataset in 0.001711 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: number of documents: 158356 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: document indices in [153447, 158198) total of 4751 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: > Tokens per epoch: 9867998 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,588 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,589 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,591 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002278 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4751 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1204 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,594 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002527 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,594 [Rank 0]: > building shuffle index with split [0, 1204) and [1204, 1204) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,596 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002608 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,599 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,603 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,603 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,604 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,604 [Rank 0]: total number of samples: 1205 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,604 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,688 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: > finished creating indexed dataset in 0.002049 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: number of documents: 657349 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: document indices in [636971, 656692) total of 19721 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: > Tokens per epoch: 14806733 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,694 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,694 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,697 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003486 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 19721 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1807 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,700 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002162 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,700 [Rank 0]: > building shuffle index with split [0, 1807) and [1807, 1807) ... 
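[editor's note] The "building shuffle index with split [0, N) and [N, N)" messages suggest the shuffle index is a permutation built in two chunks, with the second chunk empty whenever only one epoch is needed. A rough sketch of that construction, seeded with the 1234 that appears in the index-map filenames; it mirrors the shape of what the log describes but is not Megatron's exact code:

import numpy as np

def build_shuffle_idx(num_samples, total_size, rng):
    # first chunk: samples from complete epochs, shuffled on their own
    first = np.arange(num_samples, dtype=np.uint32)
    rng.shuffle(first)
    if num_samples == total_size:   # the "[N, N)" case above: no partial epoch left
        return first
    # second chunk: samples of a partial last epoch, shuffled separately
    last = np.arange(num_samples, total_size, dtype=np.uint32)
    rng.shuffle(last)
    return np.concatenate((first, last))

rng = np.random.RandomState(seed=1234)
print(build_shuffle_idx(1807, 1807, rng).shape)  # (1807,) -- split [0, 1807) and [1807, 1807)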
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,703 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002931 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,708 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,714 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,714 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,715 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,715 [Rank 0]: total number of samples: 1808 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,715 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,799 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: > finished creating indexed dataset in 0.002136 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: number of documents: 549459 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: document indices in [532426, 548910) total of 16484 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,802 [Rank 0]: > Tokens per epoch: 29891276 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,804 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,804 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,807 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002825 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16484 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3648 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,809 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002457 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,809 [Rank 0]: > building shuffle index with split [0, 3648) and [3648, 3648) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,812 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002820 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,817 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_8ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,823 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_8ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,823 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_8ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,823 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,824 [Rank 0]: total number of samples: 3649 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,824 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,907 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: > finished creating indexed dataset in 0.000729 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: number of documents: 1133 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,909 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,909 [Rank 0]: document indices in [1098, 1132) total of 34 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,909 [Rank 0]: > Tokens per epoch: 39416 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,925 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,929 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,930 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,933 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,933 [Rank 0]: total number of samples: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,933 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,017 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,017 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: reading pointers... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: > finished creating indexed dataset in 0.000767 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: number of documents: 6104 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: document indices in [5915, 6098) total of 183 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: > Tokens per epoch: 518557 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,020 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,020 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,023 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002899 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 183 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 63 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,026 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003113 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,026 [Rank 0]: > building shuffle index with split [0, 63) and [63, 63) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,028 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001994 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,028 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,033 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,033 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,035 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,035 [Rank 0]: total number of samples: 64 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,035 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,119 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,121 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: > finished creating indexed dataset in 0.002302 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: number of documents: 896880 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: document indices in [869077, 895983) total of 26906 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: > Tokens per epoch: 31882370 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,125 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,125 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,128 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003160 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 26906 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3891 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,131 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002524 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,131 [Rank 0]: > building shuffle index with split [0, 3891) and [3891, 3891) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,133 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002579 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,134 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,141 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,141 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,142 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,142 [Rank 0]: total number of samples: 3892 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,142 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: creating memory view of numpy buffer... 
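[editor's note] The per-split .npy mappings are reported as loaded in under 10 ms even when they index tens of thousands of samples, which is consistent with memory-mapped loading rather than an eager read. A self-contained toy of that step (the real files live under /fsx on the training cluster, so a throwaway array stands in for the restructuredtext sample-idx file; treating the extra row as a boundary entry is an assumption):

import os, tempfile
import numpy as np

tmpdir = tempfile.mkdtemp()
path = os.path.join(tmpdir, "toy_sample_idx.npy")
np.save(path, np.zeros((3892, 2), dtype=np.int64))  # num_samples + 1 rows, as reported above

sample_idx = np.load(path, mmap_mode="r")   # memory-map instead of reading eagerly
print(sample_idx.shape[0] - 1)              # -> 3891, the "total number of samples" built above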
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: > finished creating indexed dataset in 0.000706 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: number of documents: 3688 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,227 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,227 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,227 [Rank 0]: document indices in [3574, 3684) total of 110 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,227 [Rank 0]: > Tokens per epoch: 233387 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,229 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,229 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,231 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002137 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 110 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 28 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,233 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001910 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,233 [Rank 0]: > building shuffle index with split [0, 28) and [28, 28) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,236 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002616 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,239 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,243 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,247 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,250 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,250 [Rank 0]: total number of samples: 29 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,251 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,335 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,335 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,335 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: > finished creating indexed dataset in 0.000871 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: number of documents: 19630 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: document indices in [19021, 19610) total of 589 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: > Tokens per epoch: 2060914 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,337 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,337 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,339 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.001882 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 589 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 251 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,342 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002619 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,342 [Rank 0]: > building shuffle index with split [0, 251) and [251, 251) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,344 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002281 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,345 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,349 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,349 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,351 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,351 [Rank 0]: total number of samples: 252 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,352 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,436 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: > finished creating indexed dataset in 0.001044 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: number of documents: 46270 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: document indices in [44836, 46224) total of 1388 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,438 [Rank 0]: > Tokens per epoch: 4206961 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,438 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,438 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,441 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002783 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1388 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 513 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,443 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001830 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,443 [Rank 0]: > building shuffle index with split [0, 513) and [513, 513) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,446 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003026 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,447 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,455 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,455 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,455 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,455 [Rank 0]: total number of samples: 514 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,456 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,539 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,541 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,541 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,541 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: creating memory view of numpy buffer... 
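[editor's note] The repeated "reading sizes... reading pointers... creating numpy buffer of mmap..." steps describe opening an indexed binary dataset: a flat token file plus per-document sizes and byte offsets, accessed through a memory map. A deliberately simplified toy of that layout (the on-disk format Megatron actually uses also carries a header and document index, which are omitted here):

import os, tempfile
import numpy as np

tmpdir = tempfile.mkdtemp()
bin_path = os.path.join(tmpdir, "toy_content_document.bin")

docs = [np.array([5, 6, 7], dtype=np.uint16),
        np.array([8, 9], dtype=np.uint16)]
np.concatenate(docs).tofile(bin_path)                        # flat stream of token ids

sizes = np.array([len(d) for d in docs], dtype=np.int32)     # "reading sizes..."
pointers = np.concatenate(([0], np.cumsum(sizes[:-1]) * 2))  # "reading pointers..." (byte offsets, 2 bytes/token)
buffer = np.memmap(bin_path, dtype=np.uint16, mode="r")      # "creating numpy buffer of mmap..."

def get_document(i):
    start = pointers[i] // 2
    return buffer[start:start + sizes[i]]

print(get_document(1))   # -> [8 9]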
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: > finished creating indexed dataset in 0.002116 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: number of documents: 522778 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: document indices in [506572, 522255) total of 15683 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: > Tokens per epoch: 56256264 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,544 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,544 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,548 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003553 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15683 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6867 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,551 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003154 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,551 [Rank 0]: > building shuffle index with split [0, 6867) and [6867, 6867) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,553 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001761 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,553 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_14ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,560 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_14ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,561 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_14ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,561 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,561 [Rank 0]: total number of samples: 6868 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,561 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,645 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: > finished creating indexed dataset in 0.000779 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: number of documents: 10289 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: document indices in [9970, 10279) total of 309 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,647 [Rank 0]: > Tokens per epoch: 224077 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,657 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,662 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,663 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,666 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,666 [Rank 0]: total number of samples: 28 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,666 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,750 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: > finished creating indexed dataset in 0.002376 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: number of documents: 247919 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: document indices in [240234, 247671) total of 7437 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: > Tokens per epoch: 23244839 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,754 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,754 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,757 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002579 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7437 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2837 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,760 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002428 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,760 [Rank 0]: > building shuffle index with split [0, 2837) and [2837, 2837) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,762 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002113 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,762 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,767 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,768 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,769 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,769 [Rank 0]: total number of samples: 2838 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,770 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: > finished creating indexed dataset in 0.000720 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: number of documents: 5368 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: document indices in [5202, 5363) total of 161 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: > Tokens per epoch: 60505 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,866 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,870 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,874 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,877 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,878 [Rank 0]: total number of samples: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,878 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,962 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,962 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,962 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,962 [Rank 0]: reading document index... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: > finished creating indexed dataset in 0.000803 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: number of documents: 17554 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: document indices in [17010, 17536) total of 526 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: > Tokens per epoch: 791611 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,964 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,964 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,967 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002345 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 526 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 96 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,970 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003401 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,970 [Rank 0]: > building shuffle index with split [0, 96) and [96, 96) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,972 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001927 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,972 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,977 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,977 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,979 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,980 [Rank 0]: total number of samples: 97 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,980 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,064 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: > finished creating indexed dataset in 0.001090 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: number of documents: 52838 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,066 [Rank 0]: document indices in [51200, 52785) total of 1585 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,066 [Rank 0]: > Tokens per epoch: 3599819 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,067 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,067 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,069 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002339 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1585 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 439 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,071 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001771 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,071 [Rank 0]: > building shuffle index with split [0, 439) and [439, 439) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,073 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001737 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,075 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,082 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,083 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,083 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,083 [Rank 0]: total number of samples: 440 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,083 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,167 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,169 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,169 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,169 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: creating memory view of numpy buffer... 
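[editor's note] The doc-idx mapping that is built and saved for each split appears to be the split's document ids repeated for the required number of epochs (always one epoch here) and then shuffled. A sketch under that assumption, again seeded with the 1234 from the filenames and using the emacs-lisp VALID range above:

import numpy as np

def build_doc_idx(documents, num_epochs, rng):
    # tile the split's document ids across epochs, then shuffle the whole order
    doc_idx = np.tile(np.asarray(documents, dtype=np.int32), num_epochs)
    rng.shuffle(doc_idx)
    return doc_idx

rng = np.random.RandomState(seed=1234)
docs = np.arange(51_200, 52_785)          # emacs-lisp VALID range above: [51200, 52785)
print(build_doc_idx(docs, 1, rng).shape)  # -> (1585,) documents in shuffled order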
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: > finished creating indexed dataset in 0.002252 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: number of documents: 928415 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: document indices in [899634, 927487) total of 27853 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: > Tokens per epoch: 27319085 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,172 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,172 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,176 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003651 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 27853 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3334 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,178 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002606 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,178 [Rank 0]: > building shuffle index with split [0, 3334) and [3334, 3334) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,182 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003556 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,182 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_10ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,190 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_10ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,191 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_10ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,191 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,191 [Rank 0]: total number of samples: 3335 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,191 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,275 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,276 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,276 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,276 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,276 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: > finished creating indexed dataset in 0.001104 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: number of documents: 58151 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: document indices in [56348, 58093) total of 1745 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: > Tokens per epoch: 5481832 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,278 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,278 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,281 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002538 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1745 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 669 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,283 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002462 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,283 [Rank 0]: > building shuffle index with split [0, 669) and [669, 669) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,286 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002933 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,289 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,297 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,305 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,305 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,305 [Rank 0]: total number of samples: 670 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,305 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: > finished creating indexed dataset in 0.000700 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: number of documents: 5928 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: document indices in [5744, 5922) total of 178 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: > Tokens per epoch: 389178 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,394 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,394 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,396 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002733 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 178 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 47 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,399 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002642 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,399 [Rank 0]: > building shuffle index with split [0, 47) and [47, 47) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,402 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002614 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,402 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,407 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,407 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,408 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,408 [Rank 0]: total number of samples: 48 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,408 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,492 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,492 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,492 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,492 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: > finished creating indexed dataset in 0.000684 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: number of documents: 180 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: document indices in [174, 180) total of 6 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: > Tokens per epoch: 7815 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,502 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,507 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,510 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,511 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,511 [Rank 0]: total number of samples: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,511 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,595 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: > finished creating indexed dataset in 0.002417 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: number of documents: 239568 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,598 [Rank 0]: document indices in [232141, 239328) total of 7187 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,598 [Rank 0]: > Tokens per epoch: 3729565 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,600 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,600 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,604 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003165 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7187 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 455 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,606 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002676 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,606 [Rank 0]: > building shuffle index with split [0, 455) and [455, 455) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,610 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003518 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,610 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,618 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,619 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,619 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,619 [Rank 0]: total number of samples: 456 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,619 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,703 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: > finished creating indexed dataset in 0.000751 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: number of documents: 4806 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: document indices in [4657, 4801) total of 144 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,705 [Rank 0]: > Tokens per epoch: 118601 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,713 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,718 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,721 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,726 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,726 [Rank 0]: total number of samples: 15 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,726 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,811 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: reading document index... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: > finished creating indexed dataset in 0.000733 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: number of documents: 5429 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: document indices in [5261, 5424) total of 163 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,813 [Rank 0]: > Tokens per epoch: 146349 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,816 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,820 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,824 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,826 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,826 [Rank 0]: total number of samples: 18 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,826 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,912 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: > finished creating indexed dataset in 0.001934 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: number of documents: 1355788 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: document indices in [1313759, 1354432) total of 40673 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,915 [Rank 0]: > Tokens per epoch: 38836780 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,917 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,917 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,921 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003928 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 40673 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4740 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,924 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002658 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,924 [Rank 0]: > building shuffle index with split [0, 4740) and [4740, 4740) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,927 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002755 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,927 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_13ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,933 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_13ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,934 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_13ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,936 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,936 [Rank 0]: total number of samples: 4741 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,936 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,022 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,023 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,023 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,023 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,023 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: > finished creating indexed dataset in 0.000984 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: number of documents: 49335 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: document indices in [47806, 49286) total of 1480 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: > Tokens per epoch: 3611088 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,026 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,026 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,028 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002204 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1480 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 440 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,031 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002141 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,031 [Rank 0]: > building shuffle index with split [0, 440) and [440, 440) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,033 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002876 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,079 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,083 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,085 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,087 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,087 [Rank 0]: total number of samples: 441 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,087 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,171 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: > finished creating indexed dataset in 0.000877 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: number of documents: 24208 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: document indices in [23458, 24184) total of 726 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: > Tokens per epoch: 5577566 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,174 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,174 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,176 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002153 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 726 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 680 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,179 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002290 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,179 [Rank 0]: > building shuffle index with split [0, 680) and [680, 680) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,181 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002041 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,209 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,213 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,213 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,215 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,215 [Rank 0]: total number of samples: 681 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,215 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,299 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: > finished creating indexed dataset in 0.000756 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: number of documents: 4737 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: document indices in [4590, 4732) total of 142 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,301 [Rank 0]: > Tokens per epoch: 63420 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,311 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,315 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,319 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,321 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,321 [Rank 0]: total number of samples: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,321 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,405 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,407 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: > finished creating indexed dataset in 0.002352 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: number of documents: 2206327 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: document indices in [2137931, 2204121) total of 66190 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,409 [Rank 0]: > Tokens per epoch: 31891052 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,410 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,411 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,415 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004590 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 66190 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3892 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,419 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003354 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,419 [Rank 0]: > building shuffle index with split [0, 3892) and [3892, 3892) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,421 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002542 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,422 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,428 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,429 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,429 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,429 [Rank 0]: total number of samples: 3893 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,429 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,514 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: > finished creating indexed dataset in 0.001544 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: number of documents: 125163 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,516 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,516 [Rank 0]: document indices in [121283, 125038) total of 3755 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,516 [Rank 0]: > Tokens per epoch: 3837021 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,517 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,517 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,519 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002499 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3755 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 468 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,523 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003726 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,523 [Rank 0]: > building shuffle index with split [0, 468) and [468, 468) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,526 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003039 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,531 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,537 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,538 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,538 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,538 [Rank 0]: total number of samples: 469 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,538 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,623 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,623 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,623 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,623 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: > finished creating indexed dataset in 0.000978 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: number of documents: 41890 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: document indices in [40591, 41848) total of 1257 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: > Tokens per epoch: 2017219 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,626 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,626 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,628 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002203 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1257 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 246 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,631 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002365 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,631 [Rank 0]: > building shuffle index with split [0, 246) and [246, 246) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,633 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002468 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,676 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,684 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,685 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,685 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,685 [Rank 0]: total number of samples: 247 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,685 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,770 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: > finished creating indexed dataset in 0.000719 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: number of documents: 7917 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,772 [Rank 0]: document indices in [7672, 7909) total of 237 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,772 [Rank 0]: > Tokens per epoch: 1102148 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,772 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,773 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,775 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002410 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 237 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 134 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,778 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002763 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,778 [Rank 0]: > building shuffle index with split [0, 134) and [134, 134) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,780 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001769 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,780 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,785 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,785 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,787 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,787 [Rank 0]: total number of samples: 135 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,787 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,872 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,872 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: > finished creating indexed dataset in 0.000787 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: number of documents: 13716 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: document indices in [13291, 13702) total of 411 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: > Tokens per epoch: 465467 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,875 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,875 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,878 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002776 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 411 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 56 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,880 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002538 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,880 [Rank 0]: > building shuffle index with split [0, 56) and [56, 56) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,882 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002017 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,923 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,927 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,928 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,930 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,930 [Rank 0]: total number of samples: 57 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,930 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,015 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: > finished creating indexed dataset in 0.002290 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: number of documents: 975420 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: document indices in [945182, 974445) total of 29263 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,018 [Rank 0]: > Tokens per epoch: 164859090 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,018 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,019 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,022 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003657 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 29263 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 20124 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,025 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002956 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,025 [Rank 0]: > building shuffle index with split [0, 20124) and [20124, 20124) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,028 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002082 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,034 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_30ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,043 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_30ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,043 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_30ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,044 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,044 [Rank 0]: total number of samples: 20125 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,044 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,127 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: > finished creating indexed dataset in 0.001750 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: number of documents: 167701 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: document indices in [162502, 167533) total of 5031 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,130 [Rank 0]: > Tokens per epoch: 5272081 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,130 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,131 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,134 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003043 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5031 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 643 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,136 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002185 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,136 [Rank 0]: > building shuffle index with split [0, 643) and [643, 643) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,138 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002272 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,142 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,149 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,149 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,150 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,150 [Rank 0]: total number of samples: 644 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,150 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,234 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: > finished creating indexed dataset in 0.001125 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: number of documents: 62033 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,236 [Rank 0]: document indices in [60110, 61971) total of 1861 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,236 [Rank 0]: > Tokens per epoch: 2205938 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,238 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,238 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,241 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003110 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1861 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 269 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,243 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001885 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,243 [Rank 0]: > building shuffle index with split [0, 269) and [269, 269) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,246 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003120 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,247 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,252 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,252 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,254 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,254 [Rank 0]: total number of samples: 270 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,254 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,339 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: > finished creating indexed dataset in 0.002170 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: number of documents: 571506 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: document indices in [553789, 570934) total of 17145 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,342 [Rank 0]: > Tokens per epoch: 4375164 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,342 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,343 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,346 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003622 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17145 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 534 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,350 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003655 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,350 [Rank 0]: > building shuffle index with split [0, 534) and [534, 534) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,353 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003345 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,354 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,360 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,360 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,361 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,361 [Rank 0]: total number of samples: 535 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,361 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,445 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,447 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,447 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,447 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,447 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: > finished creating indexed dataset in 0.002257 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: number of documents: 6353527 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: document indices in [6156568, 6347173) total of 190605 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,449 [Rank 0]: > Tokens per epoch: 476705041 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,450 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,450 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,458 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.008151 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 190605 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 58191 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,462 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003879 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,463 [Rank 0]: > building shuffle index with split [0, 58191) and [58191, 58191) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,466 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003294 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,466 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_132ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,477 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_132ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,477 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_132ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,478 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,478 [Rank 0]: total number of samples: 58192 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,478 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,562 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,563 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: > finished creating indexed dataset in 0.001764 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: number of documents: 226209 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: document indices in [219197, 225983) total of 6786 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: > Tokens per epoch: 5560129 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,566 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,566 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,570 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003138 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6786 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 678 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,573 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002916 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,573 [Rank 0]: > building shuffle index with split [0, 678) and [678, 678) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,574 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001654 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,581 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,589 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,595 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,595 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,595 [Rank 0]: total number of samples: 679 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,595 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,679 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,680 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,680 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,680 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,680 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: > finished creating indexed dataset in 0.001380 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: number of documents: 98733 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: document indices in [95672, 98634) total of 2962 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: > Tokens per epoch: 16829467 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,682 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,682 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,685 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002942 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2962 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,688 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003067 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,689 [Rank 0]: > building shuffle index with split [0, 2054) and [2054, 2054) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,691 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002519 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,691 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,699 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,700 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,700 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,700 [Rank 0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,700 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,784 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,786 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,786 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,786 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,786 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: > finished creating indexed dataset in 0.002117 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: number of documents: 281016 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: document indices in [272305, 280735) total of 8430 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: > Tokens per epoch: 7046176 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,788 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,789 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,792 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003131 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8430 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 860 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,795 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003006 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,795 [Rank 0]: > building shuffle index with split [0, 860) and [860, 860) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,797 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001766 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,799 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,804 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,804 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,804 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,805 [Rank 0]: total number of samples: 861 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,805 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,889 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: > finished creating indexed dataset in 0.002249 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: number of documents: 250834 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: document indices in [243058, 250583) total of 7525 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,892 [Rank 0]: > Tokens per epoch: 7066083 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,893 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,893 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,896 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002884 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7525 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 862 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,898 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002220 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,899 [Rank 0]: > building shuffle index with split [0, 862) and [862, 862) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,901 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002579 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,904 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,908 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,910 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,912 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,912 [Rank 0]: total number of samples: 863 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,912 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,997 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: > finished creating indexed dataset in 0.002316 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: number of documents: 3299965 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,000 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,000 [Rank 0]: document indices in [3197666, 3296665) total of 98999 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,000 [Rank 0]: > Tokens per epoch: 293479485 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,002 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,002 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,007 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005542 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 98999 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 35825 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,010 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002775 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,010 [Rank 0]: > building shuffle index with split [0, 35825) and [35825, 35825) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,013 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003100 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,056 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_79ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,065 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_79ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,066 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_79ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,066 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,066 [Rank 0]: total number of samples: 35826 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,066 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,151 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,152 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: > finished creating indexed dataset in 0.002153 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: number of documents: 20071773 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: document indices in [19449548, 20051701) total of 602153 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,157 [Rank 0]: > Tokens per epoch: 679829501 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,158 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,158 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,182 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.023708 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 602153 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 82986 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,187 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005039 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,187 [Rank 0]: > building shuffle index with split [0, 82986) and [82986, 82986) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,192 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004693 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,217 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_234ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,228 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_234ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,231 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_234ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,231 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,231 [Rank 0]: total number of samples: 82987 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,231 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,315 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,317 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: > finished creating indexed dataset in 0.002495 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: number of documents: 19544285 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: document indices in [18938412, 19524741) total of 586329 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,322 [Rank 0]: > Tokens per epoch: 565628573 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,324 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,324 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,346 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.021432 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 586329 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 69046 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,352 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006332 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,352 [Rank 0]: > building shuffle index with split [0, 69046) and [69046, 69046) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,356 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003602 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,356 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_174ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,367 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_174ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,368 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_174ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,368 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,368 [Rank 0]: total number of samples: 69047 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,368 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,452 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,454 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,454 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,454 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,454 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: > finished creating indexed dataset in 0.002519 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: number of documents: 21029287 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: document indices in [20377379, 21008258) total of 630879 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,459 [Rank 0]: > Tokens per epoch: 765105610 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,460 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,460 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,484 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.023816 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 630879 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 93396 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,489 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005210 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,489 [Rank 0]: > building shuffle index with split [0, 93396) and [93396, 93396) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,494 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004117 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,524 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_202ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,536 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_202ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,537 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_202ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,538 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,538 [Rank 0]: total number of samples: 93397 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,538 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,622 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,624 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,624 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,624 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,624 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: > finished creating indexed dataset in 0.002314 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: number of documents: 15683017 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: document indices in [15196843, 15667334) total of 470491 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,629 [Rank 0]: > Tokens per epoch: 512566580 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,630 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,631 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,648 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.017646 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 470491 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 62569 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,652 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004050 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,653 [Rank 0]: > building shuffle index with split [0, 62569) and [62569, 62569) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,657 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004143 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,657 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_164ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,670 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_164ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,672 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_164ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,673 [Rank 0]: loaded indexed file in 0.015 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,673 [Rank 0]: total number of samples: 62570 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,673 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,757 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: > finished creating indexed dataset in 0.002110 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: number of documents: 12866649 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: document indices in [12467783, 12853782) total of 385999 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,762 [Rank 0]: > Tokens per epoch: 529606827 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,764 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,764 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,780 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.015376 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 385999 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 64649 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,785 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005202 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,785 [Rank 0]: > building shuffle index with split [0, 64649) and [64649, 64649) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,789 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003803 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,790 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_163ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,801 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_163ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,808 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_163ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,808 [Rank 0]: loaded indexed file in 0.019 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,808 [Rank 0]: total number of samples: 64650 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,809 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,892 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,893 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,893 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,893 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,893 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: > finished creating indexed dataset in 0.001840 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: number of documents: 10547331 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: document indices in [10220364, 10536784) total of 316420 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,896 [Rank 0]: > Tokens per epoch: 222078157 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,898 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,898 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,910 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.012172 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 316420 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 27109 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,914 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004088 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,914 [Rank 0]: > building shuffle index with split [0, 27109) and [27109, 27109) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,917 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002453 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,917 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_72ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,926 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_72ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,926 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_72ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,927 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,927 [Rank 0]: total number of samples: 27110 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,927 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,011 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: > finished creating indexed dataset in 0.000731 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: number of documents: 75 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: document indices in [73, 75) total of 2 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,013 [Rank 0]: > Tokens per epoch: 5184 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,026 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,030 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,031 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,035 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,035 [Rank 0]: total number of samples: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,035 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,120 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,121 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: reading document index... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: > finished creating indexed dataset in 0.001868 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: number of documents: 161239 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: document indices in [156241, 161078) total of 4837 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: > Tokens per epoch: 11401469 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,124 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,124 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,127 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002941 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4837 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1391 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,129 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001961 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,129 [Rank 0]: > building shuffle index with split [0, 1391) and [1391, 1391) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,132 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002429 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,132 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,140 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,143 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,144 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,144 [Rank 0]: total number of samples: 1392 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,144 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,229 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,229 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: > finished creating indexed dataset in 0.001112 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: number of documents: 58208 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: document indices in [56404, 58150) total of 1746 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: > Tokens per epoch: 12008501 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,231 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,231 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,234 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002831 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1746 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1465 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,240 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005459 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,240 [Rank 0]: > building shuffle index with split [0, 1465) and [1465, 1465) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,243 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003084 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,244 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,252 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,257 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,257 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,258 [Rank 0]: total number of samples: 1466 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,258 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,342 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,342 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: > finished creating indexed dataset in 0.000728 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: number of documents: 4661 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: document indices in [4517, 4656) total of 139 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: > Tokens per epoch: 98302 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,357 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,361 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,363 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,366 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,367 [Rank 0]: total number of samples: 12 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,367 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,451 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: > finished creating indexed dataset in 0.000742 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: number of documents: 93 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: document indices in [90, 93) total of 3 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: > Tokens per epoch: 4277 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,456 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,459 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,460 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,463 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,463 [Rank 0]: total number of samples: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,463 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,547 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: > finished creating indexed dataset in 0.000765 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: number of documents: 7451 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,549 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,549 [Rank 0]: document indices in [7220, 7444) total of 224 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,549 [Rank 0]: > Tokens per epoch: 1128407 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,550 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,550 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,553 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002814 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 224 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 137 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,555 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002108 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,555 [Rank 0]: > building shuffle index with split [0, 137) and [137, 137) ... 
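The two-epoch entries in this section (the verilog and matlab validation splits, 5,184 and 4,277 tokens per epoch respectively) are the splits too small to fill even a single 8192-token sample, so their data has to be traversed twice to satisfy the one sample requested in their index-map filenames (the "_1ns_" component). A hedged sketch of that loop, illustrative rather than copied from Megatron:

    # Sketch only: why tiny splits in this log report "total number of epochs: 2".
    def epochs_needed(tokens_per_epoch: int, num_samples: int,
                      seq_length: int = 8192) -> int:
        epochs, total_tokens = 0, 0
        while True:
            epochs += 1
            total_tokens += tokens_per_epoch
            # stop once the accumulated tokens cover num_samples full sequences
            if (total_tokens - 1) // seq_length >= num_samples:
                return epochs

    # matlab split above: 4,277 tokens per epoch, 1 sample requested -> 2 epochs
    assert epochs_needed(4_277, 1) == 2
    # verilog split earlier in the log: 5,184 tokens per epoch -> likewise 2 epochs
    assert epochs_needed(5_184, 1) == 2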
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,557 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001781 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,557 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,561 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,565 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,568 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,568 [Rank 0]: total number of samples: 138 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,568 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: > finished creating indexed dataset in 0.000821 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: number of documents: 15850 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: document indices in [15359, 15834) total of 475 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: > Tokens per epoch: 2144189 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,655 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,655 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,658 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003053 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 475 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 261 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,661 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002448 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,661 [Rank 0]: > building shuffle index with split [0, 261) and [261, 261) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,663 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002231 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,663 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,668 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,672 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,675 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,675 [Rank 0]: total number of samples: 262 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,675 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,760 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,760 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,760 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: > finished creating indexed dataset in 0.001013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: number of documents: 42103 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: document indices in [40798, 42061) total of 1263 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: > Tokens per epoch: 4166294 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,762 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,762 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,764 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002029 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1263 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 508 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,766 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002079 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,767 [Rank 0]: > building shuffle index with split [0, 508) and [508, 508) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,768 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001777 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,769 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,776 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,776 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,777 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,777 [Rank 0]: total number of samples: 509 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,777 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,861 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,863 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,863 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,863 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,863 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: > finished creating indexed dataset in 0.002177 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: number of documents: 4751547 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: document indices in [4604249, 4746795) total of 142546 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,865 [Rank 0]: > Tokens per epoch: 62884447 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,866 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,866 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,872 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006435 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 142546 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7676 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,875 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002373 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,875 [Rank 0]: > building shuffle index with split [0, 7676) and [7676, 7676) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,877 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001934 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,877 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,885 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,886 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,886 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,886 [Rank 0]: total number of samples: 7677 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,886 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,971 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: > finished creating indexed dataset in 0.002288 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: number of documents: 3995948 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,974 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,974 [Rank 0]: document indices in [3872074, 3991952) total of 119878 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,975 [Rank 0]: > Tokens per epoch: 35974762 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,976 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,976 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,982 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005884 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 119878 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4391 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,985 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002445 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,985 [Rank 0]: > building shuffle index with split [0, 4391) and [4391, 4391) ... 
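In every "> dataset split:" block the VALID_all_sources_weighted range is a contiguous slice near the end of the document index; for the 3,995,948-document dataset above, [3872074, 3991952) covers exactly 119,878 documents, i.e. roughly the 96.9%-99.9% portion of the corpus. That is consistent with a train/valid/test split on the order of 969/30/1, although the actual --split argument is not visible in this excerpt. A small sketch of the bookkeeping, with the split fractions treated as an assumption:

# Sketch of how a VALID document range like [3872074, 3991952) can arise.
# The 0.969/0.030/0.001 fractions are an assumption for illustration; the
# real --split setting is not shown in this log excerpt.
num_documents = 3_995_948                # "number of documents: 3995948"
splits = [0.969, 0.030, 0.001]           # assumed train/valid/test fractions

bounds, acc = [0], 0.0
for frac in splits:
    acc += frac
    bounds.append(round(acc * num_documents))

valid_range = (bounds[1], bounds[2])
print(valid_range, valid_range[1] - valid_range[0])
# -> (3872074, 3991952) 119878, matching "document indices in [3872074, 3991952)"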
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,987 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002585 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,988 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,997 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,998 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,998 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,998 [Rank 0]: total number of samples: 4392 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,998 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,081 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,083 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,083 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,083 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,083 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: > finished creating indexed dataset in 0.002353 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: number of documents: 30982955 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: document indices in [30022483, 30951972) total of 929489 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:19,090 [Rank 0]: > Tokens per epoch: 538755961 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,091 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,091 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:19,125 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.033432 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 929489 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 65766 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,131 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006360 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,131 [Rank 0]: > building shuffle index with split [0, 65766) and [65766, 65766) ... 
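The yaml split loaded at the top of this block was requested with only 3 samples (the "3ns" in its index-map filenames), while a single epoch of that split already yields 4391 sequences (its "Tokens per epoch: 35974762" build lines appear just before it), so one pass suffices and separate_last_epoch stays False. A hedged sketch of that decision; the real code applies an extra last-epoch heuristic when more than one epoch would be needed.

# Sketch of the "only one epoch required" decision for the yaml validation
# split (illustrative; Megatron-LM's logic is more involved when several
# epochs are needed).
import math

requested_samples = 3          # the "3ns" in the yaml index-map filenames
tokens_per_epoch = 35_974_762  # "> Tokens per epoch: 35974762" for that split
seq_length = 8_192

samples_per_epoch = tokens_per_epoch // seq_length                     # 4391
num_epochs = max(1, math.ceil(requested_samples / samples_per_epoch))
separate_last_epoch = num_epochs > 1                                    # simplified rule
print(num_epochs, separate_last_epoch)                                  # 1 False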
+[ip-26-0-150-122:0]:2023-06-21 17:28:19,135 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003373 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,159 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_146ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,174 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_146ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,175 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_146ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,175 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,176 [Rank 0]: total number of samples: 65767 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,176 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,258 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: > finished creating indexed dataset in 0.002677 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: number of documents: 7634718 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: document indices in [7398042, 7627083) total of 229041 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:19,263 [Rank 0]: > Tokens per epoch: 483498380 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,265 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,265 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:19,276 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.010390 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 229041 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 59020 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,279 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003644 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,279 [Rank 0]: > building shuffle index with split [0, 59020) and [59020, 59020) ... 
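Each "> building dataset index ..." block above walks the same steps: read the per-document sizes and byte pointers, then memory-map the token file and take a memory view over it so individual documents can be sliced without loading the whole corpus into RAM. The snippet below imitates that access pattern on a tiny fake file; the on-disk layout, dtype, and filename are assumptions for illustration, not the real Megatron-LM indexed-dataset format.

# Toy imitation of the "reading sizes / reading pointers / creating numpy
# buffer of mmap / creating memory view" steps in the log. The file layout
# here (bare uint16 token ids, no .idx header) is an assumption.
import numpy as np

tokens = np.array([10, 11, 12, 13, 14, 20, 21, 22], dtype=np.uint16)
tokens.tofile("toy_content_document.bin")                    # hypothetical file

sizes = np.array([5, 3], dtype=np.int32)                     # tokens per document
pointers = np.concatenate(([0], np.cumsum(sizes[:-1]))) * 2  # byte offsets, 2 bytes/token

buffer = np.memmap("toy_content_document.bin", dtype=np.uint16, mode="r")
view = memoryview(buffer)                                    # "creating memory view of numpy buffer"
doc1 = np.frombuffer(view, dtype=np.uint16, count=int(sizes[1]), offset=int(pointers[1]))
print(doc1)                                                  # [20 21 22]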
+[ip-26-0-150-122:0]:2023-06-21 17:28:19,283 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003726 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,284 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_86ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,296 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_86ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,297 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_86ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,297 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,298 [Rank 0]: total number of samples: 59021 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,298 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,380 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,382 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,382 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,382 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,382 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: > finished creating indexed dataset in 0.002234 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: number of documents: 914510 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: document indices in [886160, 913595) total of 27435 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: > Tokens per epoch: 73709652 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,384 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,384 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:19,388 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003489 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 27435 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 8997 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,391 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002764 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,391 [Rank 0]: > building shuffle index with split [0, 8997) and [8997, 8997) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:19,393 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002206 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,399 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_20ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,405 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_20ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,405 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_20ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,406 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,406 [Rank 0]: total number of samples: 8998 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,406 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,488 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: > finished creating indexed dataset in 0.002633 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: number of documents: 668743 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,492 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,492 [Rank 0]: document indices in [648012, 668074) total of 20062 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:19,492 [Rank 0]: > Tokens per epoch: 56156688 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,493 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,493 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:19,495 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002499 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 20062 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6855 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,499 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003009 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,499 [Rank 0]: > building shuffle index with split [0, 6855) and [6855, 6855) ... 
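All of the .npy paths loaded in these blocks follow one naming scheme that is visible in the log itself: <data prefix>_<split name>_indexmap_<requested samples>ns_<sequence length>sl_<seed>s_<doc|sample|shuffle>_idx.npy. The helper below just reassembles that pattern; it is inferred from the printed paths, and its name and signature are made up here rather than taken from Megatron-LM.

# Rebuild the index-map cache filenames seen above (a sketch inferred from
# the printed paths; the helper is hypothetical).
def index_map_paths(data_prefix, split_name, num_samples, seq_length, seed):
    stem = (f"{data_prefix}_{split_name}_indexmap"
            f"_{num_samples}ns_{seq_length}sl_{seed}s")
    return {kind: f"{stem}_{kind}_idx.npy" for kind in ("doc", "sample", "shuffle")}

paths = index_map_paths(
    "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/"
    "gpt2-preprocessed_content_document",
    "VALID_all_sources_weighted",
    num_samples=20, seq_length=8192, seed=1234,
)
print(paths["doc"])   # matches the doc-idx path loaded at the start of this block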
+[ip-26-0-150-122:0]:2023-06-21 17:28:19,501 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002398 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,508 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_17ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,513 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_17ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,513 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_17ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,514 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,514 [Rank 0]: total number of samples: 6856 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,514 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:> building indices for blendable datasets ... +[ip-26-0-150-122:0]: > sample ratios: +[ip-26-0-150-122:0]: dataset 0, input: 0.00391159, achieved: 0.00391165 +[ip-26-0-150-122:0]: dataset 1, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 2, input: 0.0702651, achieved: 0.0702651 +[ip-26-0-150-122:0]: dataset 3, input: 0.00232087, achieved: 0.00232023 +[ip-26-0-150-122:0]: dataset 4, input: 0.00110828, achieved: 0.00110845 +[ip-26-0-150-122:0]: dataset 5, input: 0.00740594, achieved: 0.0074056 +[ip-26-0-150-122:0]: dataset 6, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 7, input: 0.00170806, achieved: 0.00170781 +[ip-26-0-150-122:0]: dataset 8, input: 0.00127778, achieved: 0.00127814 +[ip-26-0-150-122:0]: dataset 9, input: 0.000104309, achieved: 0.000104427 +[ip-26-0-150-122:0]: dataset 10, input: 3.91159e-05, achieved: 3.91601e-05 +[ip-26-0-150-122:0]: dataset 11, input: 0.000117348, achieved: 0.00011748 +[ip-26-0-150-122:0]: dataset 12, input: 0.00146033, achieved: 0.0014598 +[ip-26-0-150-122:0]: dataset 13, input: 0.0310058, achieved: 0.0310061 +[ip-26-0-150-122:0]: dataset 14, input: 0.000912704, achieved: 0.000912647 +[ip-26-0-150-122:0]: dataset 15, input: 0.000795356, achieved: 0.000795167 +[ip-26-0-150-122:0]: dataset 16, input: 0.000339004, achieved: 0.000339387 +[ip-26-0-150-122:0]: dataset 17, input: 0.00219049, achieved: 0.00219079 +[ip-26-0-150-122:0]: dataset 18, input: 0.00290761, achieved: 0.00290763 +[ip-26-0-150-122:0]: dataset 19, input: 0.000391159, achieved: 0.000391601 +[ip-26-0-150-122:0]: dataset 20, input: 0.000404197, achieved: 0.000404654 +[ip-26-0-150-122:0]: dataset 21, input: 0.000586738, achieved: 0.000586313 +[ip-26-0-150-122:0]: dataset 22, input: 0.000156463, achieved: 0.00015664 +[ip-26-0-150-122:0]: dataset 23, input: 0.0088793, achieved: 0.00887954 +[ip-26-0-150-122:0]: dataset 24, input: 0.0118782, achieved: 0.0118786 +[ip-26-0-150-122:0]: dataset 25, input: 7.82317e-05, achieved: 7.83201e-05 +[ip-26-0-150-122:0]: dataset 26, input: 0.0582305, achieved: 0.0582299 +[ip-26-0-150-122:0]: dataset 27, input: 0.00075624, achieved: 0.000756007 +[ip-26-0-150-122:0]: dataset 28, input: 0.00290761, achieved: 0.00290763 +[ip-26-0-150-122:0]: dataset 29, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 30, input: 
0.00162983, achieved: 0.00162949 +[ip-26-0-150-122:0]: dataset 31, input: 0.00134298, achieved: 0.00134341 +[ip-26-0-150-122:0]: dataset 32, input: 0.00170806, achieved: 0.00170781 +[ip-26-0-150-122:0]: dataset 33, input: 0.00374208, achieved: 0.00374196 +[ip-26-0-150-122:0]: dataset 34, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 35, input: 6.51931e-05, achieved: 6.52668e-05 +[ip-26-0-150-122:0]: dataset 36, input: 0.00432882, achieved: 0.00432827 +[ip-26-0-150-122:0]: dataset 37, input: 3.91159e-05, achieved: 3.91601e-05 +[ip-26-0-150-122:0]: dataset 38, input: 0.000247734, achieved: 0.000248014 +[ip-26-0-150-122:0]: dataset 39, input: 0.000508506, achieved: 0.000507993 +[ip-26-0-150-122:0]: dataset 40, input: 0.00678008, achieved: 0.00678013 +[ip-26-0-150-122:0]: dataset 41, input: 2.60772e-05, achieved: 2.61067e-05 +[ip-26-0-150-122:0]: dataset 42, input: 0.00203403, achieved: 0.00203415 +[ip-26-0-150-122:0]: dataset 43, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 44, input: 9.12704e-05, achieved: 9.13735e-05 +[ip-26-0-150-122:0]: dataset 45, input: 0.000534584, achieved: 0.0005341 +[ip-26-0-150-122:0]: dataset 46, input: 0.00477214, achieved: 0.00477209 +[ip-26-0-150-122:0]: dataset 47, input: 0.000730163, achieved: 0.0007299 +[ip-26-0-150-122:0]: dataset 48, input: 3.91159e-05, achieved: 3.91601e-05 +[ip-26-0-150-122:0]: dataset 49, input: 1.30386e-06, achieved: 2.17556e-06 +[ip-26-0-150-122:0]: dataset 50, input: 0.000299888, achieved: 0.000300227 +[ip-26-0-150-122:0]: dataset 51, input: 2.60772e-05, achieved: 2.61067e-05 +[ip-26-0-150-122:0]: dataset 52, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 53, input: 0.00611511, achieved: 0.0061155 +[ip-26-0-150-122:0]: dataset 54, input: 0.000456352, achieved: 0.00045578 +[ip-26-0-150-122:0]: dataset 55, input: 0.000430275, achieved: 0.000430761 +[ip-26-0-150-122:0]: dataset 56, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 57, input: 0.00402893, achieved: 0.00402914 +[ip-26-0-150-122:0]: dataset 58, input: 0.000599777, achieved: 0.000599366 +[ip-26-0-150-122:0]: dataset 59, input: 0.000260772, achieved: 0.000261067 +[ip-26-0-150-122:0]: dataset 60, input: 6.51931e-05, achieved: 6.52668e-05 +[ip-26-0-150-122:0]: dataset 61, input: 5.21545e-05, achieved: 5.22134e-05 +[ip-26-0-150-122:0]: dataset 62, input: 0.0144598, achieved: 0.0144599 +[ip-26-0-150-122:0]: dataset 63, input: 0.000521545, achieved: 0.000521046 +[ip-26-0-150-122:0]: dataset 64, input: 0.000391159, achieved: 0.000391601 +[ip-26-0-150-122:0]: dataset 65, input: 0.000547622, achieved: 0.000547153 +[ip-26-0-150-122:0]: dataset 66, input: 0.0637849, achieved: 0.0637852 +[ip-26-0-150-122:0]: dataset 67, input: 0.000834472, achieved: 0.000834327 +[ip-26-0-150-122:0]: dataset 68, input: 0.00182541, achieved: 0.00182529 +[ip-26-0-150-122:0]: dataset 69, input: 0.000925742, achieved: 0.0009257 +[ip-26-0-150-122:0]: dataset 70, input: 0.00118651, achieved: 0.00118677 +[ip-26-0-150-122:0]: dataset 71, input: 0.0382814, achieved: 0.0382811 +[ip-26-0-150-122:0]: dataset 72, input: 0.113358, achieved: 0.113357 +[ip-26-0-150-122:0]: dataset 73, input: 0.0843729, achieved: 0.0843725 +[ip-26-0-150-122:0]: dataset 74, input: 0.0976984, achieved: 0.0976978 +[ip-26-0-150-122:0]: dataset 75, input: 0.0793922, achieved: 0.0793916 +[ip-26-0-150-122:0]: dataset 76, input: 0.0787533, achieved: 0.0787531 +[ip-26-0-150-122:0]: dataset 77, input: 0.0345784, achieved: 0.0345783 
+[ip-26-0-150-122:0]: dataset 78, input: 1.30386e-06, achieved: 1.08778e-06 +[ip-26-0-150-122:0]: dataset 79, input: 0.00185148, achieved: 0.0018514 +[ip-26-0-150-122:0]: dataset 80, input: 0.00122563, achieved: 0.00122593 +[ip-26-0-150-122:0]: dataset 81, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 82, input: 2.60772e-07, achieved: 1.08778e-06 +[ip-26-0-150-122:0]: dataset 83, input: 0.000143425, achieved: 0.000143587 +[ip-26-0-150-122:0]: dataset 84, input: 0.000234695, achieved: 0.00023496 +[ip-26-0-150-122:0]: dataset 85, input: 6.51931e-05, achieved: 6.52668e-05 +[ip-26-0-150-122:0]: dataset 86, input: 0.00130386, achieved: 0.00130425 +[ip-26-0-150-122:0]: dataset 87, input: 0.00130386, achieved: 0.00130425 +[ip-26-0-150-122:0]: dataset 88, input: 0.0709301, achieved: 0.0709297 +[ip-26-0-150-122:0]: dataset 89, input: 0.0417236, achieved: 0.041724 +[ip-26-0-150-122:0]: dataset 90, input: 0.0092835, achieved: 0.00928311 +[ip-26-0-150-122:0]: dataset 91, input: 0.00782317, achieved: 0.00782331 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,705 [Rank 0]: > elapsed time for building blendable dataset indices: 0.11 (sec) +[ip-26-0-150-122:0]:2023-06-21 17:28:19,705 [Rank 0]: > finished creating GPT datasets ... +[ip-26-0-155-69:7]:2023-06-21 17:28:20,378 [Rank 63]: time (ms) | model-and-optimizer-setup: 691.12 | train/valid/test-data-iterators-setup: 57399.51 +[ip-26-0-150-122:0]:2023-06-21 17:28:20,375 [Rank 0]: [after dataloaders are built] datetime: 2023-06-21 17:28:20 +[ip-26-0-150-122:0]:2023-06-21 17:28:20,375 [Rank 0]: done with setup ... +[ip-26-0-150-122:0]:2023-06-21 17:28:20,375 [Rank 0]: training ... +[ip-26-0-155-69:7]:2023-06-21 17:28:22,858 [Rank 63]: wandb: Currently logged in as: loubnabnl. Use `wandb login --relogin` to force relogin +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,107 [Rank 63]: wandb: wandb version 0.15.4 is available! To upgrade, please run: +[ip-26-0-155-69:7]:wandb: $ pip install wandb --upgrade +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,107 [Rank 63]: wandb: Tracking run with wandb version 0.13.10 +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,107 [Rank 63]: wandb: Run data is saved locally in /fsx/loubna/code/Megatron-LM/wandb/run-20230621_172822-yyzr4vv2 +[ip-26-0-155-69:7]:wandb: Run `wandb offline` to turn off syncing. 
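The "sample ratios" listing above gives, for each of the 92 component datasets, the requested weight (input) and the fraction actually obtained (achieved) once every sample of the blended validation set is assigned to exactly one component; the two differ only in the last digits because per-dataset sample counts must be integers. Below is a simplified Python sketch of that greedy weight-matching idea. In Megatron-LM the heavy lifting is done by a compiled helper, and the example weights here are made up.

# Simplified sketch of blending num_samples over weighted datasets so that the
# achieved fractions track the requested weights as closely as integer counts
# allow. Megatron-LM does this in a compiled helper; the weights are made up.
def blend(weights, num_samples):
    counts = [0] * len(weights)
    dataset_index = []
    for i in range(num_samples):
        # Pick the dataset currently furthest behind its target share.
        errors = [w * (i + 1) - c for w, c in zip(weights, counts)]
        d = max(range(len(weights)), key=lambda k: errors[k])
        counts[d] += 1
        dataset_index.append(d)
    return dataset_index, counts

weights = [0.70, 0.25, 0.05]            # assumed example weights (sum to 1)
_, counts = blend(weights, 1000)
print([c / 1000 for c in counts])       # achieved ratios, close to the inputs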
+[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,113 [Rank 63]: wandb: Syncing run 1b-starcoder +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,113 [Rank 63]: wandb: View project at https://wandb.ai/loubnabnl/1b-model +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,113 [Rank 63]: wandb: View run at https://wandb.ai/loubnabnl/1b-model/runs/yyzr4vv2 +[ip-26-0-155-69:7]: +[ip-26-0-150-122:0]:2023-06-21 17:28:30,119 [Rank 0]: [before the start of training step] datetime: 2023-06-21 17:28:30 +[ip-26-0-155-69:7]:2023-06-21 17:28:42,341 [Rank 63]: iteration 10/ 150000 | consumed samples: 640 | elapsed time per iteration (ms): 1222.0 | learning rate: 1.500E-06 | global batch size: 64 | lm loss: 1.096193E+01 | loss scale: 1.0 | grad norm: 24.321 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 77.43 | tokens-per-second-per-gpu: 6703.52 | +[ip-26-0-155-69:7]:2023-06-21 17:28:42,342 [Rank 63]: time (ms) | forward-compute: 451.99 | backward-compute: 458.74 | backward-params-all-reduce: 231.46 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 231.59 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 43.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.71 | optimizer-copy-main-to-model-params: 8.35 | optimizer: 75.57 | batch-generator: 5.70 +[ip-26-0-150-122:0]:2023-06-21 17:28:42,339 [Rank 0]: [Rank 0] (after 10 iterations) memory (MB) | allocated: 19521.45947265625 | max allocated: 35040.9794921875 | reserved: 36068.0 | max reserved: 36068.0 +[ip-26-0-155-69:7]:2023-06-21 17:28:51,279 [Rank 63]: iteration 20/ 150000 | consumed samples: 1280 | elapsed time per iteration (ms): 893.8 | learning rate: 3.000E-06 | global batch size: 64 | lm loss: 9.533918E+00 | loss scale: 1.0 | grad norm: 10.686 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.86 | tokens-per-second-per-gpu: 9164.90 | +[ip-26-0-155-69:7]:2023-06-21 17:28:51,280 [Rank 63]: time (ms) | forward-compute: 223.56 | backward-compute: 398.31 | backward-params-all-reduce: 225.90 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:29:00,210 [Rank 63]: iteration 30/ 150000 | consumed samples: 1920 | elapsed time per iteration (ms): 893.0 | learning rate: 4.500E-06 | global batch size: 64 | lm loss: 8.796992E+00 | loss scale: 1.0 | grad norm: 4.760 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.13 | +[ip-26-0-155-69:7]:2023-06-21 17:29:00,210 [Rank 63]: time (ms) | forward-compute: 223.11 | backward-compute: 398.41 | backward-params-all-reduce: 225.46 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.57 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.94 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:29:09,139 [Rank 63]: iteration 40/ 150000 | consumed samples: 2560 | elapsed time per 
iteration (ms): 892.9 | learning rate: 6.000E-06 | global batch size: 64 | lm loss: 8.444675E+00 | loss scale: 1.0 | grad norm: 3.994 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.64 | +[ip-26-0-155-69:7]:2023-06-21 17:29:09,140 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.37 | backward-params-all-reduce: 225.39 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.51 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.91 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:29:18,068 [Rank 63]: iteration 50/ 150000 | consumed samples: 3200 | elapsed time per iteration (ms): 893.0 | learning rate: 7.500E-06 | global batch size: 64 | lm loss: 8.253671E+00 | loss scale: 1.0 | grad norm: 4.000 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.97 | +[ip-26-0-155-69:7]:2023-06-21 17:29:18,069 [Rank 63]: time (ms) | forward-compute: 223.13 | backward-compute: 398.25 | backward-params-all-reduce: 225.57 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.67 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:29:27,000 [Rank 63]: iteration 60/ 150000 | consumed samples: 3840 | elapsed time per iteration (ms): 893.1 | learning rate: 9.000E-06 | global batch size: 64 | lm loss: 7.951717E+00 | loss scale: 1.0 | grad norm: 4.310 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.19 | +[ip-26-0-155-69:7]:2023-06-21 17:29:27,000 [Rank 63]: time (ms) | forward-compute: 223.03 | backward-compute: 398.36 | backward-params-all-reduce: 225.73 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.83 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.93 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:29:35,936 [Rank 63]: iteration 70/ 150000 | consumed samples: 4480 | elapsed time per iteration (ms): 893.6 | learning rate: 1.050E-05 | global batch size: 64 | lm loss: 7.758693E+00 | loss scale: 1.0 | grad norm: 5.879 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.89 | tokens-per-second-per-gpu: 9167.51 | +[ip-26-0-155-69:7]:2023-06-21 17:29:35,936 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.32 | backward-params-all-reduce: 225.93 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.04 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.91 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:29:44,869 [Rank 63]: iteration 80/ 150000 | consumed 
samples: 5120 | elapsed time per iteration (ms): 893.3 | learning rate: 1.200E-05 | global batch size: 64 | lm loss: 7.419704E+00 | loss scale: 1.0 | grad norm: 6.722 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.41 | +[ip-26-0-155-69:7]:2023-06-21 17:29:44,870 [Rank 63]: time (ms) | forward-compute: 223.34 | backward-compute: 398.20 | backward-params-all-reduce: 225.77 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:29:53,797 [Rank 63]: iteration 90/ 150000 | consumed samples: 5760 | elapsed time per iteration (ms): 892.8 | learning rate: 1.350E-05 | global batch size: 64 | lm loss: 7.135265E+00 | loss scale: 1.0 | grad norm: 5.285 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.39 | +[ip-26-0-155-69:7]:2023-06-21 17:29:53,798 [Rank 63]: time (ms) | forward-compute: 222.86 | backward-compute: 398.29 | backward-params-all-reduce: 225.67 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.78 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.89 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:30:02,728 [Rank 63]: iteration 100/ 150000 | consumed samples: 6400 | elapsed time per iteration (ms): 893.1 | learning rate: 1.500E-05 | global batch size: 64 | lm loss: 7.016300E+00 | loss scale: 1.0 | grad norm: 4.335 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.48 | +[ip-26-0-155-69:7]:2023-06-21 17:30:02,729 [Rank 63]: time (ms) | forward-compute: 223.11 | backward-compute: 398.33 | backward-params-all-reduce: 225.51 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.61 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.58 | optimizer-clip-main-grad: 10.96 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 42.01 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:30:11,659 [Rank 63]: iteration 110/ 150000 | consumed samples: 7040 | elapsed time per iteration (ms): 893.0 | learning rate: 1.650E-05 | global batch size: 64 | lm loss: 6.814932E+00 | loss scale: 1.0 | grad norm: 3.932 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.07 | +[ip-26-0-155-69:7]:2023-06-21 17:30:11,659 [Rank 63]: time (ms) | forward-compute: 223.32 | backward-compute: 398.36 | backward-params-all-reduce: 225.42 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.51 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:30:20,587 [Rank 
63]: iteration 120/ 150000 | consumed samples: 7680 | elapsed time per iteration (ms): 892.8 | learning rate: 1.800E-05 | global batch size: 64 | lm loss: 6.757275E+00 | loss scale: 1.0 | grad norm: 3.359 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9175.17 | +[ip-26-0-155-69:7]:2023-06-21 17:30:20,588 [Rank 63]: time (ms) | forward-compute: 222.69 | backward-compute: 398.32 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.85 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.58 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.95 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:30:29,514 [Rank 63]: iteration 130/ 150000 | consumed samples: 8320 | elapsed time per iteration (ms): 892.6 | learning rate: 1.950E-05 | global batch size: 64 | lm loss: 6.519125E+00 | loss scale: 1.0 | grad norm: 3.028 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.01 | tokens-per-second-per-gpu: 9177.21 | +[ip-26-0-155-69:7]:2023-06-21 17:30:29,514 [Rank 63]: time (ms) | forward-compute: 222.84 | backward-compute: 398.26 | backward-params-all-reduce: 225.66 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.76 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.81 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:30:38,456 [Rank 63]: iteration 140/ 150000 | consumed samples: 8960 | elapsed time per iteration (ms): 894.2 | learning rate: 2.100E-05 | global batch size: 64 | lm loss: 6.416656E+00 | loss scale: 1.0 | grad norm: 3.510 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.82 | tokens-per-second-per-gpu: 9161.28 | +[ip-26-0-155-69:7]:2023-06-21 17:30:38,456 [Rank 63]: time (ms) | forward-compute: 224.22 | backward-compute: 398.30 | backward-params-all-reduce: 225.72 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.82 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.85 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:30:47,384 [Rank 63]: iteration 150/ 150000 | consumed samples: 9600 | elapsed time per iteration (ms): 892.8 | learning rate: 2.250E-05 | global batch size: 64 | lm loss: 6.377288E+00 | loss scale: 1.0 | grad norm: 3.265 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9175.13 | +[ip-26-0-155-69:7]:2023-06-21 17:30:47,385 [Rank 63]: time (ms) | forward-compute: 223.14 | backward-compute: 398.10 | backward-params-all-reduce: 225.73 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.84 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.82 | batch-generator: 1.73 
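The per-iteration lines above can be cross-checked with simple arithmetic: a global batch of 64 sequences of 8192 tokens, about 893 ms per iteration, and 64 GPUs (ranks 0 through 63 appear in this log) give the reported ~9170 tokens-per-second-per-gpu, and the learning rate climbs by 1.5e-06 every 10 iterations, i.e. a linear warmup of 1.5e-07 per step (the configured warmup length and peak LR are not shown in this excerpt). A sketch of both checks:

# Sanity checks against the iteration lines above (illustrative arithmetic only).
global_batch_size = 64        # "global batch size: 64"
seq_length = 8_192            # sequence length used throughout this run
iter_time_s = 0.8928          # "elapsed time per iteration (ms): 892.8" at iteration 150
num_gpus = 64                 # ranks 0..63 appear in the log

tokens_per_iter = global_batch_size * seq_length              # 524288
tokens_per_s_per_gpu = tokens_per_iter / iter_time_s / num_gpus
print(round(tokens_per_s_per_gpu))      # ~9176, vs. the reported 9175.13

# Linear warmup: the printed LR rises by 1.5e-06 every 10 iterations.
lr_per_step = 1.5e-07
print(f"{lr_per_step * 150:.3e}")       # 2.250e-05, matching iteration 150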
+[ip-26-0-155-69:7]:2023-06-21 17:30:56,316 [Rank 63]: iteration 160/ 150000 | consumed samples: 10240 | elapsed time per iteration (ms): 893.2 | learning rate: 2.400E-05 | global batch size: 64 | lm loss: 6.216093E+00 | loss scale: 1.0 | grad norm: 3.617 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.63 | +[ip-26-0-155-69:7]:2023-06-21 17:30:56,316 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.22 | backward-params-all-reduce: 225.42 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.52 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.84 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:31:05,247 [Rank 63]: iteration 170/ 150000 | consumed samples: 10880 | elapsed time per iteration (ms): 893.1 | learning rate: 2.550E-05 | global batch size: 64 | lm loss: 6.279401E+00 | loss scale: 1.0 | grad norm: 3.731 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.16 | +[ip-26-0-155-69:7]:2023-06-21 17:31:05,248 [Rank 63]: time (ms) | forward-compute: 223.37 | backward-compute: 398.20 | backward-params-all-reduce: 225.58 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.68 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.91 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:31:14,182 [Rank 63]: iteration 180/ 150000 | consumed samples: 11520 | elapsed time per iteration (ms): 893.5 | learning rate: 2.700E-05 | global batch size: 64 | lm loss: 6.152369E+00 | loss scale: 1.0 | grad norm: 3.729 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9168.89 | +[ip-26-0-155-69:7]:2023-06-21 17:31:14,183 [Rank 63]: time (ms) | forward-compute: 223.66 | backward-compute: 398.17 | backward-params-all-reduce: 225.65 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.77 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.89 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:31:23,115 [Rank 63]: iteration 190/ 150000 | consumed samples: 12160 | elapsed time per iteration (ms): 893.4 | learning rate: 2.850E-05 | global batch size: 64 | lm loss: 6.109496E+00 | loss scale: 1.0 | grad norm: 3.128 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.91 | +[ip-26-0-155-69:7]:2023-06-21 17:31:23,116 [Rank 63]: time (ms) | forward-compute: 223.58 | backward-compute: 398.16 | backward-params-all-reduce: 225.68 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.77 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | 
optimizer: 41.87 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:31:32,048 [Rank 63]: iteration 200/ 150000 | consumed samples: 12800 | elapsed time per iteration (ms): 893.3 | learning rate: 3.000E-05 | global batch size: 64 | lm loss: 6.104686E+00 | loss scale: 1.0 | grad norm: 4.132 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.54 | +[ip-26-0-155-69:7]:2023-06-21 17:31:32,049 [Rank 63]: time (ms) | forward-compute: 223.54 | backward-compute: 398.16 | backward-params-all-reduce: 225.67 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.78 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.86 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:31:40,980 [Rank 63]: iteration 210/ 150000 | consumed samples: 13440 | elapsed time per iteration (ms): 893.2 | learning rate: 3.150E-05 | global batch size: 64 | lm loss: 5.995741E+00 | loss scale: 1.0 | grad norm: 3.759 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.78 | +[ip-26-0-155-69:7]:2023-06-21 17:31:40,981 [Rank 63]: time (ms) | forward-compute: 223.24 | backward-compute: 398.19 | backward-params-all-reduce: 225.70 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.81 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.96 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:31:49,920 [Rank 63]: iteration 220/ 150000 | consumed samples: 14080 | elapsed time per iteration (ms): 894.0 | learning rate: 3.300E-05 | global batch size: 64 | lm loss: 6.099563E+00 | loss scale: 1.0 | grad norm: 3.449 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.84 | tokens-per-second-per-gpu: 9163.10 | +[ip-26-0-155-69:7]:2023-06-21 17:31:49,921 [Rank 63]: time (ms) | forward-compute: 223.51 | backward-compute: 398.19 | backward-params-all-reduce: 226.34 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.45 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.91 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:31:58,857 [Rank 63]: iteration 230/ 150000 | consumed samples: 14720 | elapsed time per iteration (ms): 893.7 | learning rate: 3.450E-05 | global batch size: 64 | lm loss: 5.972797E+00 | loss scale: 1.0 | grad norm: 2.340 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.35 | +[ip-26-0-155-69:7]:2023-06-21 17:31:58,858 [Rank 63]: time (ms) | forward-compute: 223.74 | backward-compute: 398.25 | backward-params-all-reduce: 225.76 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.88 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | 
optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.88 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:32:07,795 [Rank 63]: iteration 240/ 150000 | consumed samples: 15360 | elapsed time per iteration (ms): 893.7 | learning rate: 3.600E-05 | global batch size: 64 | lm loss: 5.918838E+00 | loss scale: 1.0 | grad norm: 3.002 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.09 | +[ip-26-0-155-69:7]:2023-06-21 17:32:07,796 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.17 | backward-params-all-reduce: 225.87 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.99 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.87 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:32:16,738 [Rank 63]: iteration 250/ 150000 | consumed samples: 16000 | elapsed time per iteration (ms): 894.3 | learning rate: 3.750E-05 | global batch size: 64 | lm loss: 5.861612E+00 | loss scale: 1.0 | grad norm: 2.419 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.81 | tokens-per-second-per-gpu: 9159.76 | +[ip-26-0-155-69:7]:2023-06-21 17:32:16,739 [Rank 63]: time (ms) | forward-compute: 224.00 | backward-compute: 398.17 | backward-params-all-reduce: 226.14 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.90 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:32:25,676 [Rank 63]: iteration 260/ 150000 | consumed samples: 16640 | elapsed time per iteration (ms): 893.7 | learning rate: 3.900E-05 | global batch size: 64 | lm loss: 5.845439E+00 | loss scale: 1.0 | grad norm: 2.481 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.01 | +[ip-26-0-155-69:7]:2023-06-21 17:32:25,676 [Rank 63]: time (ms) | forward-compute: 223.57 | backward-compute: 398.23 | backward-params-all-reduce: 226.00 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.10 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.84 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:32:34,607 [Rank 63]: iteration 270/ 150000 | consumed samples: 17280 | elapsed time per iteration (ms): 893.2 | learning rate: 4.050E-05 | global batch size: 64 | lm loss: 5.770058E+00 | loss scale: 1.0 | grad norm: 1.942 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.01 | +[ip-26-0-155-69:7]:2023-06-21 17:32:34,608 [Rank 63]: time (ms) | forward-compute: 223.55 | backward-compute: 398.10 | backward-params-all-reduce: 225.55 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.65 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.89 | 
optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:32:43,541 [Rank 63]: iteration 280/ 150000 | consumed samples: 17920 | elapsed time per iteration (ms): 893.4 | learning rate: 4.200E-05 | global batch size: 64 | lm loss: 5.792897E+00 | loss scale: 1.0 | grad norm: 2.421 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.78 | +[ip-26-0-155-69:7]:2023-06-21 17:32:43,541 [Rank 63]: time (ms) | forward-compute: 223.18 | backward-compute: 398.16 | backward-params-all-reduce: 226.06 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.17 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:32:52,473 [Rank 63]: iteration 290/ 150000 | consumed samples: 18560 | elapsed time per iteration (ms): 893.2 | learning rate: 4.350E-05 | global batch size: 64 | lm loss: 5.725514E+00 | loss scale: 1.0 | grad norm: 2.173 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.27 | +[ip-26-0-155-69:7]:2023-06-21 17:32:52,474 [Rank 63]: time (ms) | forward-compute: 223.29 | backward-compute: 398.16 | backward-params-all-reduce: 225.78 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.93 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:33:01,403 [Rank 63]: iteration 300/ 150000 | consumed samples: 19200 | elapsed time per iteration (ms): 893.0 | learning rate: 4.500E-05 | global batch size: 64 | lm loss: 5.613900E+00 | loss scale: 1.0 | grad norm: 3.062 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.31 | +[ip-26-0-155-69:7]:2023-06-21 17:33:01,404 [Rank 63]: time (ms) | forward-compute: 223.35 | backward-compute: 398.07 | backward-params-all-reduce: 225.67 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.79 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.85 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.86 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:33:10,337 [Rank 63]: iteration 310/ 150000 | consumed samples: 19840 | elapsed time per iteration (ms): 893.4 | learning rate: 4.650E-05 | global batch size: 64 | lm loss: 5.624342E+00 | loss scale: 1.0 | grad norm: 2.426 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.85 | +[ip-26-0-155-69:7]:2023-06-21 17:33:10,338 [Rank 63]: time (ms) | forward-compute: 223.49 | backward-compute: 398.17 | backward-params-all-reduce: 225.69 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.82 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 
0.54 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.90 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:33:19,368 [Rank 63]: iteration 320/ 150000 | consumed samples: 20480 | elapsed time per iteration (ms): 903.1 | learning rate: 4.800E-05 | global batch size: 64 | lm loss: 5.548281E+00 | loss scale: 1.0 | grad norm: 2.668 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.78 | tokens-per-second-per-gpu: 9071.09 | +[ip-26-0-155-69:7]:2023-06-21 17:33:19,368 [Rank 63]: time (ms) | forward-compute: 233.18 | backward-compute: 398.24 | backward-params-all-reduce: 225.56 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.67 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.88 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:33:28,306 [Rank 63]: iteration 330/ 150000 | consumed samples: 21120 | elapsed time per iteration (ms): 893.8 | learning rate: 4.950E-05 | global batch size: 64 | lm loss: 5.607609E+00 | loss scale: 1.0 | grad norm: 2.161 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.87 | tokens-per-second-per-gpu: 9165.07 | +[ip-26-0-155-69:7]:2023-06-21 17:33:28,307 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.14 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.13 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.88 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:33:37,243 [Rank 63]: iteration 340/ 150000 | consumed samples: 21760 | elapsed time per iteration (ms): 893.7 | learning rate: 5.100E-05 | global batch size: 64 | lm loss: 5.562651E+00 | loss scale: 1.0 | grad norm: 2.971 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.56 | +[ip-26-0-155-69:7]:2023-06-21 17:33:37,244 [Rank 63]: time (ms) | forward-compute: 223.80 | backward-compute: 398.15 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.84 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:33:46,183 [Rank 63]: iteration 350/ 150000 | consumed samples: 22400 | elapsed time per iteration (ms): 893.9 | learning rate: 5.250E-05 | global batch size: 64 | lm loss: 5.522157E+00 | loss scale: 1.0 | grad norm: 2.511 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.85 | tokens-per-second-per-gpu: 9163.88 | +[ip-26-0-155-69:7]:2023-06-21 17:33:46,183 [Rank 63]: time (ms) | forward-compute: 223.56 | backward-compute: 398.24 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | 
backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.90 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:33:55,116 [Rank 63]: iteration 360/ 150000 | consumed samples: 23040 | elapsed time per iteration (ms): 893.3 | learning rate: 5.400E-05 | global batch size: 64 | lm loss: 5.491142E+00 | loss scale: 1.0 | grad norm: 2.720 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.28 | +[ip-26-0-155-69:7]:2023-06-21 17:33:55,116 [Rank 63]: time (ms) | forward-compute: 223.39 | backward-compute: 398.24 | backward-params-all-reduce: 225.69 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.79 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.94 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:34:04,050 [Rank 63]: iteration 370/ 150000 | consumed samples: 23680 | elapsed time per iteration (ms): 893.4 | learning rate: 5.550E-05 | global batch size: 64 | lm loss: 5.483192E+00 | loss scale: 1.0 | grad norm: 2.772 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9169.19 | +[ip-26-0-155-69:7]:2023-06-21 17:34:04,051 [Rank 63]: time (ms) | forward-compute: 223.31 | backward-compute: 398.21 | backward-params-all-reduce: 225.87 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.98 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.93 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:34:12,982 [Rank 63]: iteration 380/ 150000 | consumed samples: 24320 | elapsed time per iteration (ms): 893.2 | learning rate: 5.700E-05 | global batch size: 64 | lm loss: 5.409019E+00 | loss scale: 1.0 | grad norm: 2.076 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9171.99 | +[ip-26-0-155-69:7]:2023-06-21 17:34:12,982 [Rank 63]: time (ms) | forward-compute: 223.24 | backward-compute: 398.16 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:34:21,911 [Rank 63]: iteration 390/ 150000 | consumed samples: 24960 | elapsed time per iteration (ms): 892.9 | learning rate: 5.850E-05 | global batch size: 64 | lm loss: 5.394781E+00 | loss scale: 1.0 | grad norm: 2.562 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.66 | +[ip-26-0-155-69:7]:2023-06-21 17:34:21,911 [Rank 63]: time (ms) | forward-compute: 223.04 | backward-compute: 398.13 | backward-params-all-reduce: 225.65 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 
0.03 | backward-reduce-model-grads: 225.76 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.95 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:34:30,843 [Rank 63]: iteration 400/ 150000 | consumed samples: 25600 | elapsed time per iteration (ms): 893.2 | learning rate: 6.000E-05 | global batch size: 64 | lm loss: 5.375826E+00 | loss scale: 1.0 | grad norm: 2.044 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.41 | +[ip-26-0-155-69:7]:2023-06-21 17:34:30,843 [Rank 63]: time (ms) | forward-compute: 223.65 | backward-compute: 398.08 | backward-params-all-reduce: 225.57 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.67 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.86 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:34:39,772 [Rank 63]: iteration 410/ 150000 | consumed samples: 26240 | elapsed time per iteration (ms): 893.0 | learning rate: 6.150E-05 | global batch size: 64 | lm loss: 5.348161E+00 | loss scale: 1.0 | grad norm: 2.454 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.80 | +[ip-26-0-155-69:7]:2023-06-21 17:34:39,773 [Rank 63]: time (ms) | forward-compute: 223.43 | backward-compute: 398.14 | backward-params-all-reduce: 225.46 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.56 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.60 | optimizer-clip-main-grad: 10.82 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.88 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:34:48,707 [Rank 63]: iteration 420/ 150000 | consumed samples: 26880 | elapsed time per iteration (ms): 893.5 | learning rate: 6.300E-05 | global batch size: 64 | lm loss: 5.274976E+00 | loss scale: 1.0 | grad norm: 2.302 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9168.69 | +[ip-26-0-155-69:7]:2023-06-21 17:34:48,708 [Rank 63]: time (ms) | forward-compute: 223.82 | backward-compute: 398.13 | backward-params-all-reduce: 225.51 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.61 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.94 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:34:57,639 [Rank 63]: iteration 430/ 150000 | consumed samples: 27520 | elapsed time per iteration (ms): 893.2 | learning rate: 6.450E-05 | global batch size: 64 | lm loss: 5.287198E+00 | loss scale: 1.0 | grad norm: 2.816 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.52 | +[ip-26-0-155-69:7]:2023-06-21 17:34:57,640 [Rank 63]: time (ms) | forward-compute: 223.67 | backward-compute: 398.00 | backward-params-all-reduce: 225.56 | 
backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.67 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.90 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:35:06,572 [Rank 63]: iteration 440/ 150000 | consumed samples: 28160 | elapsed time per iteration (ms): 893.3 | learning rate: 6.600E-05 | global batch size: 64 | lm loss: 5.272359E+00 | loss scale: 1.0 | grad norm: 2.385 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.71 | +[ip-26-0-155-69:7]:2023-06-21 17:35:06,572 [Rank 63]: time (ms) | forward-compute: 223.48 | backward-compute: 398.14 | backward-params-all-reduce: 225.62 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.73 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.94 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:35:15,506 [Rank 63]: iteration 450/ 150000 | consumed samples: 28800 | elapsed time per iteration (ms): 893.4 | learning rate: 6.750E-05 | global batch size: 64 | lm loss: 5.157737E+00 | loss scale: 1.0 | grad norm: 1.884 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.48 | +[ip-26-0-155-69:7]:2023-06-21 17:35:15,507 [Rank 63]: time (ms) | forward-compute: 223.19 | backward-compute: 398.11 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.94 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:35:24,447 [Rank 63]: iteration 460/ 150000 | consumed samples: 29440 | elapsed time per iteration (ms): 894.1 | learning rate: 6.900E-05 | global batch size: 64 | lm loss: 5.164676E+00 | loss scale: 1.0 | grad norm: 2.064 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.84 | tokens-per-second-per-gpu: 9162.49 | +[ip-26-0-155-69:7]:2023-06-21 17:35:24,447 [Rank 63]: time (ms) | forward-compute: 222.98 | backward-compute: 398.16 | backward-params-all-reduce: 226.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.87 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.97 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 42.02 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:35:33,377 [Rank 63]: iteration 470/ 150000 | consumed samples: 30080 | elapsed time per iteration (ms): 893.0 | learning rate: 7.050E-05 | global batch size: 64 | lm loss: 5.136440E+00 | loss scale: 1.0 | grad norm: 2.273 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.14 | +[ip-26-0-155-69:7]:2023-06-21 17:35:33,378 [Rank 63]: time (ms) | forward-compute: 223.44 | backward-compute: 
398.14 | backward-params-all-reduce: 225.53 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.64 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:35:42,308 [Rank 63]: iteration 480/ 150000 | consumed samples: 30720 | elapsed time per iteration (ms): 893.1 | learning rate: 7.200E-05 | global batch size: 64 | lm loss: 5.159489E+00 | loss scale: 1.0 | grad norm: 1.732 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.50 | +[ip-26-0-155-69:7]:2023-06-21 17:35:42,309 [Rank 63]: time (ms) | forward-compute: 223.25 | backward-compute: 398.14 | backward-params-all-reduce: 225.71 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.82 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.91 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:35:51,239 [Rank 63]: iteration 490/ 150000 | consumed samples: 31360 | elapsed time per iteration (ms): 893.1 | learning rate: 7.350E-05 | global batch size: 64 | lm loss: 5.114197E+00 | loss scale: 1.0 | grad norm: 1.781 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9172.75 | +[ip-26-0-155-69:7]:2023-06-21 17:35:51,240 [Rank 63]: time (ms) | forward-compute: 222.66 | backward-compute: 398.17 | backward-params-all-reduce: 226.20 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.31 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.93 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:36:00,171 [Rank 63]: iteration 500/ 150000 | consumed samples: 32000 | elapsed time per iteration (ms): 893.2 | learning rate: 7.500E-05 | global batch size: 64 | lm loss: 5.069198E+00 | loss scale: 1.0 | grad norm: 1.696 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.36 | +[ip-26-0-155-69:7]:2023-06-21 17:36:00,172 [Rank 63]: time (ms) | forward-compute: 222.62 | backward-compute: 398.19 | backward-params-all-reduce: 226.32 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.43 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.97 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.98 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:36:09,107 [Rank 63]: iteration 510/ 150000 | consumed samples: 32640 | elapsed time per iteration (ms): 893.6 | learning rate: 7.650E-05 | global batch size: 64 | lm loss: 5.068162E+00 | loss scale: 1.0 | grad norm: 2.051 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.90 | tokens-per-second-per-gpu: 9167.91 | +[ip-26-0-155-69:7]:2023-06-21 17:36:09,108 [Rank 63]: time (ms) | 
forward-compute: 222.88 | backward-compute: 398.32 | backward-params-all-reduce: 226.19 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.32 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.07 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.97 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 17:36:18,040 [Rank 63]: iteration 520/ 150000 | consumed samples: 33280 | elapsed time per iteration (ms): 893.3 | learning rate: 7.800E-05 | global batch size: 64 | lm loss: 5.031533E+00 | loss scale: 1.0 | grad norm: 1.916 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9170.03 | +[ip-26-0-155-69:7]:2023-06-21 17:36:18,041 [Rank 63]: time (ms) | forward-compute: 223.18 | backward-compute: 398.20 | backward-params-all-reduce: 225.92 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.04 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:36:26,973 [Rank 63]: iteration 530/ 150000 | consumed samples: 33920 | elapsed time per iteration (ms): 893.3 | learning rate: 7.950E-05 | global batch size: 64 | lm loss: 5.029686E+00 | loss scale: 1.0 | grad norm: 1.798 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.73 | +[ip-26-0-155-69:7]:2023-06-21 17:36:26,973 [Rank 63]: time (ms) | forward-compute: 222.69 | backward-compute: 398.25 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.38 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.95 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.96 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:36:35,906 [Rank 63]: iteration 540/ 150000 | consumed samples: 34560 | elapsed time per iteration (ms): 893.3 | learning rate: 8.100E-05 | global batch size: 64 | lm loss: 5.031442E+00 | loss scale: 1.0 | grad norm: 1.951 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.17 | +[ip-26-0-155-69:7]:2023-06-21 17:36:35,907 [Rank 63]: time (ms) | forward-compute: 222.88 | backward-compute: 398.24 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.29 | backward-gather-model-params: 0.02 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.92 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:36:44,841 [Rank 63]: iteration 550/ 150000 | consumed samples: 35200 | elapsed time per iteration (ms): 893.5 | learning rate: 8.250E-05 | global batch size: 64 | lm loss: 5.012273E+00 | loss scale: 1.0 | grad norm: 1.726 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9168.83 | 
+[ip-26-0-155-69:7]:2023-06-21 17:36:44,841 [Rank 63]: time (ms) | forward-compute: 223.23 | backward-compute: 398.20 | backward-params-all-reduce: 226.03 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.92 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:36:53,775 [Rank 63]: iteration 560/ 150000 | consumed samples: 35840 | elapsed time per iteration (ms): 893.4 | learning rate: 8.400E-05 | global batch size: 64 | lm loss: 4.852672E+00 | loss scale: 1.0 | grad norm: 1.536 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.49 | +[ip-26-0-155-69:7]:2023-06-21 17:36:53,775 [Rank 63]: time (ms) | forward-compute: 223.19 | backward-compute: 398.17 | backward-params-all-reduce: 226.08 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.18 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.88 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:37:02,768 [Rank 63]: iteration 570/ 150000 | consumed samples: 36480 | elapsed time per iteration (ms): 899.3 | learning rate: 8.550E-05 | global batch size: 64 | lm loss: 4.964608E+00 | loss scale: 1.0 | grad norm: 1.570 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.23 | tokens-per-second-per-gpu: 9109.74 | +[ip-26-0-155-69:7]:2023-06-21 17:37:02,768 [Rank 63]: time (ms) | forward-compute: 227.82 | backward-compute: 398.21 | backward-params-all-reduce: 227.14 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.24 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 11.00 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 42.02 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:37:11,738 [Rank 63]: iteration 580/ 150000 | consumed samples: 37120 | elapsed time per iteration (ms): 897.1 | learning rate: 8.700E-05 | global batch size: 64 | lm loss: 4.988046E+00 | loss scale: 1.0 | grad norm: 1.668 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.48 | tokens-per-second-per-gpu: 9132.00 | +[ip-26-0-155-69:7]:2023-06-21 17:37:11,739 [Rank 63]: time (ms) | forward-compute: 225.71 | backward-compute: 398.15 | backward-params-all-reduce: 226.45 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.55 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 11.68 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 42.69 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 17:37:20,711 [Rank 63]: iteration 590/ 150000 | consumed samples: 37760 | elapsed time per iteration (ms): 897.3 | learning rate: 8.850E-05 | global batch size: 64 | lm loss: 4.848716E+00 | loss scale: 1.0 | grad norm: 1.516 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 
105.45 | tokens-per-second-per-gpu: 9129.36 | +[ip-26-0-155-69:7]:2023-06-21 17:37:20,712 [Rank 63]: time (ms) | forward-compute: 225.90 | backward-compute: 398.07 | backward-params-all-reduce: 227.47 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.56 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.81 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:37:29,703 [Rank 63]: iteration 600/ 150000 | consumed samples: 38400 | elapsed time per iteration (ms): 899.2 | learning rate: 9.000E-05 | global batch size: 64 | lm loss: 4.889231E+00 | loss scale: 1.0 | grad norm: 1.931 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.24 | tokens-per-second-per-gpu: 9110.76 | +[ip-26-0-155-69:7]:2023-06-21 17:37:29,704 [Rank 63]: time (ms) | forward-compute: 227.13 | backward-compute: 398.11 | backward-params-all-reduce: 227.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.43 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 11.58 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 42.55 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:37:38,696 [Rank 63]: iteration 610/ 150000 | consumed samples: 39040 | elapsed time per iteration (ms): 899.3 | learning rate: 9.150E-05 | global batch size: 64 | lm loss: 4.808221E+00 | loss scale: 1.0 | grad norm: 1.401 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.23 | tokens-per-second-per-gpu: 9109.76 | +[ip-26-0-155-69:7]:2023-06-21 17:37:38,696 [Rank 63]: time (ms) | forward-compute: 228.96 | backward-compute: 398.22 | backward-params-all-reduce: 226.10 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.20 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.90 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:37:47,636 [Rank 63]: iteration 620/ 150000 | consumed samples: 39680 | elapsed time per iteration (ms): 894.1 | learning rate: 9.300E-05 | global batch size: 64 | lm loss: 4.808089E+00 | loss scale: 1.0 | grad norm: 1.950 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.84 | tokens-per-second-per-gpu: 9162.50 | +[ip-26-0-155-69:7]:2023-06-21 17:37:47,637 [Rank 63]: time (ms) | forward-compute: 223.79 | backward-compute: 398.16 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.84 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:37:56,582 [Rank 63]: iteration 630/ 150000 | consumed samples: 40320 | elapsed time per iteration (ms): 894.5 | learning rate: 9.450E-05 | global batch size: 64 | lm loss: 4.800224E+00 | loss scale: 1.0 | grad norm: 1.574 | number of skipped iterations: 
0 | number of nan iterations: 0 | TFLOPs: 105.79 | tokens-per-second-per-gpu: 9158.00 | +[ip-26-0-155-69:7]:2023-06-21 17:37:56,582 [Rank 63]: time (ms) | forward-compute: 223.59 | backward-compute: 398.13 | backward-params-all-reduce: 226.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.97 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:38:05,535 [Rank 63]: iteration 640/ 150000 | consumed samples: 40960 | elapsed time per iteration (ms): 895.4 | learning rate: 9.600E-05 | global batch size: 64 | lm loss: 4.780347E+00 | loss scale: 1.0 | grad norm: 1.543 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.68 | tokens-per-second-per-gpu: 9149.31 | +[ip-26-0-155-69:7]:2023-06-21 17:38:05,536 [Rank 63]: time (ms) | forward-compute: 224.03 | backward-compute: 398.16 | backward-params-all-reduce: 227.10 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 227.21 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.95 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:38:14,479 [Rank 63]: iteration 650/ 150000 | consumed samples: 41600 | elapsed time per iteration (ms): 894.4 | learning rate: 9.750E-05 | global batch size: 64 | lm loss: 4.692219E+00 | loss scale: 1.0 | grad norm: 1.669 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.80 | tokens-per-second-per-gpu: 9159.09 | +[ip-26-0-155-69:7]:2023-06-21 17:38:14,480 [Rank 63]: time (ms) | forward-compute: 224.39 | backward-compute: 398.19 | backward-params-all-reduce: 225.91 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.84 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.80 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:38:23,427 [Rank 63]: iteration 660/ 150000 | consumed samples: 42240 | elapsed time per iteration (ms): 894.7 | learning rate: 9.900E-05 | global batch size: 64 | lm loss: 4.747536E+00 | loss scale: 1.0 | grad norm: 1.602 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.76 | tokens-per-second-per-gpu: 9155.98 | +[ip-26-0-155-69:7]:2023-06-21 17:38:23,427 [Rank 63]: time (ms) | forward-compute: 224.55 | backward-compute: 398.17 | backward-params-all-reduce: 225.81 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.91 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 11.04 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.09 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 42.10 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:38:32,487 [Rank 63]: iteration 670/ 150000 | consumed samples: 42880 | elapsed time per iteration (ms): 906.0 | learning rate: 1.005E-04 | global batch size: 64 | lm loss: 4.675434E+00 | loss scale: 1.0 | grad 
norm: 1.829 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.44 | tokens-per-second-per-gpu: 9041.71 | +[ip-26-0-155-69:7]:2023-06-21 17:38:32,487 [Rank 63]: time (ms) | forward-compute: 235.41 | backward-compute: 398.13 | backward-params-all-reduce: 225.79 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.90 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:38:42,562 [Rank 63]: iteration 680/ 150000 | consumed samples: 43520 | elapsed time per iteration (ms): 1007.5 | learning rate: 1.020E-04 | global batch size: 64 | lm loss: 4.675757E+00 | loss scale: 1.0 | grad norm: 1.328 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 93.92 | tokens-per-second-per-gpu: 8130.96 | +[ip-26-0-155-69:7]:2023-06-21 17:38:42,562 [Rank 63]: time (ms) | forward-compute: 287.37 | backward-compute: 399.98 | backward-params-all-reduce: 261.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 261.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 1.71 | optimizer-clip-main-grad: 20.31 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.26 | optimizer-copy-main-to-model-params: 8.58 | optimizer: 52.96 | batch-generator: 5.67 +[ip-26-0-155-69:7]:2023-06-21 17:38:51,816 [Rank 63]: iteration 690/ 150000 | consumed samples: 44160 | elapsed time per iteration (ms): 925.4 | learning rate: 1.035E-04 | global batch size: 64 | lm loss: 4.607193E+00 | loss scale: 1.0 | grad norm: 1.564 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 102.26 | tokens-per-second-per-gpu: 8852.63 | +[ip-26-0-155-69:7]:2023-06-21 17:38:51,816 [Rank 63]: time (ms) | forward-compute: 231.94 | backward-compute: 400.73 | backward-params-all-reduce: 245.32 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 245.42 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.63 | optimizer-clip-main-grad: 11.44 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.64 | optimizer-copy-main-to-model-params: 8.35 | optimizer: 43.16 | batch-generator: 3.11 +[ip-26-0-155-69:7]:2023-06-21 17:39:00,757 [Rank 63]: iteration 700/ 150000 | consumed samples: 44800 | elapsed time per iteration (ms): 894.1 | learning rate: 1.050E-04 | global batch size: 64 | lm loss: 4.614832E+00 | loss scale: 1.0 | grad norm: 1.521 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.83 | tokens-per-second-per-gpu: 9162.08 | +[ip-26-0-155-69:7]:2023-06-21 17:39:00,757 [Rank 63]: time (ms) | forward-compute: 224.12 | backward-compute: 398.13 | backward-params-all-reduce: 225.98 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.08 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.89 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:39:09,698 [Rank 63]: iteration 710/ 150000 | consumed samples: 45440 | elapsed time per iteration (ms): 894.1 | learning rate: 1.065E-04 | global batch size: 64 | lm 
loss: 4.601290E+00 | loss scale: 1.0 | grad norm: 1.367 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.83 | tokens-per-second-per-gpu: 9162.04 | +[ip-26-0-155-69:7]:2023-06-21 17:39:09,699 [Rank 63]: time (ms) | forward-compute: 223.75 | backward-compute: 398.14 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.89 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:39:18,635 [Rank 63]: iteration 720/ 150000 | consumed samples: 46080 | elapsed time per iteration (ms): 893.7 | learning rate: 1.080E-04 | global batch size: 64 | lm loss: 4.536628E+00 | loss scale: 1.0 | grad norm: 1.323 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.55 | +[ip-26-0-155-69:7]:2023-06-21 17:39:18,636 [Rank 63]: time (ms) | forward-compute: 223.77 | backward-compute: 398.11 | backward-params-all-reduce: 225.92 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.82 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:39:27,573 [Rank 63]: iteration 730/ 150000 | consumed samples: 46720 | elapsed time per iteration (ms): 893.8 | learning rate: 1.095E-04 | global batch size: 64 | lm loss: 4.509668E+00 | loss scale: 1.0 | grad norm: 1.453 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.87 | tokens-per-second-per-gpu: 9164.97 | +[ip-26-0-155-69:7]:2023-06-21 17:39:27,574 [Rank 63]: time (ms) | forward-compute: 223.89 | backward-compute: 398.10 | backward-params-all-reduce: 226.01 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.11 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.83 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:39:36,511 [Rank 63]: iteration 740/ 150000 | consumed samples: 47360 | elapsed time per iteration (ms): 893.7 | learning rate: 1.110E-04 | global batch size: 64 | lm loss: 4.574774E+00 | loss scale: 1.0 | grad norm: 1.336 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9165.89 | +[ip-26-0-155-69:7]:2023-06-21 17:39:36,511 [Rank 63]: time (ms) | forward-compute: 223.87 | backward-compute: 398.07 | backward-params-all-reduce: 225.98 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.08 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.81 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:39:45,451 [Rank 63]: iteration 750/ 150000 | consumed samples: 48000 | elapsed time per iteration (ms): 894.0 | learning 
rate: 1.125E-04 | global batch size: 64 | lm loss: 4.512045E+00 | loss scale: 1.0 | grad norm: 1.408 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.85 | tokens-per-second-per-gpu: 9163.20 | +[ip-26-0-155-69:7]:2023-06-21 17:39:45,451 [Rank 63]: time (ms) | forward-compute: 224.03 | backward-compute: 398.07 | backward-params-all-reduce: 225.84 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.95 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.95 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.98 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:39:54,388 [Rank 63]: iteration 760/ 150000 | consumed samples: 48640 | elapsed time per iteration (ms): 893.7 | learning rate: 1.140E-04 | global batch size: 64 | lm loss: 4.472682E+00 | loss scale: 1.0 | grad norm: 1.327 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.21 | +[ip-26-0-155-69:7]:2023-06-21 17:39:54,389 [Rank 63]: time (ms) | forward-compute: 223.61 | backward-compute: 398.09 | backward-params-all-reduce: 226.06 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.17 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.87 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:40:03,328 [Rank 63]: iteration 770/ 150000 | consumed samples: 49280 | elapsed time per iteration (ms): 894.0 | learning rate: 1.155E-04 | global batch size: 64 | lm loss: 4.452821E+00 | loss scale: 1.0 | grad norm: 1.213 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.84 | tokens-per-second-per-gpu: 9163.00 | +[ip-26-0-155-69:7]:2023-06-21 17:40:03,329 [Rank 63]: time (ms) | forward-compute: 223.95 | backward-compute: 398.19 | backward-params-all-reduce: 225.91 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.02 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.89 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:40:12,260 [Rank 63]: iteration 780/ 150000 | consumed samples: 49920 | elapsed time per iteration (ms): 893.1 | learning rate: 1.170E-04 | global batch size: 64 | lm loss: 4.386324E+00 | loss scale: 1.0 | grad norm: 1.504 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.49 | +[ip-26-0-155-69:7]:2023-06-21 17:40:12,260 [Rank 63]: time (ms) | forward-compute: 222.84 | backward-compute: 398.06 | backward-params-all-reduce: 226.17 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.29 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.59 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.96 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:40:21,190 [Rank 63]: iteration 790/ 150000 | consumed samples: 50560 | elapsed 
time per iteration (ms): 893.1 | learning rate: 1.185E-04 | global batch size: 64 | lm loss: 4.306153E+00 | loss scale: 1.0 | grad norm: 1.187 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9172.82 | +[ip-26-0-155-69:7]:2023-06-21 17:40:21,191 [Rank 63]: time (ms) | forward-compute: 223.26 | backward-compute: 398.02 | backward-params-all-reduce: 225.89 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.82 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:40:30,120 [Rank 63]: iteration 800/ 150000 | consumed samples: 51200 | elapsed time per iteration (ms): 893.0 | learning rate: 1.200E-04 | global batch size: 64 | lm loss: 4.318950E+00 | loss scale: 1.0 | grad norm: 1.484 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.80 | +[ip-26-0-155-69:7]:2023-06-21 17:40:30,121 [Rank 63]: time (ms) | forward-compute: 223.13 | backward-compute: 398.05 | backward-params-all-reduce: 225.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.99 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.82 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:40:39,050 [Rank 63]: iteration 810/ 150000 | consumed samples: 51840 | elapsed time per iteration (ms): 893.0 | learning rate: 1.215E-04 | global batch size: 64 | lm loss: 4.220854E+00 | loss scale: 1.0 | grad norm: 1.429 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.29 | +[ip-26-0-155-69:7]:2023-06-21 17:40:39,051 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.10 | backward-params-all-reduce: 225.96 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.07 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.81 | batch-generator: 1.85 +[ip-26-0-155-69:7]:2023-06-21 17:40:47,984 [Rank 63]: iteration 820/ 150000 | consumed samples: 52480 | elapsed time per iteration (ms): 893.3 | learning rate: 1.230E-04 | global batch size: 64 | lm loss: 4.197039E+00 | loss scale: 1.0 | grad norm: 1.459 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9170.08 | +[ip-26-0-155-69:7]:2023-06-21 17:40:47,984 [Rank 63]: time (ms) | forward-compute: 223.04 | backward-compute: 398.09 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.07 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.86 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:40:56,913 [Rank 63]: iteration 830/ 
150000 | consumed samples: 53120 | elapsed time per iteration (ms): 893.0 | learning rate: 1.245E-04 | global batch size: 64 | lm loss: 4.195742E+00 | loss scale: 1.0 | grad norm: 1.413 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.90 | +[ip-26-0-155-69:7]:2023-06-21 17:40:56,914 [Rank 63]: time (ms) | forward-compute: 223.13 | backward-compute: 398.11 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.04 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.77 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:41:05,842 [Rank 63]: iteration 840/ 150000 | consumed samples: 53760 | elapsed time per iteration (ms): 892.9 | learning rate: 1.260E-04 | global batch size: 64 | lm loss: 4.195538E+00 | loss scale: 1.0 | grad norm: 1.445 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.70 | +[ip-26-0-155-69:7]:2023-06-21 17:41:05,843 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.10 | backward-params-all-reduce: 226.01 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.11 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.74 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:41:14,772 [Rank 63]: iteration 850/ 150000 | consumed samples: 54400 | elapsed time per iteration (ms): 893.0 | learning rate: 1.275E-04 | global batch size: 64 | lm loss: 4.161403E+00 | loss scale: 1.0 | grad norm: 1.486 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.72 | +[ip-26-0-155-69:7]:2023-06-21 17:41:14,773 [Rank 63]: time (ms) | forward-compute: 222.86 | backward-compute: 398.12 | backward-params-all-reduce: 226.07 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.18 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:41:23,710 [Rank 63]: iteration 860/ 150000 | consumed samples: 55040 | elapsed time per iteration (ms): 893.7 | learning rate: 1.290E-04 | global batch size: 64 | lm loss: 4.088557E+00 | loss scale: 1.0 | grad norm: 1.173 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9165.91 | +[ip-26-0-155-69:7]:2023-06-21 17:41:23,710 [Rank 63]: time (ms) | forward-compute: 223.19 | backward-compute: 398.17 | backward-params-all-reduce: 226.41 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.54 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.35 | optimizer: 41.84 | batch-generator: 1.83 
+[ip-26-0-155-69:7]:2023-06-21 17:41:32,642 [Rank 63]: iteration 870/ 150000 | consumed samples: 55680 | elapsed time per iteration (ms): 893.2 | learning rate: 1.305E-04 | global batch size: 64 | lm loss: 4.082836E+00 | loss scale: 1.0 | grad norm: 1.657 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.60 | +[ip-26-0-155-69:7]:2023-06-21 17:41:32,642 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.07 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.28 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.77 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:41:41,577 [Rank 63]: iteration 880/ 150000 | consumed samples: 56320 | elapsed time per iteration (ms): 893.5 | learning rate: 1.320E-04 | global batch size: 64 | lm loss: 4.020747E+00 | loss scale: 1.0 | grad norm: 1.410 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.90 | tokens-per-second-per-gpu: 9168.23 | +[ip-26-0-155-69:7]:2023-06-21 17:41:41,577 [Rank 63]: time (ms) | forward-compute: 223.35 | backward-compute: 398.08 | backward-params-all-reduce: 226.25 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.36 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.78 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:41:50,509 [Rank 63]: iteration 890/ 150000 | consumed samples: 56960 | elapsed time per iteration (ms): 893.2 | learning rate: 1.335E-04 | global batch size: 64 | lm loss: 3.955831E+00 | loss scale: 1.0 | grad norm: 1.574 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.55 | +[ip-26-0-155-69:7]:2023-06-21 17:41:50,509 [Rank 63]: time (ms) | forward-compute: 222.86 | backward-compute: 398.11 | backward-params-all-reduce: 226.29 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.40 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.81 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:41:59,437 [Rank 63]: iteration 900/ 150000 | consumed samples: 57600 | elapsed time per iteration (ms): 892.8 | learning rate: 1.350E-04 | global batch size: 64 | lm loss: 3.915794E+00 | loss scale: 1.0 | grad norm: 1.664 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.65 | +[ip-26-0-155-69:7]:2023-06-21 17:41:59,437 [Rank 63]: time (ms) | forward-compute: 222.47 | backward-compute: 398.08 | backward-params-all-reduce: 226.34 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.45 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | 
optimizer: 41.84 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:42:08,368 [Rank 63]: iteration 910/ 150000 | consumed samples: 58240 | elapsed time per iteration (ms): 893.2 | learning rate: 1.365E-04 | global batch size: 64 | lm loss: 3.910498E+00 | loss scale: 1.0 | grad norm: 1.533 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.01 | +[ip-26-0-155-69:7]:2023-06-21 17:42:08,369 [Rank 63]: time (ms) | forward-compute: 222.92 | backward-compute: 398.08 | backward-params-all-reduce: 226.31 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.42 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.78 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:42:17,297 [Rank 63]: iteration 920/ 150000 | consumed samples: 58880 | elapsed time per iteration (ms): 892.9 | learning rate: 1.380E-04 | global batch size: 64 | lm loss: 3.750729E+00 | loss scale: 1.0 | grad norm: 1.919 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.53 | +[ip-26-0-155-69:7]:2023-06-21 17:42:17,298 [Rank 63]: time (ms) | forward-compute: 222.75 | backward-compute: 398.16 | backward-params-all-reduce: 226.17 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.28 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.78 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:42:26,228 [Rank 63]: iteration 930/ 150000 | consumed samples: 59520 | elapsed time per iteration (ms): 893.1 | learning rate: 1.395E-04 | global batch size: 64 | lm loss: 3.721997E+00 | loss scale: 1.0 | grad norm: 2.101 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9172.86 | +[ip-26-0-155-69:7]:2023-06-21 17:42:26,229 [Rank 63]: time (ms) | forward-compute: 222.94 | backward-compute: 398.10 | backward-params-all-reduce: 226.09 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.21 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.83 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:42:35,161 [Rank 63]: iteration 940/ 150000 | consumed samples: 60160 | elapsed time per iteration (ms): 893.2 | learning rate: 1.410E-04 | global batch size: 64 | lm loss: 3.698772E+00 | loss scale: 1.0 | grad norm: 2.170 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.16 | +[ip-26-0-155-69:7]:2023-06-21 17:42:35,161 [Rank 63]: time (ms) | forward-compute: 223.03 | backward-compute: 398.12 | backward-params-all-reduce: 226.31 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.41 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | 
optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.74 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:42:44,093 [Rank 63]: iteration 950/ 150000 | consumed samples: 60800 | elapsed time per iteration (ms): 893.3 | learning rate: 1.425E-04 | global batch size: 64 | lm loss: 3.612666E+00 | loss scale: 1.0 | grad norm: 2.250 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.57 | +[ip-26-0-155-69:7]:2023-06-21 17:42:44,094 [Rank 63]: time (ms) | forward-compute: 223.30 | backward-compute: 398.12 | backward-params-all-reduce: 226.13 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.72 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:42:53,022 [Rank 63]: iteration 960/ 150000 | consumed samples: 61440 | elapsed time per iteration (ms): 892.9 | learning rate: 1.440E-04 | global batch size: 64 | lm loss: 3.540173E+00 | loss scale: 1.0 | grad norm: 1.799 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9175.07 | +[ip-26-0-155-69:7]:2023-06-21 17:42:53,023 [Rank 63]: time (ms) | forward-compute: 223.02 | backward-compute: 398.11 | backward-params-all-reduce: 225.90 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.00 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.81 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:43:01,950 [Rank 63]: iteration 970/ 150000 | consumed samples: 62080 | elapsed time per iteration (ms): 892.8 | learning rate: 1.455E-04 | global batch size: 64 | lm loss: 3.456714E+00 | loss scale: 1.0 | grad norm: 2.277 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.49 | +[ip-26-0-155-69:7]:2023-06-21 17:43:01,951 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.09 | backward-params-all-reduce: 225.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.99 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.75 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:43:10,886 [Rank 63]: iteration 980/ 150000 | consumed samples: 62720 | elapsed time per iteration (ms): 893.6 | learning rate: 1.470E-04 | global batch size: 64 | lm loss: 3.340820E+00 | loss scale: 1.0 | grad norm: 2.357 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.89 | tokens-per-second-per-gpu: 9167.39 | +[ip-26-0-155-69:7]:2023-06-21 17:43:10,887 [Rank 63]: time (ms) | forward-compute: 223.15 | backward-compute: 398.13 | backward-params-all-reduce: 226.50 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.60 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | 
optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.76 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:43:19,814 [Rank 63]: iteration 990/ 150000 | consumed samples: 63360 | elapsed time per iteration (ms): 892.8 | learning rate: 1.485E-04 | global batch size: 64 | lm loss: 3.300872E+00 | loss scale: 1.0 | grad norm: 2.031 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.77 | +[ip-26-0-155-69:7]:2023-06-21 17:43:19,815 [Rank 63]: time (ms) | forward-compute: 223.08 | backward-compute: 398.14 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.85 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.77 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:43:28,744 [Rank 63]: iteration 1000/ 150000 | consumed samples: 64000 | elapsed time per iteration (ms): 893.0 | learning rate: 1.500E-04 | global batch size: 64 | lm loss: 3.208639E+00 | loss scale: 1.0 | grad norm: 2.356 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.69 | +[ip-26-0-155-69:7]:2023-06-21 17:43:28,745 [Rank 63]: time (ms) | forward-compute: 222.89 | backward-compute: 398.17 | backward-params-all-reduce: 225.98 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.10 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.80 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.81 | batch-generator: 1.82 +[ip-26-0-155-69:7]:2023-06-21 17:43:37,675 [Rank 63]: iteration 1010/ 150000 | consumed samples: 64640 | elapsed time per iteration (ms): 893.1 | learning rate: 1.515E-04 | global batch size: 64 | lm loss: 3.153380E+00 | loss scale: 1.0 | grad norm: 2.461 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.12 | +[ip-26-0-155-69:7]:2023-06-21 17:43:37,676 [Rank 63]: time (ms) | forward-compute: 223.40 | backward-compute: 398.16 | backward-params-all-reduce: 225.78 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.88 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.75 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:43:46,608 [Rank 63]: iteration 1020/ 150000 | consumed samples: 65280 | elapsed time per iteration (ms): 893.3 | learning rate: 1.530E-04 | global batch size: 64 | lm loss: 3.091166E+00 | loss scale: 1.0 | grad norm: 2.092 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.69 | +[ip-26-0-155-69:7]:2023-06-21 17:43:46,609 [Rank 63]: time (ms) | forward-compute: 223.38 | backward-compute: 398.19 | backward-params-all-reduce: 225.96 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.06 | backward-gather-model-params: 0.01 | 
optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.74 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:43:55,540 [Rank 63]: iteration 1030/ 150000 | consumed samples: 65920 | elapsed time per iteration (ms): 893.1 | learning rate: 1.545E-04 | global batch size: 64 | lm loss: 3.045628E+00 | loss scale: 1.0 | grad norm: 1.924 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.13 | +[ip-26-0-155-69:7]:2023-06-21 17:43:55,540 [Rank 63]: time (ms) | forward-compute: 223.22 | backward-compute: 398.19 | backward-params-all-reduce: 225.82 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.87 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:44:04,470 [Rank 63]: iteration 1040/ 150000 | consumed samples: 66560 | elapsed time per iteration (ms): 893.0 | learning rate: 1.560E-04 | global batch size: 64 | lm loss: 2.996407E+00 | loss scale: 1.0 | grad norm: 2.341 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.22 | +[ip-26-0-155-69:7]:2023-06-21 17:44:04,470 [Rank 63]: time (ms) | forward-compute: 222.96 | backward-compute: 398.11 | backward-params-all-reduce: 226.06 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.16 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.84 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:44:13,403 [Rank 63]: iteration 1050/ 150000 | consumed samples: 67200 | elapsed time per iteration (ms): 893.3 | learning rate: 1.575E-04 | global batch size: 64 | lm loss: 2.971989E+00 | loss scale: 1.0 | grad norm: 1.647 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.44 | +[ip-26-0-155-69:7]:2023-06-21 17:44:13,404 [Rank 63]: time (ms) | forward-compute: 223.33 | backward-compute: 398.12 | backward-params-all-reduce: 225.96 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.07 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.86 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:44:22,335 [Rank 63]: iteration 1060/ 150000 | consumed samples: 67840 | elapsed time per iteration (ms): 893.2 | learning rate: 1.590E-04 | global batch size: 64 | lm loss: 2.915565E+00 | loss scale: 1.0 | grad norm: 1.364 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.74 | +[ip-26-0-155-69:7]:2023-06-21 17:44:22,335 [Rank 63]: time (ms) | forward-compute: 223.23 | backward-compute: 398.12 | backward-params-all-reduce: 225.89 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 
226.00 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:44:31,267 [Rank 63]: iteration 1070/ 150000 | consumed samples: 68480 | elapsed time per iteration (ms): 893.2 | learning rate: 1.605E-04 | global batch size: 64 | lm loss: 2.847284E+00 | loss scale: 1.0 | grad norm: 1.993 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.37 | +[ip-26-0-155-69:7]:2023-06-21 17:44:31,268 [Rank 63]: time (ms) | forward-compute: 222.99 | backward-compute: 398.21 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.06 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.92 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:44:40,199 [Rank 63]: iteration 1080/ 150000 | consumed samples: 69120 | elapsed time per iteration (ms): 893.2 | learning rate: 1.620E-04 | global batch size: 64 | lm loss: 2.839899E+00 | loss scale: 1.0 | grad norm: 1.423 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.61 | +[ip-26-0-155-69:7]:2023-06-21 17:44:40,199 [Rank 63]: time (ms) | forward-compute: 223.16 | backward-compute: 398.13 | backward-params-all-reduce: 225.91 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.02 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.90 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:44:49,127 [Rank 63]: iteration 1090/ 150000 | consumed samples: 69760 | elapsed time per iteration (ms): 892.8 | learning rate: 1.635E-04 | global batch size: 64 | lm loss: 2.849504E+00 | loss scale: 1.0 | grad norm: 1.503 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.53 | +[ip-26-0-155-69:7]:2023-06-21 17:44:49,127 [Rank 63]: time (ms) | forward-compute: 222.69 | backward-compute: 398.16 | backward-params-all-reduce: 226.01 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.12 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.88 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:44:58,056 [Rank 63]: iteration 1100/ 150000 | consumed samples: 70400 | elapsed time per iteration (ms): 892.9 | learning rate: 1.650E-04 | global batch size: 64 | lm loss: 2.744584E+00 | loss scale: 1.0 | grad norm: 2.211 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.57 | +[ip-26-0-155-69:7]:2023-06-21 17:44:58,056 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.13 | backward-params-all-reduce: 225.80 | backward-layernorm-all-reduce: 0.02 | 
backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.90 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:45:06,989 [Rank 63]: iteration 1110/ 150000 | consumed samples: 71040 | elapsed time per iteration (ms): 893.3 | learning rate: 1.665E-04 | global batch size: 64 | lm loss: 2.695924E+00 | loss scale: 1.0 | grad norm: 2.254 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.87 | +[ip-26-0-155-69:7]:2023-06-21 17:45:06,989 [Rank 63]: time (ms) | forward-compute: 222.99 | backward-compute: 398.16 | backward-params-all-reduce: 226.14 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.90 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:45:15,926 [Rank 63]: iteration 1120/ 150000 | consumed samples: 71680 | elapsed time per iteration (ms): 893.7 | learning rate: 1.680E-04 | global batch size: 64 | lm loss: 2.687495E+00 | loss scale: 1.0 | grad norm: 1.497 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.41 | +[ip-26-0-155-69:7]:2023-06-21 17:45:15,926 [Rank 63]: time (ms) | forward-compute: 223.50 | backward-compute: 398.08 | backward-params-all-reduce: 226.30 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.39 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.83 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:45:24,859 [Rank 63]: iteration 1130/ 150000 | consumed samples: 72320 | elapsed time per iteration (ms): 893.3 | learning rate: 1.695E-04 | global batch size: 64 | lm loss: 2.661385E+00 | loss scale: 1.0 | grad norm: 1.295 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9170.08 | +[ip-26-0-155-69:7]:2023-06-21 17:45:24,860 [Rank 63]: time (ms) | forward-compute: 223.32 | backward-compute: 398.16 | backward-params-all-reduce: 225.83 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.97 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.97 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:45:33,789 [Rank 63]: iteration 1140/ 150000 | consumed samples: 72960 | elapsed time per iteration (ms): 893.0 | learning rate: 1.710E-04 | global batch size: 64 | lm loss: 2.660387E+00 | loss scale: 1.0 | grad norm: 1.376 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.97 | +[ip-26-0-155-69:7]:2023-06-21 17:45:33,789 [Rank 63]: time (ms) | forward-compute: 223.03 | backward-compute: 398.13 | backward-params-all-reduce: 
225.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.98 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.88 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:45:42,717 [Rank 63]: iteration 1150/ 150000 | consumed samples: 73600 | elapsed time per iteration (ms): 892.8 | learning rate: 1.725E-04 | global batch size: 64 | lm loss: 2.605412E+00 | loss scale: 1.0 | grad norm: 1.620 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.41 | +[ip-26-0-155-69:7]:2023-06-21 17:45:42,718 [Rank 63]: time (ms) | forward-compute: 223.02 | backward-compute: 398.14 | backward-params-all-reduce: 225.72 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.84 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.87 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:45:51,652 [Rank 63]: iteration 1160/ 150000 | consumed samples: 74240 | elapsed time per iteration (ms): 893.5 | learning rate: 1.740E-04 | global batch size: 64 | lm loss: 2.514670E+00 | loss scale: 1.0 | grad norm: 1.174 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9168.58 | +[ip-26-0-155-69:7]:2023-06-21 17:45:51,652 [Rank 63]: time (ms) | forward-compute: 223.31 | backward-compute: 398.13 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.24 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.83 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:46:00,581 [Rank 63]: iteration 1170/ 150000 | consumed samples: 74880 | elapsed time per iteration (ms): 892.9 | learning rate: 1.755E-04 | global batch size: 64 | lm loss: 2.608316E+00 | loss scale: 1.0 | grad norm: 1.315 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.82 | +[ip-26-0-155-69:7]:2023-06-21 17:46:00,581 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.17 | backward-params-all-reduce: 225.60 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.69 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.80 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:46:09,511 [Rank 63]: iteration 1180/ 150000 | consumed samples: 75520 | elapsed time per iteration (ms): 893.0 | learning rate: 1.770E-04 | global batch size: 64 | lm loss: 2.505637E+00 | loss scale: 1.0 | grad norm: 1.357 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.48 | +[ip-26-0-155-69:7]:2023-06-21 17:46:09,511 [Rank 63]: time (ms) | forward-compute: 223.09 | 
backward-compute: 398.21 | backward-params-all-reduce: 225.58 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.69 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.94 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:46:18,434 [Rank 63]: iteration 1190/ 150000 | consumed samples: 76160 | elapsed time per iteration (ms): 892.4 | learning rate: 1.785E-04 | global batch size: 64 | lm loss: 2.492915E+00 | loss scale: 1.0 | grad norm: 1.201 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.04 | tokens-per-second-per-gpu: 9180.16 | +[ip-26-0-155-69:7]:2023-06-21 17:46:18,435 [Rank 63]: time (ms) | forward-compute: 222.93 | backward-compute: 398.13 | backward-params-all-reduce: 225.89 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.00 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.30 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.33 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:46:27,363 [Rank 63]: iteration 1200/ 150000 | consumed samples: 76800 | elapsed time per iteration (ms): 892.9 | learning rate: 1.800E-04 | global batch size: 64 | lm loss: 2.512564E+00 | loss scale: 1.0 | grad norm: 1.095 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.77 | +[ip-26-0-155-69:7]:2023-06-21 17:46:27,364 [Rank 63]: time (ms) | forward-compute: 222.97 | backward-compute: 398.21 | backward-params-all-reduce: 225.73 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.84 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.88 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:46:36,293 [Rank 63]: iteration 1210/ 150000 | consumed samples: 77440 | elapsed time per iteration (ms): 893.0 | learning rate: 1.815E-04 | global batch size: 64 | lm loss: 2.474167E+00 | loss scale: 1.0 | grad norm: 1.300 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.25 | +[ip-26-0-155-69:7]:2023-06-21 17:46:36,294 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.15 | backward-params-all-reduce: 225.78 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.91 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:46:45,226 [Rank 63]: iteration 1220/ 150000 | consumed samples: 78080 | elapsed time per iteration (ms): 893.3 | learning rate: 1.830E-04 | global batch size: 64 | lm loss: 2.465782E+00 | loss scale: 1.0 | grad norm: 1.207 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.77 | +[ip-26-0-155-69:7]:2023-06-21 17:46:45,227 
[Rank 63]: time (ms) | forward-compute: 222.95 | backward-compute: 398.13 | backward-params-all-reduce: 226.02 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.36 | optimizer: 41.97 | batch-generator: 1.84 +[ip-26-0-155-69:7]:2023-06-21 17:46:54,158 [Rank 63]: iteration 1230/ 150000 | consumed samples: 78720 | elapsed time per iteration (ms): 893.2 | learning rate: 1.845E-04 | global batch size: 64 | lm loss: 2.344203E+00 | loss scale: 1.0 | grad norm: 1.232 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.55 | +[ip-26-0-155-69:7]:2023-06-21 17:46:54,159 [Rank 63]: time (ms) | forward-compute: 223.12 | backward-compute: 398.05 | backward-params-all-reduce: 226.02 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.87 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:47:03,102 [Rank 63]: iteration 1240/ 150000 | consumed samples: 79360 | elapsed time per iteration (ms): 894.3 | learning rate: 1.860E-04 | global batch size: 64 | lm loss: 2.391261E+00 | loss scale: 1.0 | grad norm: 1.046 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.81 | tokens-per-second-per-gpu: 9159.81 | +[ip-26-0-155-69:7]:2023-06-21 17:47:03,102 [Rank 63]: time (ms) | forward-compute: 224.88 | backward-compute: 398.05 | backward-params-all-reduce: 226.06 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.17 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.30 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.29 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:47:12,218 [Rank 63]: iteration 1250/ 150000 | consumed samples: 80000 | elapsed time per iteration (ms): 911.7 | learning rate: 1.875E-04 | global batch size: 64 | lm loss: 2.366144E+00 | loss scale: 1.0 | grad norm: 1.226 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 103.80 | tokens-per-second-per-gpu: 8985.86 | +[ip-26-0-155-69:7]:2023-06-21 17:47:12,222 [Rank 63]: time (ms) | forward-compute: 227.78 | backward-compute: 410.39 | backward-params-all-reduce: 226.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.99 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 1.11 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 42.50 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:47:21,436 [Rank 63]: iteration 1260/ 150000 | consumed samples: 80640 | elapsed time per iteration (ms): 921.8 | learning rate: 1.890E-04 | global batch size: 64 | lm loss: 2.398231E+00 | loss scale: 1.0 | grad norm: 1.184 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 102.66 | tokens-per-second-per-gpu: 8887.10 
| +[ip-26-0-155-69:7]:2023-06-21 17:47:21,436 [Rank 63]: time (ms) | forward-compute: 239.96 | backward-compute: 398.08 | backward-params-all-reduce: 238.05 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 238.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.31 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.33 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:47:30,378 [Rank 63]: iteration 1270/ 150000 | consumed samples: 81280 | elapsed time per iteration (ms): 894.2 | learning rate: 1.905E-04 | global batch size: 64 | lm loss: 2.373417E+00 | loss scale: 1.0 | grad norm: 1.207 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.82 | tokens-per-second-per-gpu: 9160.81 | +[ip-26-0-155-69:7]:2023-06-21 17:47:30,379 [Rank 63]: time (ms) | forward-compute: 223.89 | backward-compute: 398.48 | backward-params-all-reduce: 227.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.24 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 9.71 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 40.68 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:47:39,307 [Rank 63]: iteration 1280/ 150000 | consumed samples: 81920 | elapsed time per iteration (ms): 892.9 | learning rate: 1.920E-04 | global batch size: 64 | lm loss: 2.270634E+00 | loss scale: 1.0 | grad norm: 0.874 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.81 | +[ip-26-0-155-69:7]:2023-06-21 17:47:39,308 [Rank 63]: time (ms) | forward-compute: 223.30 | backward-compute: 398.17 | backward-params-all-reduce: 226.13 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.29 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.26 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:47:48,229 [Rank 63]: iteration 1290/ 150000 | consumed samples: 82560 | elapsed time per iteration (ms): 892.2 | learning rate: 1.935E-04 | global batch size: 64 | lm loss: 2.339629E+00 | loss scale: 1.0 | grad norm: 1.124 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.06 | tokens-per-second-per-gpu: 9181.62 | +[ip-26-0-155-69:7]:2023-06-21 17:47:48,230 [Rank 63]: time (ms) | forward-compute: 224.19 | backward-compute: 398.22 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 8.52 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 39.49 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:47:57,137 [Rank 63]: iteration 1300/ 150000 | consumed samples: 83200 | elapsed time per iteration (ms): 890.7 | learning rate: 1.950E-04 | global batch size: 64 | lm loss: 2.293056E+00 | loss scale: 1.0 | grad norm: 1.426 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 
106.24 | tokens-per-second-per-gpu: 9196.99 | +[ip-26-0-155-69:7]:2023-06-21 17:47:57,137 [Rank 63]: time (ms) | forward-compute: 223.00 | backward-compute: 398.18 | backward-params-all-reduce: 226.00 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.10 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 8.50 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 39.44 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:48:06,068 [Rank 63]: iteration 1310/ 150000 | consumed samples: 83840 | elapsed time per iteration (ms): 893.1 | learning rate: 1.965E-04 | global batch size: 64 | lm loss: 2.309850E+00 | loss scale: 1.0 | grad norm: 0.914 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.45 | +[ip-26-0-155-69:7]:2023-06-21 17:48:06,068 [Rank 63]: time (ms) | forward-compute: 224.07 | backward-compute: 398.18 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 9.68 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 40.63 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:48:14,955 [Rank 63]: iteration 1320/ 150000 | consumed samples: 84480 | elapsed time per iteration (ms): 888.7 | learning rate: 1.980E-04 | global batch size: 64 | lm loss: 2.227564E+00 | loss scale: 1.0 | grad norm: 0.763 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.48 | tokens-per-second-per-gpu: 9218.03 | +[ip-26-0-155-69:7]:2023-06-21 17:48:14,955 [Rank 63]: time (ms) | forward-compute: 223.51 | backward-compute: 398.19 | backward-params-all-reduce: 225.81 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.92 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.12 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 37.12 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:48:23,868 [Rank 63]: iteration 1330/ 150000 | consumed samples: 85120 | elapsed time per iteration (ms): 891.3 | learning rate: 1.995E-04 | global batch size: 64 | lm loss: 2.216090E+00 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.16 | tokens-per-second-per-gpu: 9190.61 | +[ip-26-0-155-69:7]:2023-06-21 17:48:23,869 [Rank 63]: time (ms) | forward-compute: 224.40 | backward-compute: 398.15 | backward-params-all-reduce: 226.54 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.64 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 7.29 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 38.25 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:48:32,800 [Rank 63]: iteration 1340/ 150000 | consumed samples: 85760 | elapsed time per iteration (ms): 893.1 | learning rate: 2.010E-04 | global batch size: 64 | lm loss: 2.260810E+00 | loss scale: 1.0 | grad norm: 1.052 | number of skipped iterations: 
0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.09 | +[ip-26-0-155-69:7]:2023-06-21 17:48:32,801 [Rank 63]: time (ms) | forward-compute: 224.46 | backward-compute: 398.23 | backward-params-all-reduce: 226.26 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.36 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.58 | optimizer-clip-main-grad: 8.73 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 39.76 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:48:41,863 [Rank 63]: iteration 1350/ 150000 | consumed samples: 86400 | elapsed time per iteration (ms): 906.3 | learning rate: 2.025E-04 | global batch size: 64 | lm loss: 2.207488E+00 | loss scale: 1.0 | grad norm: 0.992 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.41 | tokens-per-second-per-gpu: 9038.78 | +[ip-26-0-155-69:7]:2023-06-21 17:48:41,863 [Rank 63]: time (ms) | forward-compute: 232.47 | backward-compute: 398.53 | backward-params-all-reduce: 227.51 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.60 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 9.47 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.07 | optimizer-copy-main-to-model-params: 9.17 | optimizer: 41.33 | batch-generator: 3.51 +[ip-26-0-155-69:7]:2023-06-21 17:48:50,846 [Rank 63]: iteration 1360/ 150000 | consumed samples: 87040 | elapsed time per iteration (ms): 898.3 | learning rate: 2.040E-04 | global batch size: 64 | lm loss: 2.270252E+00 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.34 | tokens-per-second-per-gpu: 9119.72 | +[ip-26-0-155-69:7]:2023-06-21 17:48:50,846 [Rank 63]: time (ms) | forward-compute: 229.50 | backward-compute: 398.07 | backward-params-all-reduce: 225.71 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.80 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 9.72 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 40.68 | batch-generator: 2.08 +[ip-26-0-155-69:7]:2023-06-21 17:48:59,739 [Rank 63]: iteration 1370/ 150000 | consumed samples: 87680 | elapsed time per iteration (ms): 889.4 | learning rate: 2.055E-04 | global batch size: 64 | lm loss: 2.186746E+00 | loss scale: 1.0 | grad norm: 0.852 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.40 | tokens-per-second-per-gpu: 9211.04 | +[ip-26-0-155-69:7]:2023-06-21 17:48:59,740 [Rank 63]: time (ms) | forward-compute: 223.31 | backward-compute: 398.11 | backward-params-all-reduce: 225.63 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.73 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 7.31 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 38.26 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:49:08,608 [Rank 63]: iteration 1380/ 150000 | consumed samples: 88320 | elapsed time per iteration (ms): 886.9 | learning rate: 2.070E-04 | global batch size: 64 | lm loss: 2.170895E+00 | loss scale: 1.0 | grad 
norm: 0.820 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.76 | +[ip-26-0-155-69:7]:2023-06-21 17:49:08,609 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.13 | backward-params-all-reduce: 225.81 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.91 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:49:17,478 [Rank 63]: iteration 1390/ 150000 | consumed samples: 88960 | elapsed time per iteration (ms): 887.0 | learning rate: 2.085E-04 | global batch size: 64 | lm loss: 2.184438E+00 | loss scale: 1.0 | grad norm: 0.667 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.05 | +[ip-26-0-155-69:7]:2023-06-21 17:49:17,479 [Rank 63]: time (ms) | forward-compute: 223.03 | backward-compute: 398.14 | backward-params-all-reduce: 225.85 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.94 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.87 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:49:26,349 [Rank 63]: iteration 1400/ 150000 | consumed samples: 89600 | elapsed time per iteration (ms): 887.1 | learning rate: 2.100E-04 | global batch size: 64 | lm loss: 2.191436E+00 | loss scale: 1.0 | grad norm: 0.900 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.72 | +[ip-26-0-155-69:7]:2023-06-21 17:49:26,349 [Rank 63]: time (ms) | forward-compute: 223.15 | backward-compute: 398.13 | backward-params-all-reduce: 225.85 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.95 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.87 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:49:35,217 [Rank 63]: iteration 1410/ 150000 | consumed samples: 90240 | elapsed time per iteration (ms): 886.9 | learning rate: 2.115E-04 | global batch size: 64 | lm loss: 2.105275E+00 | loss scale: 1.0 | grad norm: 0.648 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.70 | tokens-per-second-per-gpu: 9236.90 | +[ip-26-0-155-69:7]:2023-06-21 17:49:35,218 [Rank 63]: time (ms) | forward-compute: 223.11 | backward-compute: 398.07 | backward-params-all-reduce: 225.83 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.92 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.83 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 17:49:44,300 [Rank 63]: iteration 1420/ 150000 | consumed samples: 90880 | elapsed time per iteration (ms): 908.2 | learning rate: 2.130E-04 | global batch size: 64 | lm 
loss: 2.148899E+00 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.19 | tokens-per-second-per-gpu: 9019.68 | +[ip-26-0-155-69:7]:2023-06-21 17:49:44,300 [Rank 63]: time (ms) | forward-compute: 244.20 | backward-compute: 398.14 | backward-params-all-reduce: 226.03 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.82 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 17:49:53,437 [Rank 63]: iteration 1430/ 150000 | consumed samples: 91520 | elapsed time per iteration (ms): 913.8 | learning rate: 2.145E-04 | global batch size: 64 | lm loss: 2.106895E+00 | loss scale: 1.0 | grad norm: 0.953 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 103.56 | tokens-per-second-per-gpu: 8965.18 | +[ip-26-0-155-69:7]:2023-06-21 17:49:53,438 [Rank 63]: time (ms) | forward-compute: 249.00 | backward-compute: 398.13 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.53 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.51 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:50:02,328 [Rank 63]: iteration 1440/ 150000 | consumed samples: 92160 | elapsed time per iteration (ms): 889.1 | learning rate: 2.160E-04 | global batch size: 64 | lm loss: 2.082574E+00 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.43 | tokens-per-second-per-gpu: 9214.22 | +[ip-26-0-155-69:7]:2023-06-21 17:50:02,329 [Rank 63]: time (ms) | forward-compute: 224.51 | backward-compute: 398.18 | backward-params-all-reduce: 225.82 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 5.51 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.49 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:50:11,226 [Rank 63]: iteration 1450/ 150000 | consumed samples: 92800 | elapsed time per iteration (ms): 889.8 | learning rate: 2.175E-04 | global batch size: 64 | lm loss: 2.120988E+00 | loss scale: 1.0 | grad norm: 1.176 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.35 | tokens-per-second-per-gpu: 9206.48 | +[ip-26-0-155-69:7]:2023-06-21 17:50:11,227 [Rank 63]: time (ms) | forward-compute: 222.94 | backward-compute: 398.08 | backward-params-all-reduce: 225.92 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.02 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 7.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 38.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:50:20,109 [Rank 63]: iteration 1460/ 150000 | consumed samples: 93440 | elapsed time per iteration (ms): 888.3 | learning 
rate: 2.190E-04 | global batch size: 64 | lm loss: 2.086980E+00 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.81 | +[ip-26-0-155-69:7]:2023-06-21 17:50:20,110 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.14 | backward-params-all-reduce: 225.81 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.92 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 6.09 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 37.06 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:50:28,993 [Rank 63]: iteration 1470/ 150000 | consumed samples: 94080 | elapsed time per iteration (ms): 888.4 | learning rate: 2.205E-04 | global batch size: 64 | lm loss: 2.114236E+00 | loss scale: 1.0 | grad norm: 0.840 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.27 | +[ip-26-0-155-69:7]:2023-06-21 17:50:28,994 [Rank 63]: time (ms) | forward-compute: 223.55 | backward-compute: 398.22 | backward-params-all-reduce: 225.90 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 5.57 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 36.59 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:50:37,865 [Rank 63]: iteration 1480/ 150000 | consumed samples: 94720 | elapsed time per iteration (ms): 887.2 | learning rate: 2.220E-04 | global batch size: 64 | lm loss: 2.105153E+00 | loss scale: 1.0 | grad norm: 0.724 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.89 | +[ip-26-0-155-69:7]:2023-06-21 17:50:37,866 [Rank 63]: time (ms) | forward-compute: 223.00 | backward-compute: 398.19 | backward-params-all-reduce: 226.00 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.13 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.87 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:50:46,748 [Rank 63]: iteration 1490/ 150000 | consumed samples: 95360 | elapsed time per iteration (ms): 888.3 | learning rate: 2.235E-04 | global batch size: 64 | lm loss: 2.085087E+00 | loss scale: 1.0 | grad norm: 0.795 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.53 | tokens-per-second-per-gpu: 9222.58 | +[ip-26-0-155-69:7]:2023-06-21 17:50:46,748 [Rank 63]: time (ms) | forward-compute: 222.63 | backward-compute: 398.13 | backward-params-all-reduce: 226.30 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.40 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.10 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.07 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:50:55,624 [Rank 63]: iteration 1500/ 150000 | consumed samples: 96000 | elapsed 
time per iteration (ms): 887.6 | learning rate: 2.250E-04 | global batch size: 64 | lm loss: 2.111531E+00 | loss scale: 1.0 | grad norm: 0.722 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.97 | +[ip-26-0-155-69:7]:2023-06-21 17:50:55,624 [Rank 63]: time (ms) | forward-compute: 222.95 | backward-compute: 398.16 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.06 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.52 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 36.50 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:51:04,505 [Rank 63]: iteration 1510/ 150000 | consumed samples: 96640 | elapsed time per iteration (ms): 888.1 | learning rate: 2.265E-04 | global batch size: 64 | lm loss: 2.091608E+00 | loss scale: 1.0 | grad norm: 0.586 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.55 | tokens-per-second-per-gpu: 9224.57 | +[ip-26-0-155-69:7]:2023-06-21 17:51:04,505 [Rank 63]: time (ms) | forward-compute: 222.86 | backward-compute: 398.17 | backward-params-all-reduce: 225.94 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.03 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.10 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 37.05 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:51:13,378 [Rank 63]: iteration 1520/ 150000 | consumed samples: 97280 | elapsed time per iteration (ms): 887.3 | learning rate: 2.280E-04 | global batch size: 64 | lm loss: 2.089523E+00 | loss scale: 1.0 | grad norm: 0.847 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.64 | tokens-per-second-per-gpu: 9232.23 | +[ip-26-0-155-69:7]:2023-06-21 17:51:13,378 [Rank 63]: time (ms) | forward-compute: 222.85 | backward-compute: 398.20 | backward-params-all-reduce: 226.44 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.53 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.84 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:51:22,247 [Rank 63]: iteration 1530/ 150000 | consumed samples: 97920 | elapsed time per iteration (ms): 886.9 | learning rate: 2.295E-04 | global batch size: 64 | lm loss: 2.017301E+00 | loss scale: 1.0 | grad norm: 0.643 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.22 | +[ip-26-0-155-69:7]:2023-06-21 17:51:22,248 [Rank 63]: time (ms) | forward-compute: 222.99 | backward-compute: 398.20 | backward-params-all-reduce: 225.83 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.94 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.88 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:51:31,117 [Rank 63]: iteration 1540/ 
150000 | consumed samples: 98560 | elapsed time per iteration (ms): 887.0 | learning rate: 2.310E-04 | global batch size: 64 | lm loss: 2.060225E+00 | loss scale: 1.0 | grad norm: 0.548 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.68 | tokens-per-second-per-gpu: 9235.67 | +[ip-26-0-155-69:7]:2023-06-21 17:51:31,118 [Rank 63]: time (ms) | forward-compute: 222.73 | backward-compute: 398.16 | backward-params-all-reduce: 226.12 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.23 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.91 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:51:39,995 [Rank 63]: iteration 1550/ 150000 | consumed samples: 99200 | elapsed time per iteration (ms): 887.8 | learning rate: 2.325E-04 | global batch size: 64 | lm loss: 2.057044E+00 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9227.20 | +[ip-26-0-155-69:7]:2023-06-21 17:51:39,996 [Rank 63]: time (ms) | forward-compute: 223.17 | backward-compute: 398.18 | backward-params-all-reduce: 225.97 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.07 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.45 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 17:51:48,889 [Rank 63]: iteration 1560/ 150000 | consumed samples: 99840 | elapsed time per iteration (ms): 889.4 | learning rate: 2.340E-04 | global batch size: 64 | lm loss: 2.083456E+00 | loss scale: 1.0 | grad norm: 1.189 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.40 | tokens-per-second-per-gpu: 9210.97 | +[ip-26-0-155-69:7]:2023-06-21 17:51:48,890 [Rank 63]: time (ms) | forward-compute: 223.48 | backward-compute: 398.14 | backward-params-all-reduce: 226.02 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.12 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.69 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.67 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:51:57,785 [Rank 63]: iteration 1570/ 150000 | consumed samples: 100480 | elapsed time per iteration (ms): 889.6 | learning rate: 2.355E-04 | global batch size: 64 | lm loss: 2.047602E+00 | loss scale: 1.0 | grad norm: 0.527 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.38 | tokens-per-second-per-gpu: 9209.10 | +[ip-26-0-155-69:7]:2023-06-21 17:51:57,785 [Rank 63]: time (ms) | forward-compute: 223.80 | backward-compute: 398.15 | backward-params-all-reduce: 227.62 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.73 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.90 | batch-generator: 1.72 
+[ip-26-0-155-69:7]:2023-06-21 17:52:06,667 [Rank 63]: iteration 1580/ 150000 | consumed samples: 101120 | elapsed time per iteration (ms): 888.2 | learning rate: 2.370E-04 | global batch size: 64 | lm loss: 2.038838E+00 | loss scale: 1.0 | grad norm: 0.620 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.53 | tokens-per-second-per-gpu: 9222.74 | +[ip-26-0-155-69:7]:2023-06-21 17:52:06,668 [Rank 63]: time (ms) | forward-compute: 223.71 | backward-compute: 398.12 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.05 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 5.48 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.44 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:52:15,538 [Rank 63]: iteration 1590/ 150000 | consumed samples: 101760 | elapsed time per iteration (ms): 887.1 | learning rate: 2.385E-04 | global batch size: 64 | lm loss: 2.056241E+00 | loss scale: 1.0 | grad norm: 0.598 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.62 | +[ip-26-0-155-69:7]:2023-06-21 17:52:15,539 [Rank 63]: time (ms) | forward-compute: 223.34 | backward-compute: 398.15 | backward-params-all-reduce: 225.69 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.79 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.84 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:52:24,416 [Rank 63]: iteration 1600/ 150000 | consumed samples: 102400 | elapsed time per iteration (ms): 887.8 | learning rate: 2.400E-04 | global batch size: 64 | lm loss: 2.030428E+00 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.59 | tokens-per-second-per-gpu: 9227.45 | +[ip-26-0-155-69:7]:2023-06-21 17:52:24,416 [Rank 63]: time (ms) | forward-compute: 223.26 | backward-compute: 398.18 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.50 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:52:33,292 [Rank 63]: iteration 1610/ 150000 | consumed samples: 103040 | elapsed time per iteration (ms): 887.6 | learning rate: 2.415E-04 | global batch size: 64 | lm loss: 2.053657E+00 | loss scale: 1.0 | grad norm: 0.599 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.61 | tokens-per-second-per-gpu: 9229.35 | +[ip-26-0-155-69:7]:2023-06-21 17:52:33,292 [Rank 63]: time (ms) | forward-compute: 223.23 | backward-compute: 398.13 | backward-params-all-reduce: 226.22 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 
8.34 | optimizer: 35.93 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:52:42,171 [Rank 63]: iteration 1620/ 150000 | consumed samples: 103680 | elapsed time per iteration (ms): 887.9 | learning rate: 2.430E-04 | global batch size: 64 | lm loss: 1.990064E+00 | loss scale: 1.0 | grad norm: 0.870 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.57 | tokens-per-second-per-gpu: 9225.78 | +[ip-26-0-155-69:7]:2023-06-21 17:52:42,172 [Rank 63]: time (ms) | forward-compute: 223.60 | backward-compute: 398.14 | backward-params-all-reduce: 226.30 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.41 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.84 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:52:51,054 [Rank 63]: iteration 1630/ 150000 | consumed samples: 104320 | elapsed time per iteration (ms): 888.3 | learning rate: 2.445E-04 | global batch size: 64 | lm loss: 2.021906E+00 | loss scale: 1.0 | grad norm: 1.073 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9222.04 | +[ip-26-0-155-69:7]:2023-06-21 17:52:51,055 [Rank 63]: time (ms) | forward-compute: 223.80 | backward-compute: 398.08 | backward-params-all-reduce: 225.86 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.97 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 5.51 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 36.47 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:52:59,938 [Rank 63]: iteration 1640/ 150000 | consumed samples: 104960 | elapsed time per iteration (ms): 888.4 | learning rate: 2.460E-04 | global batch size: 64 | lm loss: 2.026669E+00 | loss scale: 1.0 | grad norm: 0.577 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.25 | +[ip-26-0-155-69:7]:2023-06-21 17:52:59,939 [Rank 63]: time (ms) | forward-compute: 223.40 | backward-compute: 398.12 | backward-params-all-reduce: 225.74 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.85 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 6.08 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.08 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:53:08,815 [Rank 63]: iteration 1650/ 150000 | consumed samples: 105600 | elapsed time per iteration (ms): 887.7 | learning rate: 2.475E-04 | global batch size: 64 | lm loss: 2.012196E+00 | loss scale: 1.0 | grad norm: 0.685 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.24 | +[ip-26-0-155-69:7]:2023-06-21 17:53:08,816 [Rank 63]: time (ms) | forward-compute: 223.94 | backward-compute: 398.15 | backward-params-all-reduce: 225.63 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.74 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 
22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.90 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:53:17,694 [Rank 63]: iteration 1660/ 150000 | consumed samples: 106240 | elapsed time per iteration (ms): 887.8 | learning rate: 2.490E-04 | global batch size: 64 | lm loss: 1.991595E+00 | loss scale: 1.0 | grad norm: 0.674 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9227.17 | +[ip-26-0-155-69:7]:2023-06-21 17:53:17,694 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.17 | backward-params-all-reduce: 225.78 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 5.53 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.53 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:53:26,570 [Rank 63]: iteration 1670/ 150000 | consumed samples: 106880 | elapsed time per iteration (ms): 887.6 | learning rate: 2.505E-04 | global batch size: 64 | lm loss: 2.005218E+00 | loss scale: 1.0 | grad norm: 0.549 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.61 | tokens-per-second-per-gpu: 9229.19 | +[ip-26-0-155-69:7]:2023-06-21 17:53:26,570 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.21 | backward-params-all-reduce: 225.77 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.88 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 5.50 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.48 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:53:35,447 [Rank 63]: iteration 1680/ 150000 | consumed samples: 107520 | elapsed time per iteration (ms): 887.7 | learning rate: 2.520E-04 | global batch size: 64 | lm loss: 1.973621E+00 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.36 | +[ip-26-0-155-69:7]:2023-06-21 17:53:35,447 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.18 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.85 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.95 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.92 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:53:44,329 [Rank 63]: iteration 1690/ 150000 | consumed samples: 108160 | elapsed time per iteration (ms): 888.2 | learning rate: 2.535E-04 | global batch size: 64 | lm loss: 2.005421E+00 | loss scale: 1.0 | grad norm: 0.619 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.54 | tokens-per-second-per-gpu: 9223.39 | +[ip-26-0-155-69:7]:2023-06-21 17:53:44,329 [Rank 63]: time (ms) | forward-compute: 223.25 | backward-compute: 398.22 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.27 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.51 | 
optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.47 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:53:53,210 [Rank 63]: iteration 1700/ 150000 | consumed samples: 108800 | elapsed time per iteration (ms): 888.1 | learning rate: 2.550E-04 | global batch size: 64 | lm loss: 2.014932E+00 | loss scale: 1.0 | grad norm: 1.424 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.55 | tokens-per-second-per-gpu: 9224.22 | +[ip-26-0-155-69:7]:2023-06-21 17:53:53,210 [Rank 63]: time (ms) | forward-compute: 223.47 | backward-compute: 398.15 | backward-params-all-reduce: 225.99 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.09 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 36.47 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:54:02,092 [Rank 63]: iteration 1710/ 150000 | consumed samples: 109440 | elapsed time per iteration (ms): 888.2 | learning rate: 2.565E-04 | global batch size: 64 | lm loss: 1.982130E+00 | loss scale: 1.0 | grad norm: 1.028 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.53 | tokens-per-second-per-gpu: 9222.91 | +[ip-26-0-155-69:7]:2023-06-21 17:54:02,092 [Rank 63]: time (ms) | forward-compute: 223.23 | backward-compute: 398.14 | backward-params-all-reduce: 225.77 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.87 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 6.08 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 37.01 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:54:10,981 [Rank 63]: iteration 1720/ 150000 | consumed samples: 110080 | elapsed time per iteration (ms): 888.9 | learning rate: 2.580E-04 | global batch size: 64 | lm loss: 1.984422E+00 | loss scale: 1.0 | grad norm: 1.076 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.45 | tokens-per-second-per-gpu: 9215.77 | +[ip-26-0-155-69:7]:2023-06-21 17:54:10,981 [Rank 63]: time (ms) | forward-compute: 223.52 | backward-compute: 398.12 | backward-params-all-reduce: 226.12 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.11 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.07 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 17:54:19,854 [Rank 63]: iteration 1730/ 150000 | consumed samples: 110720 | elapsed time per iteration (ms): 887.3 | learning rate: 2.595E-04 | global batch size: 64 | lm loss: 1.928419E+00 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9232.87 | +[ip-26-0-155-69:7]:2023-06-21 17:54:19,854 [Rank 63]: time (ms) | forward-compute: 223.60 | backward-compute: 398.14 | backward-params-all-reduce: 225.60 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.69 | backward-gather-model-params: 0.01 | 
optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 17:54:28,726 [Rank 63]: iteration 1740/ 150000 | consumed samples: 111360 | elapsed time per iteration (ms): 887.2 | learning rate: 2.610E-04 | global batch size: 64 | lm loss: 1.964355E+00 | loss scale: 1.0 | grad norm: 0.532 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9233.06 | +[ip-26-0-155-69:7]:2023-06-21 17:54:28,727 [Rank 63]: time (ms) | forward-compute: 223.54 | backward-compute: 398.09 | backward-params-all-reduce: 225.77 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:54:37,602 [Rank 63]: iteration 1750/ 150000 | consumed samples: 112000 | elapsed time per iteration (ms): 887.5 | learning rate: 2.625E-04 | global batch size: 64 | lm loss: 1.910443E+00 | loss scale: 1.0 | grad norm: 0.638 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9229.94 | +[ip-26-0-155-69:7]:2023-06-21 17:54:37,602 [Rank 63]: time (ms) | forward-compute: 223.35 | backward-compute: 398.10 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.90 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:54:46,474 [Rank 63]: iteration 1760/ 150000 | consumed samples: 112640 | elapsed time per iteration (ms): 887.2 | learning rate: 2.640E-04 | global batch size: 64 | lm loss: 1.922675E+00 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9233.07 | +[ip-26-0-155-69:7]:2023-06-21 17:54:46,475 [Rank 63]: time (ms) | forward-compute: 223.35 | backward-compute: 398.15 | backward-params-all-reduce: 225.83 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:54:55,357 [Rank 63]: iteration 1770/ 150000 | consumed samples: 113280 | elapsed time per iteration (ms): 888.3 | learning rate: 2.655E-04 | global batch size: 64 | lm loss: 1.933500E+00 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9222.02 | +[ip-26-0-155-69:7]:2023-06-21 17:54:55,358 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.07 | backward-params-all-reduce: 226.09 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 
226.18 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 6.09 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.03 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:55:04,236 [Rank 63]: iteration 1780/ 150000 | consumed samples: 113920 | elapsed time per iteration (ms): 887.9 | learning rate: 2.670E-04 | global batch size: 64 | lm loss: 1.984064E+00 | loss scale: 1.0 | grad norm: 0.498 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9226.77 | +[ip-26-0-155-69:7]:2023-06-21 17:55:04,236 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.08 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 5.47 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.42 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:55:13,132 [Rank 63]: iteration 1790/ 150000 | consumed samples: 114560 | elapsed time per iteration (ms): 889.6 | learning rate: 2.685E-04 | global batch size: 64 | lm loss: 1.964734E+00 | loss scale: 1.0 | grad norm: 1.575 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.36 | tokens-per-second-per-gpu: 9208.16 | +[ip-26-0-155-69:7]:2023-06-21 17:55:13,132 [Rank 63]: time (ms) | forward-compute: 222.83 | backward-compute: 398.18 | backward-params-all-reduce: 226.33 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.43 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 7.32 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 38.29 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:55:22,003 [Rank 63]: iteration 1800/ 150000 | consumed samples: 115200 | elapsed time per iteration (ms): 887.1 | learning rate: 2.700E-04 | global batch size: 64 | lm loss: 1.968569E+00 | loss scale: 1.0 | grad norm: 0.595 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.51 | +[ip-26-0-155-69:7]:2023-06-21 17:55:22,004 [Rank 63]: time (ms) | forward-compute: 222.95 | backward-compute: 398.03 | backward-params-all-reduce: 226.28 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.84 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:55:30,876 [Rank 63]: iteration 1810/ 150000 | consumed samples: 115840 | elapsed time per iteration (ms): 887.2 | learning rate: 2.715E-04 | global batch size: 64 | lm loss: 1.939122E+00 | loss scale: 1.0 | grad norm: 0.444 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9233.04 | +[ip-26-0-155-69:7]:2023-06-21 17:55:30,876 [Rank 63]: time (ms) | forward-compute: 222.91 | backward-compute: 398.21 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | 
backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.26 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.91 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:55:39,747 [Rank 63]: iteration 1820/ 150000 | consumed samples: 116480 | elapsed time per iteration (ms): 887.2 | learning rate: 2.730E-04 | global batch size: 64 | lm loss: 1.910997E+00 | loss scale: 1.0 | grad norm: 0.426 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.77 | +[ip-26-0-155-69:7]:2023-06-21 17:55:39,748 [Rank 63]: time (ms) | forward-compute: 223.12 | backward-compute: 398.14 | backward-params-all-reduce: 226.03 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:55:48,618 [Rank 63]: iteration 1830/ 150000 | consumed samples: 117120 | elapsed time per iteration (ms): 887.0 | learning rate: 2.745E-04 | global batch size: 64 | lm loss: 1.864534E+00 | loss scale: 1.0 | grad norm: 0.498 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.68 | tokens-per-second-per-gpu: 9235.12 | +[ip-26-0-155-69:7]:2023-06-21 17:55:48,618 [Rank 63]: time (ms) | forward-compute: 222.90 | backward-compute: 398.03 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.28 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:55:57,504 [Rank 63]: iteration 1840/ 150000 | consumed samples: 117760 | elapsed time per iteration (ms): 888.6 | learning rate: 2.760E-04 | global batch size: 64 | lm loss: 1.888983E+00 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.48 | tokens-per-second-per-gpu: 9218.55 | +[ip-26-0-155-69:7]:2023-06-21 17:55:57,505 [Rank 63]: time (ms) | forward-compute: 223.24 | backward-compute: 398.08 | backward-params-all-reduce: 226.20 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.29 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 6.09 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.07 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:56:06,390 [Rank 63]: iteration 1850/ 150000 | consumed samples: 118400 | elapsed time per iteration (ms): 888.6 | learning rate: 2.775E-04 | global batch size: 64 | lm loss: 1.936798E+00 | loss scale: 1.0 | grad norm: 0.526 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.49 | tokens-per-second-per-gpu: 9219.29 | +[ip-26-0-155-69:7]:2023-06-21 17:56:06,391 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.08 | backward-params-all-reduce: 
226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.51 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.48 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:56:15,291 [Rank 63]: iteration 1860/ 150000 | consumed samples: 119040 | elapsed time per iteration (ms): 890.1 | learning rate: 2.790E-04 | global batch size: 64 | lm loss: 1.896784E+00 | loss scale: 1.0 | grad norm: 0.453 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.31 | tokens-per-second-per-gpu: 9203.43 | +[ip-26-0-155-69:7]:2023-06-21 17:56:15,292 [Rank 63]: time (ms) | forward-compute: 223.72 | backward-compute: 398.05 | backward-params-all-reduce: 228.31 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 228.42 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.92 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:56:24,175 [Rank 63]: iteration 1870/ 150000 | consumed samples: 119680 | elapsed time per iteration (ms): 888.4 | learning rate: 2.805E-04 | global batch size: 64 | lm loss: 1.881472E+00 | loss scale: 1.0 | grad norm: 0.505 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.52 | +[ip-26-0-155-69:7]:2023-06-21 17:56:24,175 [Rank 63]: time (ms) | forward-compute: 223.62 | backward-compute: 398.02 | backward-params-all-reduce: 226.21 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.31 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 36.46 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:56:33,048 [Rank 63]: iteration 1880/ 150000 | consumed samples: 120320 | elapsed time per iteration (ms): 887.3 | learning rate: 2.820E-04 | global batch size: 64 | lm loss: 1.873608E+00 | loss scale: 1.0 | grad norm: 0.489 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.64 | tokens-per-second-per-gpu: 9232.14 | +[ip-26-0-155-69:7]:2023-06-21 17:56:33,049 [Rank 63]: time (ms) | forward-compute: 223.33 | backward-compute: 398.05 | backward-params-all-reduce: 226.10 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.19 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:56:41,925 [Rank 63]: iteration 1890/ 150000 | consumed samples: 120960 | elapsed time per iteration (ms): 887.7 | learning rate: 2.835E-04 | global batch size: 64 | lm loss: 1.878910E+00 | loss scale: 1.0 | grad norm: 0.452 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.79 | +[ip-26-0-155-69:7]:2023-06-21 17:56:41,925 [Rank 63]: time (ms) | forward-compute: 223.66 | 
backward-compute: 398.04 | backward-params-all-reduce: 226.13 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.81 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:56:50,806 [Rank 63]: iteration 1900/ 150000 | consumed samples: 121600 | elapsed time per iteration (ms): 888.1 | learning rate: 2.850E-04 | global batch size: 64 | lm loss: 1.867750E+00 | loss scale: 1.0 | grad norm: 0.446 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.55 | tokens-per-second-per-gpu: 9224.04 | +[ip-26-0-155-69:7]:2023-06-21 17:56:50,806 [Rank 63]: time (ms) | forward-compute: 223.87 | backward-compute: 398.06 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.28 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.94 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:56:59,708 [Rank 63]: iteration 1910/ 150000 | consumed samples: 122240 | elapsed time per iteration (ms): 890.3 | learning rate: 2.865E-04 | global batch size: 64 | lm loss: 1.903045E+00 | loss scale: 1.0 | grad norm: 0.461 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.29 | tokens-per-second-per-gpu: 9201.73 | +[ip-26-0-155-69:7]:2023-06-21 17:56:59,709 [Rank 63]: time (ms) | forward-compute: 223.93 | backward-compute: 398.16 | backward-params-all-reduce: 227.09 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.18 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.90 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:57:08,726 [Rank 63]: iteration 1920/ 150000 | consumed samples: 122880 | elapsed time per iteration (ms): 901.7 | learning rate: 2.880E-04 | global batch size: 64 | lm loss: 1.804187E+00 | loss scale: 1.0 | grad norm: 0.463 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.94 | tokens-per-second-per-gpu: 9084.71 | +[ip-26-0-155-69:7]:2023-06-21 17:57:08,726 [Rank 63]: time (ms) | forward-compute: 237.47 | backward-compute: 398.10 | backward-params-all-reduce: 226.30 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.39 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:57:18,334 [Rank 63]: iteration 1930/ 150000 | consumed samples: 123520 | elapsed time per iteration (ms): 960.8 | learning rate: 2.895E-04 | global batch size: 64 | lm loss: 1.886427E+00 | loss scale: 1.0 | grad norm: 1.246 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 98.49 | tokens-per-second-per-gpu: 8526.48 | +[ip-26-0-155-69:7]:2023-06-21 17:57:18,334 
[Rank 63]: time (ms) | forward-compute: 257.77 | backward-compute: 398.14 | backward-params-all-reduce: 262.94 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 263.04 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 37.78 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:57:27,268 [Rank 63]: iteration 1940/ 150000 | consumed samples: 124160 | elapsed time per iteration (ms): 893.4 | learning rate: 2.910E-04 | global batch size: 64 | lm loss: 1.821943E+00 | loss scale: 1.0 | grad norm: 0.496 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.53 | +[ip-26-0-155-69:7]:2023-06-21 17:57:27,268 [Rank 63]: time (ms) | forward-compute: 226.21 | backward-compute: 397.94 | backward-params-all-reduce: 226.73 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.83 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.80 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:57:36,161 [Rank 63]: iteration 1950/ 150000 | consumed samples: 124800 | elapsed time per iteration (ms): 889.4 | learning rate: 2.925E-04 | global batch size: 64 | lm loss: 1.846065E+00 | loss scale: 1.0 | grad norm: 0.370 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.40 | tokens-per-second-per-gpu: 9211.12 | +[ip-26-0-155-69:7]:2023-06-21 17:57:36,162 [Rank 63]: time (ms) | forward-compute: 224.99 | backward-compute: 397.97 | backward-params-all-reduce: 226.49 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.59 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:57:45,045 [Rank 63]: iteration 1960/ 150000 | consumed samples: 125440 | elapsed time per iteration (ms): 888.4 | learning rate: 2.940E-04 | global batch size: 64 | lm loss: 1.848107E+00 | loss scale: 1.0 | grad norm: 0.439 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.40 | +[ip-26-0-155-69:7]:2023-06-21 17:57:45,045 [Rank 63]: time (ms) | forward-compute: 224.49 | backward-compute: 398.01 | backward-params-all-reduce: 225.82 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.89 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:57:53,934 [Rank 63]: iteration 1970/ 150000 | consumed samples: 126080 | elapsed time per iteration (ms): 888.9 | learning rate: 2.955E-04 | global batch size: 64 | lm loss: 1.835699E+00 | loss scale: 1.0 | grad norm: 0.533 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.46 | tokens-per-second-per-gpu: 9216.04 
| +[ip-26-0-155-69:7]:2023-06-21 17:57:53,934 [Rank 63]: time (ms) | forward-compute: 223.71 | backward-compute: 398.03 | backward-params-all-reduce: 225.90 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 6.13 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 37.13 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:58:02,814 [Rank 63]: iteration 1980/ 150000 | consumed samples: 126720 | elapsed time per iteration (ms): 888.0 | learning rate: 2.970E-04 | global batch size: 64 | lm loss: 1.814756E+00 | loss scale: 1.0 | grad norm: 0.656 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.56 | tokens-per-second-per-gpu: 9224.94 | +[ip-26-0-155-69:7]:2023-06-21 17:58:02,814 [Rank 63]: time (ms) | forward-compute: 223.60 | backward-compute: 398.00 | backward-params-all-reduce: 226.54 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.65 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.84 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:58:11,721 [Rank 63]: iteration 1990/ 150000 | consumed samples: 127360 | elapsed time per iteration (ms): 890.7 | learning rate: 2.985E-04 | global batch size: 64 | lm loss: 1.804731E+00 | loss scale: 1.0 | grad norm: 0.541 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.24 | tokens-per-second-per-gpu: 9197.44 | +[ip-26-0-155-69:7]:2023-06-21 17:58:11,721 [Rank 63]: time (ms) | forward-compute: 226.70 | backward-compute: 398.09 | backward-params-all-reduce: 225.99 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.09 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:58:20,597 [Rank 63]: iteration 2000/ 150000 | consumed samples: 128000 | elapsed time per iteration (ms): 887.6 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.843141E+00 | loss scale: 1.0 | grad norm: 0.463 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.61 | tokens-per-second-per-gpu: 9229.20 | +[ip-26-0-155-69:7]:2023-06-21 17:58:20,597 [Rank 63]: time (ms) | forward-compute: 223.66 | backward-compute: 398.16 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.05 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:58:29,475 [Rank 63]: iteration 2010/ 150000 | consumed samples: 128640 | elapsed time per iteration (ms): 887.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.846218E+00 | loss scale: 1.0 | grad norm: 0.441 | number of skipped iterations: 0 | number of nan iterations: 0 | 
TFLOPs: 106.58 | tokens-per-second-per-gpu: 9226.83 | +[ip-26-0-155-69:7]:2023-06-21 17:58:29,476 [Rank 63]: time (ms) | forward-compute: 223.69 | backward-compute: 398.27 | backward-params-all-reduce: 225.86 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.96 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 4.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.93 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:58:38,353 [Rank 63]: iteration 2020/ 150000 | consumed samples: 129280 | elapsed time per iteration (ms): 887.7 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.828299E+00 | loss scale: 1.0 | grad norm: 0.364 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.28 | +[ip-26-0-155-69:7]:2023-06-21 17:58:38,353 [Rank 63]: time (ms) | forward-compute: 223.21 | backward-compute: 398.27 | backward-params-all-reduce: 225.76 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.83 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:58:47,381 [Rank 63]: iteration 2030/ 150000 | consumed samples: 129920 | elapsed time per iteration (ms): 902.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.837031E+00 | loss scale: 1.0 | grad norm: 0.462 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.81 | tokens-per-second-per-gpu: 9073.96 | +[ip-26-0-155-69:7]:2023-06-21 17:58:47,381 [Rank 63]: time (ms) | forward-compute: 238.88 | backward-compute: 398.15 | backward-params-all-reduce: 225.84 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.95 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:58:56,356 [Rank 63]: iteration 2040/ 150000 | consumed samples: 130560 | elapsed time per iteration (ms): 897.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.806408E+00 | loss scale: 1.0 | grad norm: 0.349 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.43 | tokens-per-second-per-gpu: 9127.12 | +[ip-26-0-155-69:7]:2023-06-21 17:58:56,356 [Rank 63]: time (ms) | forward-compute: 224.36 | backward-compute: 398.22 | backward-params-all-reduce: 234.67 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 234.77 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.95 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.93 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:59:05,227 [Rank 63]: iteration 2050/ 150000 | consumed samples: 131200 | elapsed time per iteration (ms): 887.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.813356E+00 | loss scale: 1.0 | grad norm: 0.400 | number of skipped 
iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.61 | +[ip-26-0-155-69:7]:2023-06-21 17:59:05,228 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.25 | backward-params-all-reduce: 225.91 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:59:14,101 [Rank 63]: iteration 2060/ 150000 | consumed samples: 131840 | elapsed time per iteration (ms): 887.4 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.818771E+00 | loss scale: 1.0 | grad norm: 0.470 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.63 | tokens-per-second-per-gpu: 9231.31 | +[ip-26-0-155-69:7]:2023-06-21 17:59:14,102 [Rank 63]: time (ms) | forward-compute: 223.20 | backward-compute: 398.28 | backward-params-all-reduce: 225.99 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.08 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:59:22,974 [Rank 63]: iteration 2070/ 150000 | consumed samples: 132480 | elapsed time per iteration (ms): 887.3 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.796845E+00 | loss scale: 1.0 | grad norm: 0.401 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.64 | tokens-per-second-per-gpu: 9232.27 | +[ip-26-0-155-69:7]:2023-06-21 17:59:22,975 [Rank 63]: time (ms) | forward-compute: 222.97 | backward-compute: 398.28 | backward-params-all-reduce: 226.12 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:59:31,879 [Rank 63]: iteration 2080/ 150000 | consumed samples: 133120 | elapsed time per iteration (ms): 890.4 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.824200E+00 | loss scale: 1.0 | grad norm: 0.370 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.27 | tokens-per-second-per-gpu: 9200.08 | +[ip-26-0-155-69:7]:2023-06-21 17:59:31,879 [Rank 63]: time (ms) | forward-compute: 225.75 | backward-compute: 398.24 | backward-params-all-reduce: 226.47 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.56 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:59:40,757 [Rank 63]: iteration 2090/ 150000 | consumed samples: 133760 | elapsed time per iteration (ms): 887.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.760048E+00 | loss 
scale: 1.0 | grad norm: 0.412 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9226.97 | +[ip-26-0-155-69:7]:2023-06-21 17:59:40,757 [Rank 63]: time (ms) | forward-compute: 222.84 | backward-compute: 398.21 | backward-params-all-reduce: 226.93 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.02 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.84 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:59:49,631 [Rank 63]: iteration 2100/ 150000 | consumed samples: 134400 | elapsed time per iteration (ms): 887.4 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.805639E+00 | loss scale: 1.0 | grad norm: 0.564 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.63 | tokens-per-second-per-gpu: 9231.04 | +[ip-26-0-155-69:7]:2023-06-21 17:59:49,632 [Rank 63]: time (ms) | forward-compute: 222.98 | backward-compute: 398.20 | backward-params-all-reduce: 226.35 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.45 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.89 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 17:59:58,520 [Rank 63]: iteration 2110/ 150000 | consumed samples: 135040 | elapsed time per iteration (ms): 888.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.775501E+00 | loss scale: 1.0 | grad norm: 0.398 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.46 | tokens-per-second-per-gpu: 9216.42 | +[ip-26-0-155-69:7]:2023-06-21 17:59:58,520 [Rank 63]: time (ms) | forward-compute: 224.48 | backward-compute: 398.15 | backward-params-all-reduce: 226.28 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.38 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.88 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 18:00:07,399 [Rank 63]: iteration 2120/ 150000 | consumed samples: 135680 | elapsed time per iteration (ms): 887.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.761086E+00 | loss scale: 1.0 | grad norm: 0.475 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9226.59 | +[ip-26-0-155-69:7]:2023-06-21 18:00:07,399 [Rank 63]: time (ms) | forward-compute: 223.72 | backward-compute: 398.08 | backward-params-all-reduce: 226.17 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.26 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 18:00:16,270 [Rank 63]: iteration 2130/ 150000 | consumed samples: 136320 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global 
batch size: 64 | lm loss: 1.825589E+00 | loss scale: 1.0 | grad norm: 0.381 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.87 | +[ip-26-0-155-69:7]:2023-06-21 18:00:16,271 [Rank 63]: time (ms) | forward-compute: 222.78 | backward-compute: 398.11 | backward-params-all-reduce: 226.35 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.44 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 18:00:25,141 [Rank 63]: iteration 2140/ 150000 | consumed samples: 136960 | elapsed time per iteration (ms): 887.0 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.760596E+00 | loss scale: 1.0 | grad norm: 0.346 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.68 | tokens-per-second-per-gpu: 9235.20 | +[ip-26-0-155-69:7]:2023-06-21 18:00:25,141 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.11 | backward-params-all-reduce: 226.01 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.11 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.86 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 18:00:34,022 [Rank 63]: iteration 2150/ 150000 | consumed samples: 137600 | elapsed time per iteration (ms): 888.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.764927E+00 | loss scale: 1.0 | grad norm: 0.420 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.55 | tokens-per-second-per-gpu: 9224.08 | +[ip-26-0-155-69:7]:2023-06-21 18:00:34,023 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.09 | backward-params-all-reduce: 226.92 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 227.03 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 18:00:42,895 [Rank 63]: iteration 2160/ 150000 | consumed samples: 138240 | elapsed time per iteration (ms): 887.3 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.762487E+00 | loss scale: 1.0 | grad norm: 0.949 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9232.88 | +[ip-26-0-155-69:7]:2023-06-21 18:00:42,895 [Rank 63]: time (ms) | forward-compute: 222.95 | backward-compute: 398.11 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 18:00:51,766 [Rank 63]: iteration 2170/ 150000 | consumed samples: 138880 | elapsed time per iteration 
(ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.754759E+00 | loss scale: 1.0 | grad norm: 0.527 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9234.05 | +[ip-26-0-155-69:7]:2023-06-21 18:00:51,766 [Rank 63]: time (ms) | forward-compute: 223.00 | backward-compute: 398.11 | backward-params-all-reduce: 226.17 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.26 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 18:01:00,638 [Rank 63]: iteration 2180/ 150000 | consumed samples: 139520 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.740713E+00 | loss scale: 1.0 | grad norm: 0.444 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.40 | +[ip-26-0-155-69:7]:2023-06-21 18:01:00,639 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.08 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.34 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.84 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 18:01:09,539 [Rank 63]: iteration 2190/ 150000 | consumed samples: 140160 | elapsed time per iteration (ms): 890.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.739953E+00 | loss scale: 1.0 | grad norm: 0.428 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.32 | tokens-per-second-per-gpu: 9203.97 | +[ip-26-0-155-69:7]:2023-06-21 18:01:09,539 [Rank 63]: time (ms) | forward-compute: 225.38 | backward-compute: 398.10 | backward-params-all-reduce: 226.63 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.73 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 18:01:18,414 [Rank 63]: iteration 2200/ 150000 | consumed samples: 140800 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.741342E+00 | loss scale: 1.0 | grad norm: 0.442 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9230.52 | +[ip-26-0-155-69:7]:2023-06-21 18:01:18,414 [Rank 63]: time (ms) | forward-compute: 222.81 | backward-compute: 398.06 | backward-params-all-reduce: 226.72 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.82 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 18:01:27,317 [Rank 63]: iteration 2210/ 150000 | consumed 
samples: 141440 | elapsed time per iteration (ms): 890.3 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.756096E+00 | loss scale: 1.0 | grad norm: 0.364 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.28 | tokens-per-second-per-gpu: 9201.21 | +[ip-26-0-155-69:7]:2023-06-21 18:01:27,317 [Rank 63]: time (ms) | forward-compute: 225.62 | backward-compute: 398.12 | backward-params-all-reduce: 226.64 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.75 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 18:01:36,189 [Rank 63]: iteration 2220/ 150000 | consumed samples: 142080 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.738577E+00 | loss scale: 1.0 | grad norm: 0.474 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.73 | +[ip-26-0-155-69:7]:2023-06-21 18:01:36,189 [Rank 63]: time (ms) | forward-compute: 222.81 | backward-compute: 398.07 | backward-params-all-reduce: 226.32 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.43 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.89 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 18:01:45,059 [Rank 63]: iteration 2230/ 150000 | consumed samples: 142720 | elapsed time per iteration (ms): 887.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.702038E+00 | loss scale: 1.0 | grad norm: 0.623 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.98 | +[ip-26-0-155-69:7]:2023-06-21 18:01:45,060 [Rank 63]: time (ms) | forward-compute: 222.88 | backward-compute: 398.13 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 18:01:53,930 [Rank 63]: iteration 2240/ 150000 | consumed samples: 143360 | elapsed time per iteration (ms): 887.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.779816E+00 | loss scale: 1.0 | grad norm: 0.470 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.95 | +[ip-26-0-155-69:7]:2023-06-21 18:01:53,930 [Rank 63]: time (ms) | forward-compute: 223.21 | backward-compute: 398.17 | backward-params-all-reduce: 225.80 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.90 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 21.99 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.81 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 18:02:02,802 
[Rank 63]: iteration 2250/ 150000 | consumed samples: 144000 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.703117E+00 | loss scale: 1.0 | grad norm: 0.395 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.46 | +[ip-26-0-155-69:7]:2023-06-21 18:02:02,803 [Rank 63]: time (ms) | forward-compute: 223.12 | backward-compute: 398.17 | backward-params-all-reduce: 226.02 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.12 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.82 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 18:02:11,677 [Rank 63]: iteration 2260/ 150000 | consumed samples: 144640 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.744578E+00 | loss scale: 1.0 | grad norm: 0.372 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9230.24 | +[ip-26-0-155-69:7]:2023-06-21 18:02:11,678 [Rank 63]: time (ms) | forward-compute: 222.97 | backward-compute: 398.17 | backward-params-all-reduce: 226.33 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.44 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.90 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 18:02:20,552 [Rank 63]: iteration 2270/ 150000 | consumed samples: 145280 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.698719E+00 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.63 | tokens-per-second-per-gpu: 9230.86 | +[ip-26-0-155-69:7]:2023-06-21 18:02:20,552 [Rank 63]: time (ms) | forward-compute: 222.83 | backward-compute: 398.14 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.05 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 36.44 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 18:02:29,424 [Rank 63]: iteration 2280/ 150000 | consumed samples: 145920 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.711292E+00 | loss scale: 1.0 | grad norm: 0.430 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.44 | +[ip-26-0-155-69:7]:2023-06-21 18:02:29,424 [Rank 63]: time (ms) | forward-compute: 223.25 | backward-compute: 398.11 | backward-params-all-reduce: 225.98 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.08 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.81 | batch-generator: 
1.77 +[ip-26-0-155-69:7]:2023-06-21 18:02:38,293 [Rank 63]: iteration 2290/ 150000 | consumed samples: 146560 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.690299E+00 | loss scale: 1.0 | grad norm: 0.336 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.16 | +[ip-26-0-155-69:7]:2023-06-21 18:02:38,294 [Rank 63]: time (ms) | forward-compute: 222.88 | backward-compute: 398.13 | backward-params-all-reduce: 226.03 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.13 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.84 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 18:02:47,168 [Rank 63]: iteration 2300/ 150000 | consumed samples: 147200 | elapsed time per iteration (ms): 887.4 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.739259E+00 | loss scale: 1.0 | grad norm: 0.317 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.63 | tokens-per-second-per-gpu: 9231.03 | +[ip-26-0-155-69:7]:2023-06-21 18:02:47,168 [Rank 63]: time (ms) | forward-compute: 223.02 | backward-compute: 398.20 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.88 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 18:02:56,039 [Rank 63]: iteration 2310/ 150000 | consumed samples: 147840 | elapsed time per iteration (ms): 887.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.689472E+00 | loss scale: 1.0 | grad norm: 0.328 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.68 | +[ip-26-0-155-69:7]:2023-06-21 18:02:56,039 [Rank 63]: time (ms) | forward-compute: 222.87 | backward-compute: 398.11 | backward-params-all-reduce: 226.25 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.35 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.83 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 18:03:04,914 [Rank 63]: iteration 2320/ 150000 | consumed samples: 148480 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.747082E+00 | loss scale: 1.0 | grad norm: 0.409 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9229.93 | +[ip-26-0-155-69:7]:2023-06-21 18:03:04,915 [Rank 63]: time (ms) | forward-compute: 222.94 | backward-compute: 398.12 | backward-params-all-reduce: 226.05 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 5.47 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | 
optimizer-copy-main-to-model-params: 8.31 | optimizer: 36.42 | batch-generator: 1.74
+[ip-26-0-155-69:7]:2023-06-21 18:03:13,788 [Rank 63]: iteration 2330/ 150000 | consumed samples: 149120 | elapsed time per iteration (ms): 887.3 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.746517E+00 | loss scale: 1.0 | grad norm: 0.334 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.64 | tokens-per-second-per-gpu: 9232.08 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:13,788 [Rank 63]: time (ms) | forward-compute: 222.63 | backward-compute: 398.17 | backward-params-all-reduce: 226.47 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.58 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.97 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.98 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:03:22,657 [Rank 63]: iteration 2340/ 150000 | consumed samples: 149760 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.695744E+00 | loss scale: 1.0 | grad norm: 0.464 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.53 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:22,658 [Rank 63]: time (ms) | forward-compute: 222.78 | backward-compute: 398.11 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.87 | batch-generator: 1.82
+[ip-26-0-155-69:7]:2023-06-21 18:03:31,525 [Rank 63]: iteration 2350/ 150000 | consumed samples: 150400 | elapsed time per iteration (ms): 886.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.696679E+00 | loss scale: 1.0 | grad norm: 0.349 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.70 | tokens-per-second-per-gpu: 9237.27 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:31,526 [Rank 63]: time (ms) | forward-compute: 222.73 | backward-compute: 398.13 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.83 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:03:40,393 [Rank 63]: iteration 2360/ 150000 | consumed samples: 151040 | elapsed time per iteration (ms): 886.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.703598E+00 | loss scale: 1.0 | grad norm: 0.336 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.71 | tokens-per-second-per-gpu: 9237.73 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:40,394 [Rank 63]: time (ms) | forward-compute: 222.71 | backward-compute: 398.16 | backward-params-all-reduce: 226.05 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.83 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:03:49,262 [Rank 63]: iteration 2370/ 150000 | consumed samples: 151680 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.699918E+00 | loss scale: 1.0 | grad norm: 0.313 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.70 | tokens-per-second-per-gpu: 9236.87 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:49,262 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.17 | backward-params-all-reduce: 225.66 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.75 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.89 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:03:58,134 [Rank 63]: iteration 2380/ 150000 | consumed samples: 152320 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.646003E+00 | loss scale: 1.0 | grad norm: 0.406 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.53 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:58,135 [Rank 63]: time (ms) | forward-compute: 223.34 | backward-compute: 398.13 | backward-params-all-reduce: 225.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.97 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.81 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:04:07,003 [Rank 63]: iteration 2390/ 150000 | consumed samples: 152960 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.681774E+00 | loss scale: 1.0 | grad norm: 0.443 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.35 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:07,004 [Rank 63]: time (ms) | forward-compute: 222.92 | backward-compute: 398.16 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.05 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.86 | batch-generator: 1.79
+[ip-26-0-155-69:7]:2023-06-21 18:04:15,879 [Rank 63]: iteration 2400/ 150000 | consumed samples: 153600 | elapsed time per iteration (ms): 887.6 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.655367E+00 | loss scale: 1.0 | grad norm: 0.279 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.61 | tokens-per-second-per-gpu: 9229.47 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:15,880 [Rank 63]: time (ms) | forward-compute: 223.47 | backward-compute: 398.12 | backward-params-all-reduce: 226.07 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.16 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.87 | batch-generator: 1.78
+[ip-26-0-155-69:7]:2023-06-21 18:04:24,754 [Rank 63]: iteration 2410/ 150000 | consumed samples: 154240 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.728302E+00 | loss scale: 1.0 | grad norm: 0.383 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9230.69 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:24,755 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.13 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:04:33,624 [Rank 63]: iteration 2420/ 150000 | consumed samples: 154880 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.646320E+00 | loss scale: 1.0 | grad norm: 0.326 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.17 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:33,624 [Rank 63]: time (ms) | forward-compute: 223.16 | backward-compute: 398.14 | backward-params-all-reduce: 225.76 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 21.99 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.86 | batch-generator: 1.74
+[ip-26-0-155-69:7]:2023-06-21 18:04:42,499 [Rank 63]: iteration 2430/ 150000 | consumed samples: 155520 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.704535E+00 | loss scale: 1.0 | grad norm: 0.447 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9230.11 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:42,499 [Rank 63]: time (ms) | forward-compute: 223.36 | backward-compute: 398.12 | backward-params-all-reduce: 226.19 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.29 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.83 | batch-generator: 1.74
+srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
+slurmstepd: error: *** JOB 161653 ON ip-26-0-150-19 CANCELLED AT 2023-06-21T18:04:43 ***
+WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers
+WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652424 closing signal SIGTERM
+WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652425 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652426 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652427 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281174 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281175 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281176 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652428 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711971 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652429 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652430 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281177 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281178 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281179 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281180 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281181 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711972 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711973 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711974 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711975 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652431 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711976 closing signal SIGTERM