diff --git "a/logs/megatron-1b-code-161653.out" "b/logs/megatron-1b-code-161653.out" new file mode 100644--- /dev/null +++ "b/logs/megatron-1b-code-161653.out" @@ -0,0 +1,9945 @@ ++ source /admin/home/loubna/.bashrc +++ HISTCONTROL=ignoreboth +++ shopt -s histappend +++ HISTSIZE=1000 +++ HISTFILESIZE=2000 +++ shopt -s checkwinsize +++ '[' -x /usr/bin/lesspipe ']' ++++ SHELL=/bin/sh ++++ lesspipe +++ eval 'export LESSOPEN="| /usr/bin/lesspipe %s"; +export LESSCLOSE="/usr/bin/lesspipe %s %s";' ++++ export 'LESSOPEN=| /usr/bin/lesspipe %s' ++++ LESSOPEN='| /usr/bin/lesspipe %s' ++++ export 'LESSCLOSE=/usr/bin/lesspipe %s %s' ++++ LESSCLOSE='/usr/bin/lesspipe %s %s' +++ '[' -z '' ']' +++ '[' -r /etc/debian_chroot ']' +++ case "$TERM" in +++ color_prompt=yes +++ '[' -n '' ']' +++ '[' yes = yes ']' +++ PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++ unset color_prompt force_color_prompt +++ case "$TERM" in +++ PS1='\[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++ '[' -x /usr/bin/dircolors ']' +++ test -r /admin/home/loubna/.dircolors ++++ dircolors -b +++ eval 'LS_COLORS='\''rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:'\''; +export LS_COLORS' ++++ 
LS_COLORS='rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:' ++++ export LS_COLORS +++ alias 'ls=ls --color=auto' +++ alias 'grep=grep --color=auto' +++ alias 'fgrep=fgrep --color=auto' +++ alias 'egrep=egrep --color=auto' +++ alias 'll=ls -alF' +++ alias 'la=ls -A' +++ alias 'l=ls -CF' +++ alias 'alert=notify-send --urgency=low -i "$([ $? = 0 ] && echo terminal || echo error)" "$(history|tail -n1|sed -e '\''s/^\s*[0-9]\+\s*//;s/[;&|]\s*alert$//'\'')"' +++ '[' -f /admin/home/loubna/.bash_aliases ']' +++ shopt -oq posix +++ '[' -f /usr/share/bash-completion/bash_completion ']' +++ . 
/usr/share/bash-completion/bash_completion ++++ BASH_COMPLETION_VERSINFO=(2 10) ++++ [[ ehxB == *v* ]] ++++ BASH_COMPLETION_ORIGINAL_V_VALUE=+v ++++ [[ -n '' ]] ++++ set +v ++++ _blacklist_glob='@(acroread.sh)' ++++ shopt -s extglob progcomp ++++ complete -u groups slay w sux ++++ complete -A stopped -P '"%' -S '"' bg ++++ complete -j -P '"%' -S '"' fg jobs disown ++++ complete -v readonly unset ++++ complete -A setopt set ++++ complete -A shopt shopt ++++ complete -A helptopic help ++++ complete -a unalias ++++ complete -c command type which ++++ complete -b builtin ++++ [[ linux-gnu == *@(solaris|aix)* ]] ++++ [[ linux-gnu == *@(solaris|aix)* ]] ++++ [[ linux-gnu == *@(solaris|aix)* ]] ++++ _backup_glob='@(#*#|*@(~|.@(bak|orig|rej|swp|dpkg*|rpm@(orig|new|save))))' ++++ complete -F _service service ++++ _sysvdirs ++++ sysvdirs=() ++++ [[ -d /etc/rc.d/init.d ]] ++++ [[ -d /etc/init.d ]] ++++ sysvdirs+=(/etc/init.d) ++++ [[ -f /etc/slackware-version ]] ++++ return 0 ++++ for svcdir in "${sysvdirs[@]}" ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/acpid ]] ++++ complete -F _service /etc/init.d/acpid ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/apparmor ]] ++++ complete -F _service /etc/init.d/apparmor ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/apport ]] ++++ complete -F _service /etc/init.d/apport ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/atd ]] ++++ complete -F _service /etc/init.d/atd ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/chrony ]] ++++ complete -F _service /etc/init.d/chrony ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/console-setup.sh ]] ++++ complete -F _service /etc/init.d/console-setup.sh ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/cron ]] ++++ complete -F _service /etc/init.d/cron ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/cryptdisks ]] ++++ complete -F _service /etc/init.d/cryptdisks ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/cryptdisks-early ]] ++++ complete -F _service /etc/init.d/cryptdisks-early ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/datadog-agent ]] ++++ complete -F _service /etc/init.d/datadog-agent ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/datadog-agent-process ]] ++++ complete -F _service /etc/init.d/datadog-agent-process ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/datadog-agent-security ]] ++++ complete -F _service /etc/init.d/datadog-agent-security ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/datadog-agent-trace ]] ++++ complete -F _service /etc/init.d/datadog-agent-trace ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/dbus ]] ++++ complete -F _service /etc/init.d/dbus ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/docker ]] ++++ complete -F _service /etc/init.d/docker ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/gdrdrv ]] ++++ complete -F _service /etc/init.d/gdrdrv ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/grub-common ]] ++++ complete -F _service /etc/init.d/grub-common ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/hwclock.sh ]] ++++ complete -F _service /etc/init.d/hwclock.sh ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/irqbalance ]] ++++ complete -F _service /etc/init.d/irqbalance ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/iscsid ]] ++++ complete -F _service 
/etc/init.d/iscsid ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/iwpmd ]] ++++ complete -F _service /etc/init.d/iwpmd ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/keyboard-setup.sh ]] ++++ complete -F _service /etc/init.d/keyboard-setup.sh ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/kmod ]] ++++ complete -F _service /etc/init.d/kmod ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/lvm2 ]] ++++ complete -F _service /etc/init.d/lvm2 ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/lvm2-lvmpolld ]] ++++ complete -F _service /etc/init.d/lvm2-lvmpolld ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/multipath-tools ]] ++++ complete -F _service /etc/init.d/multipath-tools ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/munge ]] ++++ complete -F _service /etc/init.d/munge ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/nfs-common ]] ++++ complete -F _service /etc/init.d/nfs-common ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/open-iscsi ]] ++++ complete -F _service /etc/init.d/open-iscsi ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/open-vm-tools ]] ++++ complete -F _service /etc/init.d/open-vm-tools ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/plymouth ]] ++++ complete -F _service /etc/init.d/plymouth ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/plymouth-log ]] ++++ complete -F _service /etc/init.d/plymouth-log ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/procps ]] ++++ complete -F _service /etc/init.d/procps ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/rpcbind ]] ++++ complete -F _service /etc/init.d/rpcbind ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/rsync ]] ++++ complete -F _service /etc/init.d/rsync ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/rsyslog ]] ++++ complete -F _service /etc/init.d/rsyslog ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/screen-cleanup ]] ++++ complete -F _service /etc/init.d/screen-cleanup ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/sendmail ]] ++++ complete -F _service /etc/init.d/sendmail ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/ssh ]] ++++ complete -F _service /etc/init.d/ssh ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/sysstat ]] ++++ complete -F _service /etc/init.d/sysstat ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/udev ]] ++++ complete -F _service /etc/init.d/udev ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/ufw ]] ++++ complete -F _service /etc/init.d/ufw ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/unattended-upgrades ]] ++++ complete -F _service /etc/init.d/unattended-upgrades ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/uuidd ]] ++++ complete -F _service /etc/init.d/uuidd ++++ for svc in $svcdir/!($_backup_glob) ++++ [[ -x /etc/init.d/x11-common ]] ++++ complete -F _service /etc/init.d/x11-common ++++ unset svc svcdir sysvdirs ++++ [[ linux-gnu == *freebsd* ]] ++++ shopt -u hostcomplete ++++ complete -F _user_at_host talk ytalk finger ++++ complete -F _known_hosts traceroute traceroute6 fping fping6 telnet rsh rlogin ftp dig mtr ssh-installkeys showmount ++++ shopt -q cdable_vars ++++ complete -F _cd -o nospace cd pushd ++++ complete -F _command aoss command do else eval exec ltrace nice nohup padsp then time tsocks 
vsound xargs ++++ complete -F _root_command fakeroot gksu gksudo kdesudo really ++++ complete -F _longopt a2ps awk base64 bash bc bison cat chroot colordiff cp csplit cut date df diff dir du enscript env expand fmt fold gperf grep grub head irb ld ldd less ln ls m4 md5sum mkdir mkfifo mknod mv netstat nl nm objcopy objdump od paste pr ptx readelf rm rmdir sed seq shasum sha1sum sha224sum sha256sum sha384sum sha512sum shar sort split strip sum tac tail tee texindex touch tr uname unexpand uniq units vdir wc who ++++ [[ 5 -gt 4 ]] ++++ declare -Ag _xspecs ++++ _install_xspec '!*.?(t)bz?(2)' bunzip2 bzcat pbunzip2 pbzcat lbunzip2 lbzcat ++++ local 'xspec=!*.?(t)bz?(2)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.?(t)bz?(2)' ++++ _install_xspec '!*.@(zip|[egjswx]ar|exe|pk3|wsz|zargo|xpi|s[tx][cdiw]|sx[gm]|o[dt][tspgfc]|od[bm]|oxt|epub|apk|aab|ipa|do[ct][xm]|p[op]t[mx]|xl[st][xm]|pyz|whl)' unzip zipinfo ++++ local 'xspec=!*.@(zip|[egjswx]ar|exe|pk3|wsz|zargo|xpi|s[tx][cdiw]|sx[gm]|o[dt][tspgfc]|od[bm]|oxt|epub|apk|aab|ipa|do[ct][xm]|p[op]t[mx]|xl[st][xm]|pyz|whl)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(zip|[egjswx]ar|exe|pk3|wsz|zargo|xpi|s[tx][cdiw]|sx[gm]|o[dt][tspgfc]|od[bm]|oxt|epub|apk|aab|ipa|do[ct][xm]|p[op]t[mx]|xl[st][xm]|pyz|whl)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(zip|[egjswx]ar|exe|pk3|wsz|zargo|xpi|s[tx][cdiw]|sx[gm]|o[dt][tspgfc]|od[bm]|oxt|epub|apk|aab|ipa|do[ct][xm]|p[op]t[mx]|xl[st][xm]|pyz|whl)' ++++ _install_xspec '*.Z' compress znew ++++ local 'xspec=*.Z' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.Z' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.Z' ++++ _install_xspec '!*.@(Z|[gGd]z|t[ag]z)' gunzip zcat ++++ local 'xspec=!*.@(Z|[gGd]z|t[ag]z)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(Z|[gGd]z|t[ag]z)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(Z|[gGd]z|t[ag]z)' ++++ _install_xspec '!*.@(Z|[gGdz]z|t[ag]z)' unpigz ++++ local 'xspec=!*.@(Z|[gGdz]z|t[ag]z)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(Z|[gGdz]z|t[ag]z)' ++++ _install_xspec '!*.Z' uncompress ++++ local 'xspec=!*.Z' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.Z' ++++ _install_xspec '!*.@(tlz|lzma)' lzcat lzegrep lzfgrep lzgrep lzless lzmore unlzma ++++ local 'xspec=!*.@(tlz|lzma)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(tlz|lzma)' ++++ _install_xspec '!*.@(?(t)xz|tlz|lzma)' unxz xzcat ++++ local 'xspec=!*.@(?(t)xz|tlz|lzma)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(t)xz|tlz|lzma)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(t)xz|tlz|lzma)' ++++ _install_xspec '!*.lrz' lrunzip ++++ local 'xspec=!*.lrz' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.lrz' ++++ _install_xspec '!*.@(gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx)' ee ++++ local 'xspec=!*.@(gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx)' 
cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx)' ++++ _install_xspec '!*.@(gif|jp?(e)g|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|svg)' qiv ++++ local 'xspec=!*.@(gif|jp?(e)g|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|svg)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(gif|jp?(e)g|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|svg)' ++++ _install_xspec '!*.@(gif|jp?(e)g?(2)|j2[ck]|jp[2f]|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|?(e)ps)' xv ++++ local 'xspec=!*.@(gif|jp?(e)g?(2)|j2[ck]|jp[2f]|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|?(e)ps)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(gif|jp?(e)g?(2)|j2[ck]|jp[2f]|tif?(f)|png|p[bgp]m|bmp|x[bp]m|rle|rgb|pcx|fits|pm|?(e)ps)' ++++ _install_xspec '!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' gv ggv kghostview ++++ local 'xspec=!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(@(?(e)ps|?(E)PS|pdf|PDF)?(.gz|.GZ|.bz2|.BZ2|.Z))' ++++ _install_xspec '!*.@(dvi|DVI)?(.@(gz|Z|bz2))' xdvi kdvi ++++ local 'xspec=!*.@(dvi|DVI)?(.@(gz|Z|bz2))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(dvi|DVI)?(.@(gz|Z|bz2))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(dvi|DVI)?(.@(gz|Z|bz2))' ++++ _install_xspec '!*.dvi' dvips dviselect dvitype dvipdf advi dvipdfm dvipdfmx ++++ local 'xspec=!*.dvi' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.dvi' ++++ _install_xspec '!*.[pf]df' acroread gpdf ++++ local 'xspec=!*.[pf]df' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.[pf]df' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.[pf]df' ++++ _install_xspec '!*.@(pdf|fdf)?(.@(gz|GZ|bz2|BZ2|Z))' xpdf ++++ local 'xspec=!*.@(pdf|fdf)?(.@(gz|GZ|bz2|BZ2|Z))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(pdf|fdf)?(.@(gz|GZ|bz2|BZ2|Z))' ++++ _install_xspec '!*.@(?(e)ps|pdf)' kpdf ++++ local 'xspec=!*.@(?(e)ps|pdf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ _install_xspec '!*.@(okular|@(?(e|x)ps|?(E|X)PS|[pf]df|[PF]DF|dvi|DVI|cb[rz]|CB[RZ]|djv?(u)|DJV?(U)|dvi|DVI|gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx|GIF|JP?(E)G|MIFF|TIF?(F)|PN[GM]|P[BGP]M|BMP|XPM|ICO|XWD|TGA|PCX|epub|EPUB|odt|ODT|fb?(2)|FB?(2)|mobi|MOBI|g3|G3|chm|CHM)?(.?(gz|GZ|bz2|BZ2|xz|XZ)))' okular ++++ local 'xspec=!*.@(okular|@(?(e|x)ps|?(E|X)PS|[pf]df|[PF]DF|dvi|DVI|cb[rz]|CB[RZ]|djv?(u)|DJV?(U)|dvi|DVI|gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx|GIF|JP?(E)G|MIFF|TIF?(F)|PN[GM]|P[BGP]M|BMP|XPM|ICO|XWD|TGA|PCX|epub|EPUB|odt|ODT|fb?(2)|FB?(2)|mobi|MOBI|g3|G3|chm|CHM)?(.?(gz|GZ|bz2|BZ2|xz|XZ)))' cmd ++++ shift ++++ for cmd in "$@" ++++ 
_xspecs[$cmd]='!*.@(okular|@(?(e|x)ps|?(E|X)PS|[pf]df|[PF]DF|dvi|DVI|cb[rz]|CB[RZ]|djv?(u)|DJV?(U)|dvi|DVI|gif|jp?(e)g|miff|tif?(f)|pn[gm]|p[bgp]m|bmp|xpm|ico|xwd|tga|pcx|GIF|JP?(E)G|MIFF|TIF?(F)|PN[GM]|P[BGP]M|BMP|XPM|ICO|XWD|TGA|PCX|epub|EPUB|odt|ODT|fb?(2)|FB?(2)|mobi|MOBI|g3|G3|chm|CHM)?(.?(gz|GZ|bz2|BZ2|xz|XZ)))' ++++ _install_xspec '!*.pdf' epdfview pdfunite ++++ local 'xspec=!*.pdf' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.pdf' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.pdf' ++++ _install_xspec '!*.@(cb[rz7t]|djv?(u)|?(e)ps|pdf)' zathura ++++ local 'xspec=!*.@(cb[rz7t]|djv?(u)|?(e)ps|pdf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(cb[rz7t]|djv?(u)|?(e)ps|pdf)' ++++ _install_xspec '!*.@(?(e)ps|pdf)' ps2pdf ps2pdf12 ps2pdf13 ps2pdf14 ps2pdfwr ++++ local 'xspec=!*.@(?(e)ps|pdf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(e)ps|pdf)' ++++ _install_xspec '!*.texi*' makeinfo texi2html ++++ local 'xspec=!*.texi*' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.texi*' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.texi*' ++++ _install_xspec '!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' tex latex slitex jadetex pdfjadetex pdftex pdflatex texi2dvi xetex xelatex luatex lualatex ++++ local 'xspec=!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?(la)tex|texi|dtx|ins|ltx|dbj)' ++++ _install_xspec '!*.mp3' mpg123 mpg321 madplay ++++ local 'xspec=!*.mp3' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.mp3' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.mp3' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.mp3' ++++ _install_xspec '!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' xine aaxine fbxine ++++ local 'xspec=!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' cmd ++++ 
shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ _install_xspec '!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM|iso|ISO)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' kaffeine dragon ++++ local 'xspec=!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM|iso|ISO)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM|iso|ISO)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*@(.@(mp?(e)g|MP?(E)G|wm[av]|WM[AV]|avi|AVI|asf|vob|VOB|bin|dat|divx|DIVX|vcd|ps|pes|fli|flv|FLV|fxm|FXM|viv|rm|ram|yuv|mov|MOV|qt|QT|web[am]|WEB[AM]|mp[234]|MP[234]|m?(p)4[av]|M?(P)4[AV]|mkv|MKV|og[agmvx]|OG[AGMVX]|t[ps]|T[PS]|m2t?(s)|M2T?(S)|mts|MTS|wav|WAV|flac|FLAC|asx|ASX|mng|MNG|srt|m[eo]d|M[EO]D|s[3t]m|S[3T]M|it|IT|xm|XM|iso|ISO)|+([0-9]).@(vdr|VDR))?(.@(crdownload|part))' ++++ _install_xspec '!*.@(avi|asf|wmv)' aviplay ++++ local 'xspec=!*.@(avi|asf|wmv)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(avi|asf|wmv)' ++++ _install_xspec '!*.@(rm?(j)|ra?(m)|smi?(l))' realplay ++++ local 'xspec=!*.@(rm?(j)|ra?(m)|smi?(l))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(rm?(j)|ra?(m)|smi?(l))' ++++ _install_xspec '!*.@(mpg|mpeg|avi|mov|qt)' xanim ++++ local 'xspec=!*.@(mpg|mpeg|avi|mov|qt)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mpg|mpeg|avi|mov|qt)' ++++ _install_xspec '!*.@(og[ag]|m3u|flac|spx)' ogg123 ++++ local 
'xspec=!*.@(og[ag]|m3u|flac|spx)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(og[ag]|m3u|flac|spx)' ++++ _install_xspec '!*.@(mp3|og[ag]|pls|m3u)' gqmpeg freeamp ++++ local 'xspec=!*.@(mp3|og[ag]|pls|m3u)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mp3|og[ag]|pls|m3u)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mp3|og[ag]|pls|m3u)' ++++ _install_xspec '!*.fig' xfig ++++ local 'xspec=!*.fig' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.fig' ++++ _install_xspec '!*.@(mid?(i)|cmf)' playmidi ++++ local 'xspec=!*.@(mid?(i)|cmf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mid?(i)|cmf)' ++++ _install_xspec '!*.@(mid?(i)|rmi|rcp|[gr]36|g18|mod|xm|it|x3m|s[3t]m|kar)' timidity ++++ local 'xspec=!*.@(mid?(i)|rmi|rcp|[gr]36|g18|mod|xm|it|x3m|s[3t]m|kar)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mid?(i)|rmi|rcp|[gr]36|g18|mod|xm|it|x3m|s[3t]m|kar)' ++++ _install_xspec '!*.@(669|abc|am[fs]|d[bs]m|dmf|far|it|mdl|m[eo]d|mid?(i)|mt[2m]|oct|okt?(a)|p[st]m|s[3t]m|ult|umx|wav|xm)' modplugplay modplug123 ++++ local 'xspec=!*.@(669|abc|am[fs]|d[bs]m|dmf|far|it|mdl|m[eo]d|mid?(i)|mt[2m]|oct|okt?(a)|p[st]m|s[3t]m|ult|umx|wav|xm)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(669|abc|am[fs]|d[bs]m|dmf|far|it|mdl|m[eo]d|mid?(i)|mt[2m]|oct|okt?(a)|p[st]m|s[3t]m|ult|umx|wav|xm)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(669|abc|am[fs]|d[bs]m|dmf|far|it|mdl|m[eo]d|mid?(i)|mt[2m]|oct|okt?(a)|p[st]m|s[3t]m|ult|umx|wav|xm)' ++++ _install_xspec '*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' vi vim gvim rvim view rview rgvim rgview gview emacs xemacs sxemacs kate kwrite ++++ local 'xspec=*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='*.@([ao]|so|so.!(conf|*/*)|[rs]pm|gif|jp?(e)g|mp3|mp?(e)g|avi|asf|ogg|class)' ++++ _install_xspec '!*.@(zip|z|gz|tgz)' bzme ++++ local 
'xspec=!*.@(zip|z|gz|tgz)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(zip|z|gz|tgz)' ++++ _install_xspec '!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' netscape mozilla lynx galeon dillo elinks amaya epiphany ++++ local 'xspec=!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(?([xX]|[sS])[hH][tT][mM]?([lL]))' ++++ _install_xspec '!*.@(sxw|stw|sxg|sgl|doc?([mx])|dot?([mx])|rtf|txt|htm|html|?(f)odt|ott|odm|pdf)' oowriter lowriter ++++ local 'xspec=!*.@(sxw|stw|sxg|sgl|doc?([mx])|dot?([mx])|rtf|txt|htm|html|?(f)odt|ott|odm|pdf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxw|stw|sxg|sgl|doc?([mx])|dot?([mx])|rtf|txt|htm|html|?(f)odt|ott|odm|pdf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxw|stw|sxg|sgl|doc?([mx])|dot?([mx])|rtf|txt|htm|html|?(f)odt|ott|odm|pdf)' ++++ _install_xspec '!*.@(sxi|sti|pps?(x)|ppt?([mx])|pot?([mx])|?(f)odp|otp)' ooimpress loimpress ++++ local 'xspec=!*.@(sxi|sti|pps?(x)|ppt?([mx])|pot?([mx])|?(f)odp|otp)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxi|sti|pps?(x)|ppt?([mx])|pot?([mx])|?(f)odp|otp)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxi|sti|pps?(x)|ppt?([mx])|pot?([mx])|?(f)odp|otp)' ++++ _install_xspec '!*.@(sxc|stc|xls?([bmx])|xlw|xlt?([mx])|[ct]sv|?(f)ods|ots)' oocalc localc ++++ local 'xspec=!*.@(sxc|stc|xls?([bmx])|xlw|xlt?([mx])|[ct]sv|?(f)ods|ots)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxc|stc|xls?([bmx])|xlw|xlt?([mx])|[ct]sv|?(f)ods|ots)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxc|stc|xls?([bmx])|xlw|xlt?([mx])|[ct]sv|?(f)ods|ots)' ++++ _install_xspec '!*.@(sxd|std|sda|sdd|?(f)odg|otg)' oodraw lodraw ++++ local 'xspec=!*.@(sxd|std|sda|sdd|?(f)odg|otg)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxd|std|sda|sdd|?(f)odg|otg)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxd|std|sda|sdd|?(f)odg|otg)' ++++ _install_xspec '!*.@(sxm|smf|mml|odf)' oomath lomath ++++ local 'xspec=!*.@(sxm|smf|mml|odf)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxm|smf|mml|odf)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(sxm|smf|mml|odf)' ++++ _install_xspec '!*.odb' oobase lobase ++++ local 'xspec=!*.odb' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.odb' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.odb' ++++ _install_xspec '!*.[rs]pm' rpm2cpio ++++ local 'xspec=!*.[rs]pm' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.[rs]pm' ++++ _install_xspec '!*.aux' bibtex ++++ local 'xspec=!*.aux' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.aux' ++++ _install_xspec '!*.po' poedit gtranslator kbabel lokalize ++++ local 'xspec=!*.po' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.po' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.po' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.po' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.po' ++++ _install_xspec '!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' harbour gharbour hbpp ++++ local 
'xspec=!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@([Pp][Rr][Gg]|[Cc][Ll][Pp])' ++++ _install_xspec '!*.[Hh][Rr][Bb]' hbrun ++++ local 'xspec=!*.[Hh][Rr][Bb]' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.[Hh][Rr][Bb]' ++++ _install_xspec '!*.ly' lilypond ly2dvi ++++ local 'xspec=!*.ly' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.ly' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.ly' ++++ _install_xspec '!*.@(dif?(f)|?(d)patch)?(.@([gx]z|bz2|lzma))' cdiff ++++ local 'xspec=!*.@(dif?(f)|?(d)patch)?(.@([gx]z|bz2|lzma))' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(dif?(f)|?(d)patch)?(.@([gx]z|bz2|lzma))' ++++ _install_xspec '!@(*.@(ks|jks|jceks|p12|pfx|bks|ubr|gkr|cer|crt|cert|p7b|pkipath|pem|p10|csr|crl)|cacerts)' portecle ++++ local 'xspec=!@(*.@(ks|jks|jceks|p12|pfx|bks|ubr|gkr|cer|crt|cert|p7b|pkipath|pem|p10|csr|crl)|cacerts)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!@(*.@(ks|jks|jceks|p12|pfx|bks|ubr|gkr|cer|crt|cert|p7b|pkipath|pem|p10|csr|crl)|cacerts)' ++++ _install_xspec '!*.@(mp[234c]|og[ag]|@(fl|a)ac|m4[abp]|spx|tta|w?(a)v|wma|aif?(f)|asf|ape)' kid3 kid3-qt ++++ local 'xspec=!*.@(mp[234c]|og[ag]|@(fl|a)ac|m4[abp]|spx|tta|w?(a)v|wma|aif?(f)|asf|ape)' cmd ++++ shift ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mp[234c]|og[ag]|@(fl|a)ac|m4[abp]|spx|tta|w?(a)v|wma|aif?(f)|asf|ape)' ++++ for cmd in "$@" ++++ _xspecs[$cmd]='!*.@(mp[234c]|og[ag]|@(fl|a)ac|m4[abp]|spx|tta|w?(a)v|wma|aif?(f)|asf|ape)' ++++ unset -f _install_xspec ++++ complete -F _minimal '' ++++ complete -D -F _completion_loader ++++ compat_dir=/etc/bash_completion.d ++++ [[ -d /etc/bash_completion.d ]] ++++ [[ -r /etc/bash_completion.d ]] ++++ [[ -x /etc/bash_completion.d ]] ++++ for i in "$compat_dir"/* ++++ [[ apport_completion != @(@(#*#|*@(~|.@(bak|orig|rej|swp|dpkg*|rpm@(orig|new|save))))|Makefile*|@(acroread.sh)) ]] ++++ [[ -f /etc/bash_completion.d/apport_completion ]] ++++ [[ -r /etc/bash_completion.d/apport_completion ]] ++++ . /etc/bash_completion.d/apport_completion +++++ complete -F _apport-bug -o filenames -o dirnames ubuntu-bug +++++ complete -F _apport-bug -o filenames -o dirnames apport-bug +++++ complete -F _apport-cli -o filenames -o dirnames apport-cli +++++ complete -F _apport-unpack -o filenames -o dirnames apport-unpack +++++ complete -F _apport-collect apport-collect ++++ for i in "$compat_dir"/* ++++ [[ git-prompt != @(@(#*#|*@(~|.@(bak|orig|rej|swp|dpkg*|rpm@(orig|new|save))))|Makefile*|@(acroread.sh)) ]] ++++ [[ -f /etc/bash_completion.d/git-prompt ]] ++++ [[ -r /etc/bash_completion.d/git-prompt ]] ++++ . /etc/bash_completion.d/git-prompt +++++ [[ -e /usr/lib/git-core/git-sh-prompt ]] +++++ . 
/usr/lib/git-core/git-sh-prompt ++++++ __git_printf_supports_v= ++++++ printf -v __git_printf_supports_v -- %s yes ++++ unset compat_dir i _blacklist_glob ++++ user_completion=/admin/home/loubna/.bash_completion ++++ [[ /usr/share/bash-completion/bash_completion != /admin/home/loubna/.bash_completion ]] ++++ [[ -r /admin/home/loubna/.bash_completion ]] ++++ unset user_completion ++++ unset -f have ++++ unset have ++++ set +v ++++ unset BASH_COMPLETION_ORIGINAL_V_VALUE +++ export PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/eval-harness/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++ PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/eval-harness/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin ++++ /fsx/loubna/miniconda3/bin/conda shell.bash hook +++ __conda_setup='export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\'' + +# Copyright (C) 2012 Anaconda, Inc +# SPDX-License-Identifier: BSD-3-Clause + +__conda_exe() ( + "$CONDA_EXE" $_CE_M $_CE_CONDA "$@" +) + +__conda_hashr() { + if [ -n "${ZSH_VERSION:+x}" ]; then + \rehash + elif [ -n "${POSH_VERSION:+x}" ]; then + : # pass + else + \hash -r + fi +} + +__conda_activate() { + if [ -n "${CONDA_PS1_BACKUP:+x}" ]; then + # Handle transition from shell activated with conda <= 4.3 to a subsequent activation + # after conda updated to >= 4.4. See issue #6173. + PS1="$CONDA_PS1_BACKUP" + \unset CONDA_PS1_BACKUP + fi + \local ask_conda + ask_conda="$(PS1="${PS1:-}" __conda_exe shell.posix "$@")" || \return + \eval "$ask_conda" + __conda_hashr +} + +__conda_reactivate() { + \local ask_conda + ask_conda="$(PS1="${PS1:-}" __conda_exe shell.posix reactivate)" || \return + \eval "$ask_conda" + __conda_hashr +} + +conda() { + \local cmd="${1-__missing__}" + case "$cmd" in + activate|deactivate) + __conda_activate "$@" + ;; + install|update|upgrade|remove|uninstall) + __conda_exe "$@" || \return + __conda_reactivate + ;; + *) + __conda_exe "$@" + ;; + esac +} + +if [ -z "${CONDA_SHLVL+x}" ]; then + \export CONDA_SHLVL=0 + # In dev-mode CONDA_EXE is python.exe and on Windows + # it is in a different relative location to condabin. + if [ -n "${_CE_CONDA:+x}" ] && [ -n "${WINDIR+x}" ]; then + PATH="$(\dirname "$CONDA_EXE")/condabin${PATH:+":${PATH}"}" + else + PATH="$(\dirname "$(\dirname "$CONDA_EXE")")/condabin${PATH:+":${PATH}"}" + fi + \export PATH + + # We'\''re not allowing PS1 to be unbound. It must at least be set. + # However, we'\''re not exporting it, which can cause problems when starting a second shell + # via a first shell (i.e. starting zsh from bash). 
+ if [ -z "${PS1+x}" ]; then + PS1= + fi +fi + +conda activate base' +++ '[' 0 -eq 0 ']' +++ eval 'export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\'' + +# Copyright (C) 2012 Anaconda, Inc +# SPDX-License-Identifier: BSD-3-Clause + +__conda_exe() ( + "$CONDA_EXE" $_CE_M $_CE_CONDA "$@" +) + +__conda_hashr() { + if [ -n "${ZSH_VERSION:+x}" ]; then + \rehash + elif [ -n "${POSH_VERSION:+x}" ]; then + : # pass + else + \hash -r + fi +} + +__conda_activate() { + if [ -n "${CONDA_PS1_BACKUP:+x}" ]; then + # Handle transition from shell activated with conda <= 4.3 to a subsequent activation + # after conda updated to >= 4.4. See issue #6173. + PS1="$CONDA_PS1_BACKUP" + \unset CONDA_PS1_BACKUP + fi + \local ask_conda + ask_conda="$(PS1="${PS1:-}" __conda_exe shell.posix "$@")" || \return + \eval "$ask_conda" + __conda_hashr +} + +__conda_reactivate() { + \local ask_conda + ask_conda="$(PS1="${PS1:-}" __conda_exe shell.posix reactivate)" || \return + \eval "$ask_conda" + __conda_hashr +} + +conda() { + \local cmd="${1-__missing__}" + case "$cmd" in + activate|deactivate) + __conda_activate "$@" + ;; + install|update|upgrade|remove|uninstall) + __conda_exe "$@" || \return + __conda_reactivate + ;; + *) + __conda_exe "$@" + ;; + esac +} + +if [ -z "${CONDA_SHLVL+x}" ]; then + \export CONDA_SHLVL=0 + # In dev-mode CONDA_EXE is python.exe and on Windows + # it is in a different relative location to condabin. + if [ -n "${_CE_CONDA:+x}" ] && [ -n "${WINDIR+x}" ]; then + PATH="$(\dirname "$CONDA_EXE")/condabin${PATH:+":${PATH}"}" + else + PATH="$(\dirname "$(\dirname "$CONDA_EXE")")/condabin${PATH:+":${PATH}"}" + fi + \export PATH + + # We'\''re not allowing PS1 to be unbound. It must at least be set. + # However, we'\''re not exporting it, which can cause problems when starting a second shell + # via a first shell (i.e. starting zsh from bash). 
+ if [ -z "${PS1+x}" ]; then + PS1= + fi +fi + +conda activate base' ++++ export CONDA_EXE=/fsx/loubna/miniconda3/bin/conda ++++ CONDA_EXE=/fsx/loubna/miniconda3/bin/conda ++++ export _CE_M= ++++ _CE_M= ++++ export _CE_CONDA= ++++ _CE_CONDA= ++++ export CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python ++++ CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python ++++ '[' -z x ']' ++++ conda activate base ++++ local cmd=activate ++++ case "$cmd" in ++++ __conda_activate activate base ++++ '[' -n '' ']' ++++ local ask_conda +++++ PS1='\[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++++ __conda_exe shell.posix activate base +++++ /fsx/loubna/miniconda3/bin/conda shell.posix activate base ++++ ask_conda='PS1='\''(base) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '\'' +export PATH='\''/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin'\'' +export CONDA_PREFIX='\''/fsx/loubna/miniconda3'\'' +export CONDA_SHLVL='\''3'\'' +export CONDA_DEFAULT_ENV='\''base'\'' +export CONDA_PROMPT_MODIFIER='\''(base) '\'' +export CONDA_PREFIX_2='\''/fsx/loubna/miniconda3/envs/eval-harness'\'' +export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\''' ++++ eval 'PS1='\''(base) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '\'' +export PATH='\''/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin'\'' +export CONDA_PREFIX='\''/fsx/loubna/miniconda3'\'' +export CONDA_SHLVL='\''3'\'' +export CONDA_DEFAULT_ENV='\''base'\'' +export CONDA_PROMPT_MODIFIER='\''(base) '\'' +export CONDA_PREFIX_2='\''/fsx/loubna/miniconda3/envs/eval-harness'\'' +export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\''' +++++ PS1='(base) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++++ export 
PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++++ PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++++ export CONDA_PREFIX=/fsx/loubna/miniconda3 +++++ CONDA_PREFIX=/fsx/loubna/miniconda3 +++++ export CONDA_SHLVL=3 +++++ CONDA_SHLVL=3 +++++ export CONDA_DEFAULT_ENV=base +++++ CONDA_DEFAULT_ENV=base +++++ export 'CONDA_PROMPT_MODIFIER=(base) ' +++++ CONDA_PROMPT_MODIFIER='(base) ' +++++ export CONDA_PREFIX_2=/fsx/loubna/miniconda3/envs/eval-harness +++++ CONDA_PREFIX_2=/fsx/loubna/miniconda3/envs/eval-harness +++++ export CONDA_EXE=/fsx/loubna/miniconda3/bin/conda +++++ CONDA_EXE=/fsx/loubna/miniconda3/bin/conda +++++ export _CE_M= +++++ _CE_M= +++++ export _CE_CONDA= +++++ _CE_CONDA= +++++ export CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python +++++ CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python ++++ __conda_hashr ++++ '[' -n '' ']' ++++ '[' -n '' ']' ++++ hash -r +++ unset __conda_setup +++ export WANDB_CACHE_DIR=/fsx/loubna/.tmp/wandb +++ WANDB_CACHE_DIR=/fsx/loubna/.tmp/wandb +++ export TMPDIR=/fsx/loubna/.tmp +++ TMPDIR=/fsx/loubna/.tmp +++ export HUGGINGFACE_HUB_CACHE=/fsx/loubna/.cache +++ HUGGINGFACE_HUB_CACHE=/fsx/loubna/.cache +++ export HF_DATASETS_CACHE=/fsx/loubna/.cache +++ HF_DATASETS_CACHE=/fsx/loubna/.cache +++ '[' -f /fsx/loubna/google-cloud-sdk/path.bash.inc ']' +++ . /fsx/loubna/google-cloud-sdk/path.bash.inc +++++ command readlink /fsx/loubna/google-cloud-sdk/path.bash.inc +++++ readlink /fsx/loubna/google-cloud-sdk/path.bash.inc ++++ script_link= ++++ script_link=/fsx/loubna/google-cloud-sdk/path.bash.inc ++++ apparent_sdk_dir=/fsx/loubna/google-cloud-sdk ++++ '[' /fsx/loubna/google-cloud-sdk == /fsx/loubna/google-cloud-sdk/path.bash.inc ']' +++++ command cd -P /fsx/loubna/google-cloud-sdk +++++ cd -P /fsx/loubna/google-cloud-sdk +++++ command pwd -P +++++ pwd -P ++++ sdk_dir=/fsx/loubna/google-cloud-sdk ++++ bin_path=/fsx/loubna/google-cloud-sdk/bin ++++ [[ :/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin: != *\:\/\f\s\x\/\l\o\u\b\n\a\/\g\o\o\g\l\e\-\c\l\o\u\d\-\s\d\k\/\b\i\n\:* ]] +++ '[' -f /fsx/loubna/google-cloud-sdk/completion.bash.inc ']' +++ . 
/fsx/loubna/google-cloud-sdk/completion.bash.inc ++++ complete -o nospace -F _python_argcomplete gcloud ++++ unset bq_COMMANDS ++++ complete -F _bq_completer bq ++++ complete -o nospace -F _python_argcomplete gsutil ++ conda activate megatron ++ local cmd=activate ++ case "$cmd" in ++ __conda_activate activate megatron ++ '[' -n '' ']' ++ local ask_conda +++ PS1='(base) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++ __conda_exe shell.posix activate megatron +++ /fsx/loubna/miniconda3/bin/conda shell.posix activate megatron ++ ask_conda='PS1='\''(megatron) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '\'' +export PATH='\''/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/megatron/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin'\'' +export CONDA_PREFIX='\''/fsx/loubna/miniconda3/envs/megatron'\'' +export CONDA_SHLVL='\''4'\'' +export CONDA_DEFAULT_ENV='\''megatron'\'' +export CONDA_PROMPT_MODIFIER='\''(megatron) '\'' +export CONDA_PREFIX_3='\''/fsx/loubna/miniconda3'\'' +export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\''' ++ eval 'PS1='\''(megatron) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '\'' +export PATH='\''/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/megatron/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin'\'' +export CONDA_PREFIX='\''/fsx/loubna/miniconda3/envs/megatron'\'' +export CONDA_SHLVL='\''4'\'' +export CONDA_DEFAULT_ENV='\''megatron'\'' +export CONDA_PROMPT_MODIFIER='\''(megatron) '\'' +export CONDA_PREFIX_3='\''/fsx/loubna/miniconda3'\'' +export CONDA_EXE='\''/fsx/loubna/miniconda3/bin/conda'\'' +export _CE_M='\'''\'' +export _CE_CONDA='\'''\'' +export CONDA_PYTHON_EXE='\''/fsx/loubna/miniconda3/bin/python'\''' +++ PS1='(megatron) \[\e]0;\u@\h: \w\a\]${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +++ export PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/megatron/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++ 
PATH=/opt/slurm/bin:/opt/slurm/sbin:/opt/slurm/bin:/opt/slurm/sbin:/admin/home/loubna/.vscode-server/bin/d045a5eda657f4d7b676dedbfa7aab8207f8a075/bin/remote-cli:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/fsx/loubna/google-cloud-sdk/bin:/fsx/loubna/miniconda3/envs/megatron/bin:/fsx/loubna/miniconda3/condabin:/opt/slurm/bin:/opt/slurm/sbin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin +++ export CONDA_PREFIX=/fsx/loubna/miniconda3/envs/megatron +++ CONDA_PREFIX=/fsx/loubna/miniconda3/envs/megatron +++ export CONDA_SHLVL=4 +++ CONDA_SHLVL=4 +++ export CONDA_DEFAULT_ENV=megatron +++ CONDA_DEFAULT_ENV=megatron +++ export 'CONDA_PROMPT_MODIFIER=(megatron) ' +++ CONDA_PROMPT_MODIFIER='(megatron) ' +++ export CONDA_PREFIX_3=/fsx/loubna/miniconda3 +++ CONDA_PREFIX_3=/fsx/loubna/miniconda3 +++ export CONDA_EXE=/fsx/loubna/miniconda3/bin/conda +++ CONDA_EXE=/fsx/loubna/miniconda3/bin/conda +++ export _CE_M= +++ _CE_M= +++ export _CE_CONDA= +++ _CE_CONDA= +++ export CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python +++ CONDA_PYTHON_EXE=/fsx/loubna/miniconda3/bin/python ++ __conda_hashr ++ '[' -n '' ']' ++ '[' -n '' ']' ++ hash -r +++ date ++ echo 'START TIME: Wed Jun 21 17:26:40 UTC 2023' +START TIME: Wed Jun 21 17:26:40 UTC 2023 ++ SCRIPT_REPO=/fsx/loubna/code/Megatron-LM ++ pushd /fsx/loubna/code/Megatron-LM +/fsx/loubna/code/Megatron-LM /fsx/loubna/code/fork/brrr/examples/gpt2_mqa/hub_logs ++ LOG_PATH=/fsx/loubna/code/Megatron-LM/main_log.txt ++ GPUS_PER_NODE=8 +++ head -n 1 +++ scontrol show hostnames 'ip-26-0-150-[19,31,54,70,122],ip-26-0-151-187,ip-26-0-155-[46,69]' ++ MASTER_ADDR=ip-26-0-150-19 ++ MASTER_PORT=6000 ++ NNODES=8 ++ NODE_RANK=0 ++ WORLD_SIZE=64 ++ CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/1b-starcoder ++ TOKENIZER_FILE=/fsx/loubna/starcoder-tokenizer/15b/tokenizer.json ++ WEIGHTS_TRAIN=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp ++ WEIGHTS_VALID=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp ++ mkdir -p /fsx/bigcode/experiments/pretraining/1b-starcoder/tensorboard ++ GPT_ARGS=' --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 --eval-interval 10000 --eval-iters 2 --valid-num-workers 0 ' ++ TENSORBOARD_ARGS='--tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug' ++ CMD=' /fsx/loubna/code/Megatron-LM/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 
--eval-interval 10000 --eval-iters 2 --valid-num-workers 0 --tokenizer-type TokenizerFromFile --tokenizer-file /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json --save /fsx/bigcode/experiments/pretraining/1b-starcoder --load /fsx/bigcode/experiments/pretraining/1b-starcoder --train-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp --valid-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp --structured-logs --structured-logs-dir /fsx/bigcode/experiments/pretraining/1b-starcoder/logs --tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug --wandb-entity-name loubnabnl --wandb-project-name 1b-model ' ++ export 'LAUNCHER=python -u -m torch.distributed.run --nproc_per_node 8 --nnodes 8 --rdzv_endpoint ip-26-0-150-19:6000 --rdzv_backend c10d --max_restarts 0 --tee 3 ' ++ LAUNCHER='python -u -m torch.distributed.run --nproc_per_node 8 --nnodes 8 --rdzv_endpoint ip-26-0-150-19:6000 --rdzv_backend c10d --max_restarts 0 --tee 3 ' ++ echo /fsx/loubna/code/Megatron-LM/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 --eval-interval 10000 --eval-iters 2 --valid-num-workers 0 --tokenizer-type TokenizerFromFile --tokenizer-file /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json --save /fsx/bigcode/experiments/pretraining/1b-starcoder --load /fsx/bigcode/experiments/pretraining/1b-starcoder --train-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp --valid-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp --structured-logs --structured-logs-dir /fsx/bigcode/experiments/pretraining/1b-starcoder/logs --tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug --wandb-entity-name loubnabnl --wandb-project-name 1b-model +/fsx/loubna/code/Megatron-LM/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 --eval-interval 10000 --eval-iters 2 --valid-num-workers 0 --tokenizer-type TokenizerFromFile --tokenizer-file /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json --save /fsx/bigcode/experiments/pretraining/1b-starcoder --load /fsx/bigcode/experiments/pretraining/1b-starcoder --train-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp --valid-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp --structured-logs --structured-logs-dir 
/fsx/bigcode/experiments/pretraining/1b-starcoder/logs --tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug --wandb-entity-name loubnabnl --wandb-project-name 1b-model ++ export NCCL_ASYNC_ERROR_HANDLING=1 ++ NCCL_ASYNC_ERROR_HANDLING=1 ++ export NCCL_PROTO=simple ++ NCCL_PROTO=simple ++ export RDMAV_FORK_SAFE=1 ++ RDMAV_FORK_SAFE=1 ++ export FI_EFA_FORK_SAFE=1 ++ FI_EFA_FORK_SAFE=1 ++ export FI_EFA_USE_DEVICE_RDMA=1 ++ FI_EFA_USE_DEVICE_RDMA=1 ++ export FI_PROVIDER=efa ++ FI_PROVIDER=efa ++ export FI_LOG_LEVEL=1 ++ FI_LOG_LEVEL=1 ++ export NCCL_IB_DISABLE=1 ++ NCCL_IB_DISABLE=1 ++ export NCCL_SOCKET_IFNAME=ens ++ NCCL_SOCKET_IFNAME=ens ++ export CUDA_HOME=/usr/local/cuda-11.6 ++ CUDA_HOME=/usr/local/cuda-11.6 ++ SRUN_ARGS=' --wait=60 --kill-on-bad-exit=1 ' ++ clear ++ srun --wait=60 --kill-on-bad-exit=1 --jobid 161653 bash -c 'python -u -m torch.distributed.run --nproc_per_node 8 --nnodes 8 --rdzv_endpoint ip-26-0-150-19:6000 --rdzv_backend c10d --max_restarts 0 --tee 3 --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: /fsx/loubna/code/Megatron-LM/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 24 --hidden-size 2048 --num-attention-heads 16 --attention-head-type multiquery --init-method-std 0.02209 --seq-length 8192 --max-position-embeddings 8192 --attention-dropout 0.1 --hidden-dropout 0.1 --micro-batch-size 1 --global-batch-size 64 --lr 0.0003 --min-lr 0.00003 --train-iters 150000 --lr-decay-iters 150000 --lr-decay-style cosine --lr-warmup-iters 2000 --weight-decay .1 --adam-beta2 .95 --clip-grad 1.0 --bf16 --use-flash-attn --log-interval 10 --save-interval 10000 --eval-interval 10000 --eval-iters 2 --valid-num-workers 0 --tokenizer-type TokenizerFromFile --tokenizer-file /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json --save /fsx/bigcode/experiments/pretraining/1b-starcoder --load /fsx/bigcode/experiments/pretraining/1b-starcoder --train-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp --valid-weighted-split-paths-path /fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp --structured-logs --structured-logs-dir /fsx/bigcode/experiments/pretraining/1b-starcoder/logs --tensorboard-dir /fsx/loubna/br4-experiments/tensorboard/debug --wandb-entity-name loubnabnl --wandb-project-name 1b-model ' ++ tee /fsx/loubna/code/Megatron-LM/main_log.txt +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +WARNING:__main__: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[ip-26-0-150-122:0]:using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +[ip-26-0-150-122:0]:WARNING: overriding default arguments for tokenizer_type:GPT2BPETokenizer with tokenizer_type:TokenizerFromFile +[ip-26-0-150-122:0]:accumulate and all-reduce gradients in fp32 for bfloat16 data type. +[ip-26-0-150-122:0]:using torch.bfloat16 for parameters ... +[ip-26-0-150-122:0]:------------------------ arguments ------------------------ +[ip-26-0-150-122:0]: accumulate_allreduce_grads_in_fp32 .............. True +[ip-26-0-150-122:0]: adam_beta1 ...................................... 0.9 +[ip-26-0-150-122:0]: adam_beta2 ...................................... 0.95 +[ip-26-0-150-122:0]: adam_eps ........................................ 1e-08 +[ip-26-0-150-122:0]: adlr_autoresume ................................. False +[ip-26-0-150-122:0]: adlr_autoresume_interval ........................ 1000 +[ip-26-0-150-122:0]: apply_query_key_layer_scaling ................... True +[ip-26-0-150-122:0]: apply_residual_connection_post_layernorm ........ False +[ip-26-0-150-122:0]: async_tensor_model_parallel_allreduce ........... True +[ip-26-0-150-122:0]: attention_dropout ............................... 0.1 +[ip-26-0-150-122:0]: attention_head_type ............................. multiquery +[ip-26-0-150-122:0]: attention_softmax_in_fp32 ....................... False +[ip-26-0-150-122:0]: bert_binary_head ................................ True +[ip-26-0-150-122:0]: bert_load ....................................... None +[ip-26-0-150-122:0]: bf16 ............................................ True +[ip-26-0-150-122:0]: bias_dropout_fusion ............................. True +[ip-26-0-150-122:0]: bias_gelu_fusion ................................ True +[ip-26-0-150-122:0]: biencoder_projection_dim ........................ 
0 +[ip-26-0-150-122:0]: biencoder_shared_query_context_model ............ False +[ip-26-0-150-122:0]: block_data_path ................................. None +[ip-26-0-150-122:0]: classes_fraction ................................ 1.0 +[ip-26-0-150-122:0]: clip_grad ....................................... 1.0 +[ip-26-0-150-122:0]: consumed_train_samples .......................... 0 +[ip-26-0-150-122:0]: consumed_valid_samples .......................... 0 +[ip-26-0-150-122:0]: data_impl ....................................... infer +[ip-26-0-150-122:0]: data_parallel_random_init ....................... False +[ip-26-0-150-122:0]: data_parallel_size .............................. 64 +[ip-26-0-150-122:0]: data_path ....................................... None +[ip-26-0-150-122:0]: data_per_class_fraction ......................... 1.0 +[ip-26-0-150-122:0]: data_sharding ................................... True +[ip-26-0-150-122:0]: dataloader_type ................................. single +[ip-26-0-150-122:0]: DDP_impl ........................................ local +[ip-26-0-150-122:0]: decoder_seq_length .............................. None +[ip-26-0-150-122:0]: dino_bottleneck_size ............................ 256 +[ip-26-0-150-122:0]: dino_freeze_last_layer .......................... 1 +[ip-26-0-150-122:0]: dino_head_hidden_size ........................... 2048 +[ip-26-0-150-122:0]: dino_local_crops_number ......................... 10 +[ip-26-0-150-122:0]: dino_local_img_size ............................. 96 +[ip-26-0-150-122:0]: dino_norm_last_layer ............................ False +[ip-26-0-150-122:0]: dino_teacher_temp ............................... 0.07 +[ip-26-0-150-122:0]: dino_warmup_teacher_temp ........................ 0.04 +[ip-26-0-150-122:0]: dino_warmup_teacher_temp_epochs ................. 30 +[ip-26-0-150-122:0]: distribute_saved_activations .................... False +[ip-26-0-150-122:0]: distributed_backend ............................. nccl +[ip-26-0-150-122:0]: distributed_timeout ............................. 600 +[ip-26-0-150-122:0]: embedding_path .................................. None +[ip-26-0-150-122:0]: empty_unused_memory_level ....................... 0 +[ip-26-0-150-122:0]: encoder_seq_length .............................. 8192 +[ip-26-0-150-122:0]: end_weight_decay ................................ 0.1 +[ip-26-0-150-122:0]: eod_mask_loss ................................... False +[ip-26-0-150-122:0]: eval_interval ................................... 10000 +[ip-26-0-150-122:0]: eval_iters ...................................... 2 +[ip-26-0-150-122:0]: evidence_data_path .............................. None +[ip-26-0-150-122:0]: exit_duration_in_mins ........................... None +[ip-26-0-150-122:0]: exit_interval ................................... None +[ip-26-0-150-122:0]: exit_signal_handler ............................. False +[ip-26-0-150-122:0]: ffn_hidden_size ................................. 8192 +[ip-26-0-150-122:0]: fim_rate ........................................ 0.0 +[ip-26-0-150-122:0]: fim_spm_rate .................................... 0.5 +[ip-26-0-150-122:0]: finetune ........................................ False +[ip-26-0-150-122:0]: finetune_from ................................... None +[ip-26-0-150-122:0]: fp16 ............................................ False +[ip-26-0-150-122:0]: fp16_lm_cross_entropy ........................... False +[ip-26-0-150-122:0]: fp32_residual_connection ........................ 
False +[ip-26-0-150-122:0]: global_batch_size ............................... 64 +[ip-26-0-150-122:0]: glu_activation .................................. None +[ip-26-0-150-122:0]: gradient_accumulation_fusion .................... True +[ip-26-0-150-122:0]: head_lr_mult .................................... 1.0 +[ip-26-0-150-122:0]: hidden_dropout .................................. 0.1 +[ip-26-0-150-122:0]: hidden_size ..................................... 2048 +[ip-26-0-150-122:0]: hysteresis ...................................... 2 +[ip-26-0-150-122:0]: ict_head_size ................................... None +[ip-26-0-150-122:0]: ict_load ........................................ None +[ip-26-0-150-122:0]: img_h ........................................... 224 +[ip-26-0-150-122:0]: img_w ........................................... 224 +[ip-26-0-150-122:0]: indexer_batch_size .............................. 128 +[ip-26-0-150-122:0]: indexer_log_interval ............................ 1000 +[ip-26-0-150-122:0]: inference_batch_times_seqlen_threshold .......... 512 +[ip-26-0-150-122:0]: init_method_std ................................. 0.02209 +[ip-26-0-150-122:0]: init_method_xavier_uniform ...................... False +[ip-26-0-150-122:0]: initial_loss_scale .............................. 4294967296 +[ip-26-0-150-122:0]: iter_per_epoch .................................. 1250 +[ip-26-0-150-122:0]: kv_channels ..................................... 128 +[ip-26-0-150-122:0]: layernorm_epsilon ............................... 1e-05 +[ip-26-0-150-122:0]: lazy_mpu_init ................................... None +[ip-26-0-150-122:0]: load ............................................ /fsx/bigcode/experiments/pretraining/1b-starcoder +[ip-26-0-150-122:0]: local_rank ...................................... 0 +[ip-26-0-150-122:0]: log_batch_size_to_tensorboard ................... False +[ip-26-0-150-122:0]: log_interval .................................... 10 +[ip-26-0-150-122:0]: log_learning_rate_to_tensorboard ................ True +[ip-26-0-150-122:0]: log_loss_scale_to_tensorboard ................... True +[ip-26-0-150-122:0]: log_memory_to_tensorboard ....................... False +[ip-26-0-150-122:0]: log_num_zeros_in_grad ........................... False +[ip-26-0-150-122:0]: log_params_norm ................................. False +[ip-26-0-150-122:0]: log_timers_to_tensorboard ....................... False +[ip-26-0-150-122:0]: log_validation_ppl_to_tensorboard ............... False +[ip-26-0-150-122:0]: log_world_size_to_tensorboard ................... False +[ip-26-0-150-122:0]: loss_scale ...................................... None +[ip-26-0-150-122:0]: loss_scale_window ............................... 1000 +[ip-26-0-150-122:0]: lr .............................................. 0.0003 +[ip-26-0-150-122:0]: lr_decay_iters .................................. 150000 +[ip-26-0-150-122:0]: lr_decay_samples ................................ None +[ip-26-0-150-122:0]: lr_decay_style .................................. cosine +[ip-26-0-150-122:0]: lr_warmup_fraction .............................. None +[ip-26-0-150-122:0]: lr_warmup_iters ................................. 2000 +[ip-26-0-150-122:0]: lr_warmup_samples ............................... 0 +[ip-26-0-150-122:0]: make_vocab_size_divisible_by .................... 128 +[ip-26-0-150-122:0]: mask_factor ..................................... 1.0 +[ip-26-0-150-122:0]: mask_prob ....................................... 
0.15 +[ip-26-0-150-122:0]: mask_type ....................................... random +[ip-26-0-150-122:0]: masked_softmax_fusion ........................... True +[ip-26-0-150-122:0]: max_position_embeddings ......................... 8192 +[ip-26-0-150-122:0]: merge_file ...................................... None +[ip-26-0-150-122:0]: micro_batch_size ................................ 1 +[ip-26-0-150-122:0]: min_loss_scale .................................. 1.0 +[ip-26-0-150-122:0]: min_lr .......................................... 3e-05 +[ip-26-0-150-122:0]: mmap_warmup ..................................... False +[ip-26-0-150-122:0]: no_load_optim ................................... None +[ip-26-0-150-122:0]: no_load_rng ..................................... None +[ip-26-0-150-122:0]: no_persist_layer_norm ........................... False +[ip-26-0-150-122:0]: no_save_optim ................................... None +[ip-26-0-150-122:0]: no_save_rng ..................................... None +[ip-26-0-150-122:0]: num_attention_heads ............................. 16 +[ip-26-0-150-122:0]: num_channels .................................... 3 +[ip-26-0-150-122:0]: num_classes ..................................... 1000 +[ip-26-0-150-122:0]: num_experts ..................................... None +[ip-26-0-150-122:0]: num_layers ...................................... 24 +[ip-26-0-150-122:0]: num_layers_per_virtual_pipeline_stage ........... None +[ip-26-0-150-122:0]: num_workers ..................................... 2 +[ip-26-0-150-122:0]: onnx_safe ....................................... None +[ip-26-0-150-122:0]: openai_gelu ..................................... False +[ip-26-0-150-122:0]: optimizer ....................................... adam +[ip-26-0-150-122:0]: override_opt_param_scheduler .................... False +[ip-26-0-150-122:0]: params_dtype .................................... torch.bfloat16 +[ip-26-0-150-122:0]: patch_dim ....................................... 16 +[ip-26-0-150-122:0]: perform_initialization .......................... True +[ip-26-0-150-122:0]: pipeline_model_parallel_size .................... 1 +[ip-26-0-150-122:0]: pipeline_model_parallel_split_rank .............. None +[ip-26-0-150-122:0]: position_embedding_type ......................... PositionEmbeddingType.absolute +[ip-26-0-150-122:0]: query_in_block_prob ............................. 0.1 +[ip-26-0-150-122:0]: rampup_batch_size ............................... None +[ip-26-0-150-122:0]: rank ............................................ 0 +[ip-26-0-150-122:0]: recompute_granularity ........................... None +[ip-26-0-150-122:0]: recompute_method ................................ None +[ip-26-0-150-122:0]: recompute_num_layers ............................ 1 +[ip-26-0-150-122:0]: reset_attention_mask ............................ False +[ip-26-0-150-122:0]: reset_position_ids .............................. False +[ip-26-0-150-122:0]: retriever_report_topk_accuracies ................ [] +[ip-26-0-150-122:0]: retriever_score_scaling ......................... False +[ip-26-0-150-122:0]: retriever_seq_length ............................ 256 +[ip-26-0-150-122:0]: sample_rate ..................................... 1.0 +[ip-26-0-150-122:0]: save ............................................ /fsx/bigcode/experiments/pretraining/1b-starcoder +[ip-26-0-150-122:0]: save_interval ................................... 10000 +[ip-26-0-150-122:0]: scatter_gather_tensors_in_pipeline .............. 
True +[ip-26-0-150-122:0]: seed ............................................ 1234 +[ip-26-0-150-122:0]: seq_length ...................................... 8192 +[ip-26-0-150-122:0]: sequence_parallel ............................... False +[ip-26-0-150-122:0]: sgd_momentum .................................... 0.9 +[ip-26-0-150-122:0]: short_seq_prob .................................. 0.1 +[ip-26-0-150-122:0]: split ........................................... None +[ip-26-0-150-122:0]: standalone_embedding_stage ...................... False +[ip-26-0-150-122:0]: start_weight_decay .............................. 0.1 +[ip-26-0-150-122:0]: structured_logs ................................. True +[ip-26-0-150-122:0]: structured_logs_dir ............................. /fsx/bigcode/experiments/pretraining/1b-starcoder/logs +[ip-26-0-150-122:0]: swin_backbone_type .............................. tiny +[ip-26-0-150-122:0]: tensor_model_parallel_size ...................... 1 +[ip-26-0-150-122:0]: tensorboard_dir ................................. /fsx/loubna/br4-experiments/tensorboard/debug +[ip-26-0-150-122:0]: tensorboard_log_interval ........................ 1 +[ip-26-0-150-122:0]: tensorboard_queue_size .......................... 1000 +[ip-26-0-150-122:0]: test_weighted_split_paths ....................... None +[ip-26-0-150-122:0]: test_weighted_split_paths_path .................. None +[ip-26-0-150-122:0]: titles_data_path ................................ None +[ip-26-0-150-122:0]: tokenizer_file .................................. /fsx/loubna/starcoder-tokenizer/15b/tokenizer.json +[ip-26-0-150-122:0]: tokenizer_type .................................. TokenizerFromFile +[ip-26-0-150-122:0]: train_iters ..................................... 150000 +[ip-26-0-150-122:0]: train_samples ................................... None +[ip-26-0-150-122:0]: train_weighted_split_names ...................... ['TRAIN'] +[ip-26-0-150-122:0]: train_weighted_split_paths ...................... 
[['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document', 
'/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document', 
'/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document']] +[ip-26-0-150-122:0]: train_weighted_split_paths_path ................. None +[ip-26-0-150-122:0]: train_weighted_split_splits ..................... 
[['0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969', '0:0.969']] +[ip-26-0-150-122:0]: train_weighted_split_weights .................... [['3.0', '0.01', '53.89', '1.78', '0.85', '5.68', '0.01', '1.31', '0.98', '0.08', '0.03', '0.09', '1.12', '23.78', '0.7', '0.61', '0.26', '1.68', '2.23', '0.3', '0.31', '0.45', '0.12', '6.81', '9.11', '0.06', '44.66', '0.58', '2.23', '0.01', '1.25', '1.03', '1.31', '2.87', '0.01', '0.05', '3.32', '0.03', '0.19', '0.39', '5.2', '0.02', '1.56', '0.01', '0.07', '0.41', '3.66', '0.56', '0.03', '0.001', '0.23', '0.02', '0.01', '4.69', '0.35', '0.33', '0.01', '3.09', '0.46', '0.2', '0.05', '0.04', '11.09', '0.4', '0.3', '0.42', '48.92', '0.64', '1.4', '0.71', '0.91', '29.36', '86.94', '64.71', '74.93', '60.89', '60.4', '26.52', '0.001', '1.42', '0.94', '0.01', '0.0002', '0.11', '0.18', '0.05', '1.0', '1.0', '54.4', '32.0', '7.12', '6.0']] +[ip-26-0-150-122:0]: transformer_pipeline_model_parallel_size ........ 1 +[ip-26-0-150-122:0]: transformer_timers .............................. False +[ip-26-0-150-122:0]: use_checkpoint_args ............................. False +[ip-26-0-150-122:0]: use_checkpoint_opt_param_scheduler .............. False +[ip-26-0-150-122:0]: use_contiguous_buffers_in_local_ddp ............. True +[ip-26-0-150-122:0]: use_cpu_initialization .......................... None +[ip-26-0-150-122:0]: use_distributed_optimizer ....................... False +[ip-26-0-150-122:0]: use_flash_attn .................................. True +[ip-26-0-150-122:0]: use_one_sent_docs ............................... False +[ip-26-0-150-122:0]: valid_num_workers ............................... 0 +[ip-26-0-150-122:0]: valid_weighted_split_names ...................... 
['VALID_css', 'VALID_prolog', 'VALID_c', 'VALID_fortran', 'VALID_solidity', 'VALID_kotlin', 'VALID_literate-agda', 'VALID_julia', 'VALID_java-server-pages', 'VALID_isabelle', 'VALID_idris', 'VALID_lean', 'VALID_powershell', 'VALID_go', 'VALID_erlang', 'VALID_f-sharp', 'VALID_ada', 'VALID_pascal', 'VALID_perl', 'VALID_r', 'VALID_protocol-buffer', 'VALID_cmake', 'VALID_sas', 'VALID_ruby', 'VALID_rust', 'VALID_rmarkdown', 'VALID_c-sharp', 'VALID_smalltalk', 'VALID_haskell', 'VALID_maple', 'VALID_mathematica', 'VALID_ocaml', 'VALID_makefile', 'VALID_lua', 'VALID_literate-coffeescript', 'VALID_literate-haskell', 'VALID_restructuredtext', 'VALID_racket', 'VALID_standard-ml', 'VALID_systemverilog', 'VALID_tex', 'VALID_awk', 'VALID_assembly', 'VALID_alloy', 'VALID_agda', 'VALID_emacs-lisp', 'VALID_dart', 'VALID_cuda', 'VALID_bluespec', 'VALID_augeas', 'VALID_batchfile', 'VALID_tcsh', 'VALID_stan', 'VALID_scala', 'VALID_tcl', 'VALID_stata', 'VALID_applescript', 'VALID_shell', 'VALID_clojure', 'VALID_scheme', 'VALID_antlr', 'VALID_sparql', 'VALID_sql', 'VALID_glsl', 'VALID_elm', 'VALID_dockerfile', 'VALID_cpp', 'VALID_coffeescript', 'VALID_common-lisp', 'VALID_elixir', 'VALID_groovy', 'VALID_html', 'VALID_java', 'VALID_javascript', 'VALID_markdown', 'VALID_php', 'VALID_python', 'VALID_typescript', 'VALID_verilog', 'VALID_visual-basic', 'VALID_vhdl', 'VALID_thrift', 'VALID_matlab', 'VALID_yacc', 'VALID_zig', 'VALID_xslt', 'VALID_json', 'VALID_yaml', 'VALID_gh_issues', 'VALID_gh_commits', 'VALID_notebook_scripts', 'VALID_notebook_structured', 'VALID_all_sources_weighted'] +[ip-26-0-150-122:0]: valid_weighted_split_paths ...................... [['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document'], 
['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document'], 
['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document'], 
['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document'], ['/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document', 
'/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document', 
'/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document', '/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document']] +[ip-26-0-150-122:0]: valid_weighted_split_paths_path ................. 
None +[ip-26-0-150-122:0]: valid_weighted_split_splits ..................... [['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999'], ['0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999', '0.969:0.999']] +[ip-26-0-150-122:0]: valid_weighted_split_weights .................... 
[['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['1'], ['3.0', '0.01', '53.89', '1.78', '0.85', '5.68', '0.01', '1.31', '0.98', '0.08', '0.03', '0.09', '1.12', '23.78', '0.7', '0.61', '0.26', '1.68', '2.23', '0.3', '0.31', '0.45', '0.12', '6.81', '9.11', '0.06', '44.66', '0.58', '2.23', '0.01', '1.25', '1.03', '1.31', '2.87', '0.01', '0.05', '3.32', '0.03', '0.19', '0.39', '5.2', '0.02', '1.56', '0.01', '0.07', '0.41', '3.66', '0.56', '0.03', '0.001', '0.23', '0.02', '0.01', '4.69', '0.35', '0.33', '0.01', '3.09', '0.46', '0.2', '0.05', '0.04', '11.09', '0.4', '0.3', '0.42', '48.92', '0.64', '1.4', '0.71', '0.91', '29.36', '86.94', '64.71', '74.93', '60.89', '60.4', '26.52', '0.001', '1.42', '0.94', '0.01', '0.0002', '0.11', '0.18', '0.05', '1.0', '1.0', '54.4', '32.0', '7.12', '6.0']] +[ip-26-0-150-122:0]: virtual_pipeline_model_parallel_size ............ None +[ip-26-0-150-122:0]: vision_backbone_type ............................ vit +[ip-26-0-150-122:0]: vision_pretraining .............................. False +[ip-26-0-150-122:0]: vision_pretraining_type ......................... classify +[ip-26-0-150-122:0]: vocab_extra_ids ................................. 0 +[ip-26-0-150-122:0]: vocab_file ...................................... None +[ip-26-0-150-122:0]: wandb_entity_name ............................... loubnabnl +[ip-26-0-150-122:0]: wandb_project_name .............................. 1b-model +[ip-26-0-150-122:0]: weight_decay .................................... 0.1 +[ip-26-0-150-122:0]: weight_decay_incr_style ......................... constant +[ip-26-0-150-122:0]: world_size ...................................... 64 +[ip-26-0-150-122:0]:-------------------- end of arguments --------------------- +[ip-26-0-150-122:0]:setting number of micro-batches to constant 1 +[ip-26-0-150-122:0]:> building TokenizerFromFile tokenizer ... +[ip-26-0-150-122:0]: > padded vocab (size: 49152) with 0 dummy tokens (new size: 49152) +[ip-26-0-150-122:0]:> initializing torch distributed ... +[ip-26-0-155-69:7]:> setting tensorboard ... +[ip-26-0-150-122:0]:> initializing tensor model parallel with size 1 +[ip-26-0-150-122:0]:> initializing pipeline model parallel with size 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:07,667 [Rank 0]: > setting random seeds to 1234 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:07,669 [Rank 0]: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +[ip-26-0-150-122:0]:2023-06-21 17:27:07,669 [Rank 0]: > compiling dataset index builder ... +[ip-26-0-150-122:0]:make: Entering directory '/fsx/loubna/code/Megatron-LM/megatron/data' +[ip-26-0-150-122:0]:make: Nothing to be done for 'default'. +[ip-26-0-150-122:0]:make: Leaving directory '/fsx/loubna/code/Megatron-LM/megatron/data' +[ip-26-0-150-122:0]:2023-06-21 17:27:07,730 [Rank 0]: >>> done with dataset index builder. 
Compilation time: 0.061 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:07,730 [Rank 0]: > compiling and loading fused kernels ... +[ip-26-0-150-122:0]:2023-06-21 17:27:07,843 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:07,843 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:07,848 [Rank 0]: Building extension module scaled_upper_triang_masked_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:07,848 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:07,966 [Rank 0]: Loading extension module scaled_upper_triang_masked_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,063 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:08,063 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,066 [Rank 0]: Building extension module scaled_masked_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,066 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:08,184 [Rank 0]: Loading extension module scaled_masked_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,283 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:08,283 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,285 [Rank 0]: Building extension module scaled_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,285 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:08,400 [Rank 0]: Loading extension module scaled_softmax_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,523 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:08,523 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,525 [Rank 0]: Building extension module fused_mix_prec_layer_norm_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,525 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:08,639 [Rank 0]: Loading extension module fused_mix_prec_layer_norm_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,734 [Rank 0]: Detected CUDA files, patching ldflags +[ip-26-0-150-122:0]:2023-06-21 17:27:08,734 [Rank 0]: Emitting ninja build file /fsx/loubna/code/Megatron-LM/megatron/fused_kernels/build/build.ninja... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,736 [Rank 0]: Building extension module fused_dense_cuda... +[ip-26-0-150-122:0]:2023-06-21 17:27:08,736 [Rank 0]: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[ip-26-0-150-122:0]:ninja: no work to do. +[ip-26-0-150-122:0]:2023-06-21 17:27:08,853 [Rank 0]: Loading extension module fused_dense_cuda... 
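Note on the fused-kernel lines above: Megatron JIT-builds its fused softmax, layer-norm and dense CUDA extensions through PyTorch's C++ extension loader, and the repeated "ninja: no work to do." lines show that the build cache under megatron/fused_kernels/build was already warm on this rank, so each module is only loaded rather than recompiled. A minimal sketch of that kind of JIT build follows; the source file names and compiler flags are illustrative assumptions, not Megatron's exact helper.

    # Minimal sketch of JIT-building one fused CUDA kernel with PyTorch's C++
    # extension loader, as in the log above. File names and flags are assumptions.
    from torch.utils import cpp_extension

    scaled_masked_softmax_cuda = cpp_extension.load(
        name="scaled_masked_softmax_cuda",
        sources=[  # assumed source layout for illustration
            "megatron/fused_kernels/scaled_masked_softmax.cpp",
            "megatron/fused_kernels/scaled_masked_softmax_cuda.cu",
        ],
        build_directory="megatron/fused_kernels/build",  # matches the build.ninja path in the log
        extra_cuda_cflags=["-O3", "--use_fast_math"],     # illustrative flags
        verbose=True,
    )
    # On a warm cache ninja reports "no work to do" and the module is simply loaded.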
+[ip-26-0-150-122:0]:2023-06-21 17:27:20,633 [Rank 0]: >>> done with compiling and loading fused kernels. Compilation time: 12.903 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:22,218 [Rank 0]: time to initialize megatron (seconds): 20.911 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,220 [Rank 0]: [after megatron is initialized] datetime: 2023-06-21 17:27:22 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,220 [Rank 0]: building GPT model ... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,697 [Rank 0]: > number of parameters on (tensor, pipeline) model parallel rank (0, 0): 1137207296 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,894 [Rank 0]: > learning rate decay style: cosine +[ip-26-0-150-122:0]:2023-06-21 17:27:22,908 [Rank 0]: WARNING: could not find the metadata file /fsx/bigcode/experiments/pretraining/1b-starcoder/latest_checkpointed_iteration.txt +[ip-26-0-150-122:0]:2023-06-21 17:27:22,909 [Rank 0]: will not load any checkpoints and will start from random +[ip-26-0-155-69:7]:2023-06-21 17:27:22,912 [Rank 63]: time (ms) | load-checkpoint: 5.72 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-06-21 17:27:22 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: > building train, validation, and test datasets ... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: > datasets target sizes (minimum size): +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: train: 9600000 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: validation: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: test: 128 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: > building train, validation, and test datasets for GPT ... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,912 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: > finished creating indexed dataset in 0.017856 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: number of documents: 2721616 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,930 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:22,931 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:22,931 [Rank 0]: document indices in [0, 2637246) total of 2637246 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:22,982 [Rank 0]: > Tokens per epoch: 4672499910 +[ip-26-0-150-122:0]:2023-06-21 17:27:22,983 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:22,983 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:23,104 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.121029 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2637246 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 570373 +[ip-26-0-150-122:0]:2023-06-21 17:27:23,137 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.032361 +[ip-26-0-150-122:0]:2023-06-21 17:27:23,137 [Rank 0]: > building shuffle index with split [0, 570373) and [570373, 570373) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:23,156 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.018469 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,339 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_TRAIN_indexmap_37739ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,343 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_TRAIN_indexmap_37739ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,345 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_TRAIN_indexmap_37739ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,346 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:26,346 [Rank 0]: total number of samples: 570374 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,346 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,430 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,430 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,430 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: > finished creating indexed dataset in 0.000770 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: number of documents: 968 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: document indices in [0, 938) total of 938 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:26,431 [Rank 0]: > Tokens per epoch: 3695701 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,432 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
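Reading the per-language blocks: each one reports tokens per epoch, the 8192-token sequence length, and a derived sample count, and that count is simply the integer number of 8192-token windows the epoch provides (4672499910 // 8192 = 570373 for the first block above). A quick cross-check, with the caveat that Megatron's exact index-building formula may handle edge cases slightly differently:

    # Cross-check of the "total number of samples" figures in these blocks, for the
    # single-epoch case shown in this log. Numbers are copied from the log itself.
    def samples_per_epoch(tokens_per_epoch: int, seq_len: int = 8192) -> int:
        # one training sample consumes one seq_len-token window of the document stream
        return tokens_per_epoch // seq_len

    assert samples_per_epoch(4_672_499_910) == 570_373   # first block above
    print(samples_per_epoch(3_695_701))                  # 451, matching the 968-document block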
+[ip-26-0-150-122:0]:2023-06-21 17:27:26,432 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:26,436 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003263 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 938 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 451 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,438 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002331 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,438 [Rank 0]: > building shuffle index with split [0, 451) and [451, 451) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,440 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001790 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,470 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,475 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,479 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:26,482 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:26,482 [Rank 0]: total number of samples: 452 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,482 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,565 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,579 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: > finished creating indexed dataset in 0.014644 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: number of documents: 8536791 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:26,580 [Rank 0]: document indices in [0, 8272150) total of 8272150 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:26,680 [Rank 0]: > Tokens per epoch: 19732817127 +[ip-26-0-150-122:0]:2023-06-21 17:27:26,681 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:26,681 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:27,074 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.392686 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8272150 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2408791 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,203 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.128198 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,203 [Rank 0]: > building shuffle index with split [0, 2408791) and [2408791, 2408791) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,271 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.068081 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,272 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_TRAIN_indexmap_677919ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,296 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_TRAIN_indexmap_677919ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,311 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_TRAIN_indexmap_677919ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,315 [Rank 0]: loaded indexed file in 0.043 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,315 [Rank 0]: total number of samples: 2408792 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,315 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,400 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: > finished creating indexed dataset in 0.001769 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,402 [Rank 0]: number of documents: 158792 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,403 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,403 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,403 [Rank 0]: document indices in [0, 153869) total of 153869 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:27,404 [Rank 0]: > Tokens per epoch: 654520539 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,405 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:27,405 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:27,413 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007231 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 153869 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 79897 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,418 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005225 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,418 [Rank 0]: > building shuffle index with split [0, 79897) and [79897, 79897) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,422 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004096 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,499 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_TRAIN_indexmap_22392ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,508 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_TRAIN_indexmap_22392ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,509 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_TRAIN_indexmap_22392ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,509 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,509 [Rank 0]: total number of samples: 79898 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,509 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,592 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,604 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,604 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,604 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: > finished creating indexed dataset in 0.012752 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: number of documents: 153194 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,605 [Rank 0]: document indices in [0, 148445) total of 148445 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:27,607 [Rank 0]: > Tokens per epoch: 277062287 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,608 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:27,608 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:27,616 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007522 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 148445 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 33821 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,619 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003392 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,619 [Rank 0]: > building shuffle index with split [0, 33821) and [33821, 33821) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,623 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003356 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,664 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_TRAIN_indexmap_10693ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,673 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_TRAIN_indexmap_10693ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,673 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_TRAIN_indexmap_10693ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,674 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,674 [Rank 0]: total number of samples: 33822 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,674 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,757 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,769 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,769 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,769 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,769 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: > finished creating indexed dataset in 0.012699 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: number of documents: 2239354 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:27,770 [Rank 0]: document indices in [0, 2169934) total of 2169934 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:27,816 [Rank 0]: > Tokens per epoch: 1397148734 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,818 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:27,818 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:27,911 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.093259 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2169934 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 170550 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,925 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.013573 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,925 [Rank 0]: > building shuffle index with split [0, 170550) and [170550, 170550) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:27,931 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.006101 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,932 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_TRAIN_indexmap_71453ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,949 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_TRAIN_indexmap_71453ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,952 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_TRAIN_indexmap_71453ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:27,954 [Rank 0]: loaded indexed file in 0.022 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:27,954 [Rank 0]: total number of samples: 170551 +[ip-26-0-150-122:0]:2023-06-21 17:27:27,954 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,037 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,046 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,046 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,046 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,046 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: > finished creating indexed dataset in 0.009268 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: number of documents: 523 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: document indices in [0, 507) total of 507 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,047 [Rank 0]: > Tokens per epoch: 1923547 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,048 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
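The cached index maps loaded throughout this section follow a consistent naming pattern, visible in the kotlin paths just above: {tokenized-prefix}_{split}_indexmap_{samples}ns_{seqlen}sl_{seed}s_{doc|sample|shuffle}_idx.npy. The small helper below reproduces the observed convention; treat it as a description of these logged paths rather than the authoritative Megatron implementation.

    # Reconstructs the index-map cache paths seen in this log (observed convention only).
    def indexmap_path(data_prefix: str, split: str, num_samples: int,
                      seq_len: int, seed: int, kind: str) -> str:
        return (f"{data_prefix}_{split}_indexmap_"
                f"{num_samples}ns_{seq_len}sl_{seed}s_{kind}_idx.npy")

    print(indexmap_path(
        "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/"
        "gpt2-preprocessed_content_document",
        "TRAIN", 71453, 8192, 1234, "doc"))
    # .../kotlin/gpt2-preprocessed_content_document_TRAIN_indexmap_71453ns_8192sl_1234s_doc_idx.npy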
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,048 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,051 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002938 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 507 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 234 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,054 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003389 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,054 [Rank 0]: > building shuffle index with split [0, 234) and [234, 234) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,057 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002384 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,060 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,064 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,065 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,067 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,067 [Rank 0]: total number of samples: 235 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,067 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,151 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,166 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,166 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,166 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,166 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: > finished creating indexed dataset in 0.015570 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: number of documents: 295364 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,167 [Rank 0]: document indices in [0, 286208) total of 286208 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,169 [Rank 0]: > Tokens per epoch: 465259290 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,170 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,170 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,183 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.012969 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 286208 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 56794 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,188 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004081 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,188 [Rank 0]: > building shuffle index with split [0, 56794) and [56794, 56794) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,191 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003397 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,213 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,220 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,226 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,226 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,226 [Rank 0]: total number of samples: 56795 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,226 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,310 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,322 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,322 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,322 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,322 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: > finished creating indexed dataset in 0.013181 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: number of documents: 210816 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,323 [Rank 0]: document indices in [0, 204281) total of 204281 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,325 [Rank 0]: > Tokens per epoch: 280134685 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,326 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,326 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,336 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.009267 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 204281 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 34196 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,339 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003559 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,339 [Rank 0]: > building shuffle index with split [0, 34196) and [34196, 34196) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,342 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002761 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,375 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_TRAIN_indexmap_12329ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,386 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_TRAIN_indexmap_12329ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,386 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_TRAIN_indexmap_12329ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,387 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,387 [Rank 0]: total number of samples: 34197 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,387 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,470 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,473 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,473 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,473 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,473 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,474 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,474 [Rank 0]: > finished creating indexed dataset in 0.004061 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,474 [Rank 0]: number of documents: 5001 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,474 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,475 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,475 [Rank 0]: document indices in [0, 4846) total of 4846 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,475 [Rank 0]: > Tokens per epoch: 30040727 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,475 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,476 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,478 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002589 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4846 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3667 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,481 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002957 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,481 [Rank 0]: > building shuffle index with split [0, 3667) and [3667, 3667) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,483 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001854 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,493 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_TRAIN_indexmap_1007ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,497 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_TRAIN_indexmap_1007ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,498 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_TRAIN_indexmap_1007ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,500 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,500 [Rank 0]: total number of samples: 3668 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,500 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,583 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,592 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: > finished creating indexed dataset in 0.009055 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: number of documents: 8042 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: document indices in [0, 7793) total of 7793 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,593 [Rank 0]: > Tokens per epoch: 9515228 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,594 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,594 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,597 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002761 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7793 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1161 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,600 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002955 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,600 [Rank 0]: > building shuffle index with split [0, 1161) and [1161, 1161) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,602 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001982 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,607 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,611 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,611 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,612 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,612 [Rank 0]: total number of samples: 1162 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,612 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,695 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: > finished creating indexed dataset in 0.012909 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,708 [Rank 0]: number of documents: 16870 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: document indices in [0, 16347) total of 16347 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: > Tokens per epoch: 37114704 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,709 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,710 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,713 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003713 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16347 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4530 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,718 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004682 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,718 [Rank 0]: > building shuffle index with split [0, 4530) and [4530, 4530) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,720 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002166 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,746 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_TRAIN_indexmap_1133ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,751 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_TRAIN_indexmap_1133ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,752 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_TRAIN_indexmap_1133ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,754 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,754 [Rank 0]: total number of samples: 4531 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,754 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,838 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,855 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,855 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,855 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,855 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: > finished creating indexed dataset in 0.017919 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: number of documents: 267627 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:28,856 [Rank 0]: document indices in [0, 259331) total of 259331 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:28,858 [Rank 0]: > Tokens per epoch: 277947540 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,860 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:28,860 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:28,871 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.011573 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 259331 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 33929 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,875 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003671 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,875 [Rank 0]: > building shuffle index with split [0, 33929) and [33929, 33929) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:28,878 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002755 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,907 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_TRAIN_indexmap_14090ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,918 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_TRAIN_indexmap_14090ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,918 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_TRAIN_indexmap_14090ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:28,919 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:28,919 [Rank 0]: total number of samples: 33930 +[ip-26-0-150-122:0]:2023-06-21 17:27:28,919 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,002 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,016 [Rank 0]: > finished creating indexed dataset in 0.014051 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,017 [Rank 0]: number of documents: 4700526 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,017 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,017 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,017 [Rank 0]: document indices in [0, 4554810) total of 4554810 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,086 [Rank 0]: > Tokens per epoch: 8260498119 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,086 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,087 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,312 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.225591 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4554810 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1008361 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,365 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.052079 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,365 [Rank 0]: > building shuffle index with split [0, 1008361) and [1008361, 1008361) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,399 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.034506 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,400 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_TRAIN_indexmap_299145ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,419 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_TRAIN_indexmap_299145ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,421 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_TRAIN_indexmap_299145ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,424 [Rank 0]: loaded indexed file in 0.024 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,424 [Rank 0]: total number of samples: 1008362 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,424 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,506 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,507 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,507 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,507 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,507 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: > finished creating indexed dataset in 0.002917 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: number of documents: 98447 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,509 [Rank 0]: document indices in [0, 95395) total of 95395 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,510 [Rank 0]: > Tokens per epoch: 218848651 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,510 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
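One more pattern worth noting: the "...ns" sample targets baked into these TRAIN index-map filenames scale with the per-language weights printed in the weighted-split arguments near the top of this section (css 37739/3.0, c 677919/53.89, kotlin 71453/5.68, go 299145/23.78 all come out at roughly 12580), which suggests each language receives a weight-proportional share of the 9,600,000-sample training target. The proportionality is an inference from these numbers, not something the log states explicitly.

    # Ratios computed from values that appear verbatim in this log; the near-constant
    # result suggests weight-proportional sampling, but that reading is an inference.
    pairs = [("css", 37739, 3.0), ("c", 677919, 53.89),
             ("kotlin", 71453, 5.68), ("go", 299145, 23.78)]
    for name, ns, weight in pairs:
        print(f"{name:7s} ns/weight = {ns / weight:.1f}")   # ~12579.7 each time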
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,511 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,515 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004892 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 95395 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 26714 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,520 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004217 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,520 [Rank 0]: > building shuffle index with split [0, 26714) and [26714, 26714) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,523 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002582 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,555 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_TRAIN_indexmap_8806ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,563 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_TRAIN_indexmap_8806ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,564 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_TRAIN_indexmap_8806ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,564 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,564 [Rank 0]: total number of samples: 26715 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,564 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,648 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,660 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,660 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,660 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,660 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: > finished creating indexed dataset in 0.013133 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: number of documents: 124066 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,661 [Rank 0]: document indices in [0, 120220) total of 120220 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,662 [Rank 0]: > Tokens per epoch: 158541495 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,663 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,663 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,669 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006329 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 120220 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 19353 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,673 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004047 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,674 [Rank 0]: > building shuffle index with split [0, 19353) and [19353, 19353) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,676 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002883 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,677 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_7674ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,686 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_7674ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,686 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_7674ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,686 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,687 [Rank 0]: total number of samples: 19354 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,687 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,770 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,777 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,777 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: > finished creating indexed dataset in 0.008467 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: number of documents: 30934 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,778 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,779 [Rank 0]: document indices in [0, 29975) total of 29975 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,779 [Rank 0]: > Tokens per epoch: 67801957 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,780 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,780 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,783 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003628 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 29975 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 8276 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,788 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004615 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,788 [Rank 0]: > building shuffle index with split [0, 8276) and [8276, 8276) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,790 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002294 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,832 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_TRAIN_indexmap_3271ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,837 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_TRAIN_indexmap_3271ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,838 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_TRAIN_indexmap_3271ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,838 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,838 [Rank 0]: total number of samples: 8277 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,838 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,922 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,935 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,935 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,935 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,935 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: > finished creating indexed dataset in 0.013900 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: number of documents: 110981 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:29,936 [Rank 0]: document indices in [0, 107541) total of 107541 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:29,937 [Rank 0]: > Tokens per epoch: 664777580 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,938 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:29,938 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:29,945 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006143 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 107541 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 81149 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,949 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004054 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,949 [Rank 0]: > building shuffle index with split [0, 81149) and [81149, 81149) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:29,953 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003844 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,953 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_TRAIN_indexmap_21134ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,961 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_TRAIN_indexmap_21134ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,962 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_TRAIN_indexmap_21134ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:29,963 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:29,963 [Rank 0]: total number of samples: 81150 +[ip-26-0-150-122:0]:2023-06-21 17:27:29,963 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,046 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: > finished creating indexed dataset in 0.015842 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: number of documents: 365491 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,062 [Rank 0]: document indices in [0, 354161) total of 354161 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,065 [Rank 0]: > Tokens per epoch: 785360896 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,066 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,066 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,082 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.015095 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 354161 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 95869 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,087 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005352 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,087 [Rank 0]: > building shuffle index with split [0, 95869) and [95869, 95869) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,092 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005175 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,093 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,103 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,105 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,105 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,105 [Rank 0]: total number of samples: 95870 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,106 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,189 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,201 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,201 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,201 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,201 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,202 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: > finished creating indexed dataset in 0.013115 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: number of documents: 39042 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: document indices in [0, 37832) total of 37832 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,203 [Rank 0]: > Tokens per epoch: 101034661 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,204 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,204 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,208 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003872 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 37832 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 12333 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,211 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002852 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,211 [Rank 0]: > building shuffle index with split [0, 12333) and [12333, 12333) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,214 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002238 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,256 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,260 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,261 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,261 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,261 [Rank 0]: total number of samples: 12334 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,261 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,345 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,359 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,359 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,359 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,359 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: > finished creating indexed dataset in 0.015110 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: number of documents: 97167 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,360 [Rank 0]: document indices in [0, 94155) total of 94155 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,361 [Rank 0]: > Tokens per epoch: 97494653 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,363 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,363 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,369 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006694 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 94155 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 11901 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,375 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005147 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,375 [Rank 0]: > building shuffle index with split [0, 11901) and [11901, 11901) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,377 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001879 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,377 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_TRAIN_indexmap_3900ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,386 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_TRAIN_indexmap_3900ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,386 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_TRAIN_indexmap_3900ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,387 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,387 [Rank 0]: total number of samples: 11902 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,387 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,471 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,483 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,483 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,483 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: > finished creating indexed dataset in 0.012784 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: number of documents: 186375 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,484 [Rank 0]: document indices in [0, 180597) total of 180597 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,485 [Rank 0]: > Tokens per epoch: 146595317 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,486 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,486 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,495 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.008443 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 180597 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17894 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,499 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003904 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,499 [Rank 0]: > building shuffle index with split [0, 17894) and [17894, 17894) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,502 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002908 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,502 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_TRAIN_indexmap_5661ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,511 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_TRAIN_indexmap_5661ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,511 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_TRAIN_indexmap_5661ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,512 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,512 [Rank 0]: total number of samples: 17895 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,512 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,595 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,599 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,600 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,600 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,600 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: > finished creating indexed dataset in 0.005655 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: number of documents: 9226 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: document indices in [0, 8940) total of 8940 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,601 [Rank 0]: > Tokens per epoch: 51420995 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,602 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,602 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,605 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003138 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8940 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6276 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,608 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002635 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,608 [Rank 0]: > building shuffle index with split [0, 6276) and [6276, 6276) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,610 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001816 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,617 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_TRAIN_indexmap_1510ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,623 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_TRAIN_indexmap_1510ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,624 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_TRAIN_indexmap_1510ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,624 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,624 [Rank 0]: total number of samples: 6277 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,624 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,708 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,727 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,727 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: > finished creating indexed dataset in 0.019740 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: number of documents: 3390320 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:30,728 [Rank 0]: document indices in [0, 3285220) total of 3285220 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:30,786 [Rank 0]: > Tokens per epoch: 1939961305 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,787 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:30,787 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:30,932 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.144127 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3285220 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 236811 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,957 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.024971 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,957 [Rank 0]: > building shuffle index with split [0, 236811) and [236811, 236811) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:30,965 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.008261 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,966 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_TRAIN_indexmap_85668ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,984 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_TRAIN_indexmap_85668ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,986 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_TRAIN_indexmap_85668ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:30,988 [Rank 0]: loaded indexed file in 0.022 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:30,988 [Rank 0]: total number of samples: 236812 +[ip-26-0-150-122:0]:2023-06-21 17:27:30,988 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,070 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,086 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,086 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,086 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,086 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: > finished creating indexed dataset in 0.016757 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: number of documents: 1380468 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,087 [Rank 0]: document indices in [0, 1337673) total of 1337673 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:31,119 [Rank 0]: > Tokens per epoch: 2604422294 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,121 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:31,121 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:31,173 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.052313 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1337673 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 317922 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,185 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.011786 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,185 [Rank 0]: > building shuffle index with split [0, 317922) and [317922, 317922) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,196 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.010673 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,197 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_TRAIN_indexmap_114601ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,212 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_TRAIN_indexmap_114601ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,214 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_TRAIN_indexmap_114601ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,216 [Rank 0]: loaded indexed file in 0.019 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,216 [Rank 0]: total number of samples: 317923 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,216 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,299 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,309 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,309 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,309 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,309 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,310 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,310 [Rank 0]: > finished creating indexed dataset in 0.011284 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: number of documents: 5386 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: document indices in [0, 5219) total of 5219 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: > Tokens per epoch: 18878105 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,311 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:31,312 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:31,314 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002104 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5219 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2304 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,317 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002797 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,317 [Rank 0]: > building shuffle index with split [0, 2304) and [2304, 2304) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,319 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002167 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,327 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_TRAIN_indexmap_755ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,331 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_TRAIN_indexmap_755ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,333 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_TRAIN_indexmap_755ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:31,334 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,334 [Rank 0]: total number of samples: 2305 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,334 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,418 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,434 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,434 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: > finished creating indexed dataset in 0.017124 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: number of documents: 10801285 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:31,435 [Rank 0]: document indices in [0, 10466445) total of 10466445 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:31,562 [Rank 0]: > Tokens per epoch: 10146940270 +[ip-26-0-150-122:0]:2023-06-21 17:27:31,563 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:31,563 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,100 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.536307 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 10466445 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1238640 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,212 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.112106 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,212 [Rank 0]: > building shuffle index with split [0, 1238640) and [1238640, 1238640) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,251 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.038956 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,252 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_561808ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,279 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_561808ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,283 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_TRAIN_indexmap_561808ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,285 [Rank 0]: loaded indexed file in 0.033 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,285 [Rank 0]: total number of samples: 1238641 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,285 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,372 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,374 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,374 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: > finished creating indexed dataset in 0.002100 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: number of documents: 587748 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,375 [Rank 0]: document indices in [0, 569528) total of 569528 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,378 [Rank 0]: > Tokens per epoch: 191397544 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,380 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,380 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,403 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.022985 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 569528 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 23363 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,408 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004616 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,408 [Rank 0]: > building shuffle index with split [0, 23363) and [23363, 23363) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,411 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003384 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,430 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_TRAIN_indexmap_7297ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,443 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_TRAIN_indexmap_7297ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,443 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_TRAIN_indexmap_7297ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,444 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,444 [Rank 0]: total number of samples: 23364 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,444 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,527 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,536 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,537 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,537 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,537 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: > finished creating indexed dataset in 0.010590 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: number of documents: 541454 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,538 [Rank 0]: document indices in [0, 524669) total of 524669 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,542 [Rank 0]: > Tokens per epoch: 632376464 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,543 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,543 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,563 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.020094 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 524669 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 77194 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,569 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005697 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,569 [Rank 0]: > building shuffle index with split [0, 77194) and [77194, 77194) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,574 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004523 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,574 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,583 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,586 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_28053ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,587 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,587 [Rank 0]: total number of samples: 77195 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,587 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,668 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,675 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: > finished creating indexed dataset in 0.007905 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: number of documents: 1152 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,676 [Rank 0]: document indices in [0, 1116) total of 1116 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,677 [Rank 0]: > Tokens per epoch: 1580323 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,677 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,677 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,679 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.001840 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1116 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 192 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,683 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003268 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,683 [Rank 0]: > building shuffle index with split [0, 192) and [192, 192) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,685 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002161 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,724 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,732 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,735 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,735 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,736 [Rank 0]: total number of samples: 193 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,736 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,819 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,830 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,830 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,830 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,830 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: > finished creating indexed dataset in 0.012427 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: number of documents: 22653 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: document indices in [0, 21951) total of 21951 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,832 [Rank 0]: > Tokens per epoch: 493660881 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,834 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,834 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,838 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004048 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 21951 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 60261 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,842 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003935 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,842 [Rank 0]: > building shuffle index with split [0, 60261) and [60261, 60261) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,846 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004030 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,847 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_TRAIN_indexmap_15725ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,852 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_TRAIN_indexmap_15725ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,854 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_TRAIN_indexmap_15725ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,856 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,856 [Rank 0]: total number of samples: 60262 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,856 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,940 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,956 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,956 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,956 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,956 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: > finished creating indexed dataset in 0.017241 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: number of documents: 158356 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:32,957 [Rank 0]: document indices in [0, 153447) total of 153447 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:32,958 [Rank 0]: > Tokens per epoch: 324030434 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,959 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:32,959 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:32,967 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007948 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 153447 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 39554 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,972 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004329 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,972 [Rank 0]: > building shuffle index with split [0, 39554) and [39554, 39554) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:32,975 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002937 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,975 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12958ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,981 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12958ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,982 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12958ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:32,983 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:32,983 [Rank 0]: total number of samples: 39555 +[ip-26-0-150-122:0]:2023-06-21 17:27:32,983 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,066 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,082 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,082 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,082 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,082 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: > finished creating indexed dataset in 0.016469 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: number of documents: 657349 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,083 [Rank 0]: document indices in [0, 636971) total of 636971 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,087 [Rank 0]: > Tokens per epoch: 483958770 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,089 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,089 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,113 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.024414 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 636971 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 59076 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,118 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004994 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,119 [Rank 0]: > building shuffle index with split [0, 59076) and [59076, 59076) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,122 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003517 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,125 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,137 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,138 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_TRAIN_indexmap_16480ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,138 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,138 [Rank 0]: total number of samples: 59077 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,138 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,223 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,232 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,232 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: > finished creating indexed dataset in 0.009945 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: number of documents: 549459 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,233 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,234 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,234 [Rank 0]: document indices in [0, 532426) total of 532426 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,237 [Rank 0]: > Tokens per epoch: 991398359 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,238 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,238 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,259 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.020328 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 532426 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 121020 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,265 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005836 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,265 [Rank 0]: > building shuffle index with split [0, 121020) and [121020, 121020) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,270 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005562 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,297 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_TRAIN_indexmap_36104ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,313 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_TRAIN_indexmap_36104ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,321 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_TRAIN_indexmap_36104ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,321 [Rank 0]: loaded indexed file in 0.024 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,322 [Rank 0]: total number of samples: 121021 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,322 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,403 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: > finished creating indexed dataset in 0.000715 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,404 [Rank 0]: number of documents: 1133 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,405 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,405 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,405 [Rank 0]: document indices in [0, 1098) total of 1098 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,405 [Rank 0]: > Tokens per epoch: 1211172 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,406 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,406 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,408 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002220 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1098 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 147 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,410 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002143 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,410 [Rank 0]: > building shuffle index with split [0, 147) and [147, 147) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,413 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002741 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,445 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,452 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,452 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,452 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,452 [Rank 0]: total number of samples: 148 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,453 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,536 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,542 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,542 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,542 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,542 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: > finished creating indexed dataset in 0.006200 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: number of documents: 6104 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: document indices in [0, 5915) total of 5915 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,543 [Rank 0]: > Tokens per epoch: 16061021 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,544 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,544 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,547 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002938 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5915 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1960 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,549 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002421 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,550 [Rank 0]: > building shuffle index with split [0, 1960) and [1960, 1960) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,553 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003209 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,553 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,557 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,558 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,558 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,558 [Rank 0]: total number of samples: 1961 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,558 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,642 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,660 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,660 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,660 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,660 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,661 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: > finished creating indexed dataset in 0.019252 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: number of documents: 896880 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,662 [Rank 0]: document indices in [0, 869077) total of 869077 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,667 [Rank 0]: > Tokens per epoch: 1011350209 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,668 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,668 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,702 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.033750 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 869077 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 123455 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,710 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.007142 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,710 [Rank 0]: > building shuffle index with split [0, 123455) and [123455, 123455) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,718 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.008262 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,718 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_TRAIN_indexmap_41765ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,732 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_TRAIN_indexmap_41765ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,733 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_TRAIN_indexmap_41765ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,733 [Rank 0]: loaded indexed file in 0.015 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,733 [Rank 0]: total number of samples: 123456 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,734 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,816 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,821 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: > finished creating indexed dataset in 0.005900 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,822 [Rank 0]: number of documents: 3688 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,823 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,823 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,823 [Rank 0]: document indices in [0, 3574) total of 3574 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,823 [Rank 0]: > Tokens per epoch: 7491397 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,824 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,824 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,826 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002629 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3574 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 914 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,829 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002932 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,829 [Rank 0]: > building shuffle index with split [0, 914) and [914, 914) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,832 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002666 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,881 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,885 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,886 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:33,886 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,886 [Rank 0]: total number of samples: 915 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,886 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,970 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,981 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,981 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,981 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,981 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: > finished creating indexed dataset in 0.011794 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: number of documents: 19630 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:33,982 [Rank 0]: document indices in [0, 19021) total of 19021 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:33,983 [Rank 0]: > Tokens per epoch: 64556260 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,984 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:33,984 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:33,987 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003319 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 19021 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7880 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,990 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002864 +[ip-26-0-150-122:0]:2023-06-21 17:27:33,990 [Rank 0]: > building shuffle index with split [0, 7880) and [7880, 7880) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:33,993 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002328 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,032 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_TRAIN_indexmap_2391ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,036 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_TRAIN_indexmap_2391ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,037 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_TRAIN_indexmap_2391ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,040 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,040 [Rank 0]: total number of samples: 7881 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,040 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,124 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,133 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,133 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,133 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,133 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,134 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: > finished creating indexed dataset in 0.010661 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: number of documents: 46270 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: document indices in [0, 44836) total of 44836 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,135 [Rank 0]: > Tokens per epoch: 145587797 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,136 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,136 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,140 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004174 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 44836 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17771 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,144 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003956 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,144 [Rank 0]: > building shuffle index with split [0, 17771) and [17771, 17771) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,147 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002519 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,187 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_TRAIN_indexmap_4907ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,192 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_TRAIN_indexmap_4907ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,192 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_TRAIN_indexmap_4907ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,193 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,193 [Rank 0]: total number of samples: 17772 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,193 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,277 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: > finished creating indexed dataset in 0.018105 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: number of documents: 522778 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,295 [Rank 0]: document indices in [0, 506572) total of 506572 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,299 [Rank 0]: > Tokens per epoch: 1833973827 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,301 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,301 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,321 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.020087 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 506572 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 223873 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,329 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.007429 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,329 [Rank 0]: > building shuffle index with split [0, 223873) and [223873, 223873) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,337 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.007948 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,337 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_TRAIN_indexmap_65415ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,344 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_TRAIN_indexmap_65415ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,345 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_TRAIN_indexmap_65415ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,346 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,346 [Rank 0]: total number of samples: 223874 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,346 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,430 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,441 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,441 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,441 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,441 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: > finished creating indexed dataset in 0.011031 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: number of documents: 10289 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: document indices in [0, 9970) total of 9970 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,442 [Rank 0]: > Tokens per epoch: 7959007 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,443 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,444 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,446 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002748 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 9970 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 971 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,449 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002232 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,449 [Rank 0]: > building shuffle index with split [0, 971) and [971, 971) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,450 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001675 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,465 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,471 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,479 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,479 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,479 [Rank 0]: total number of samples: 972 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,479 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,563 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,577 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,577 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,577 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,577 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: > finished creating indexed dataset in 0.014680 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: number of documents: 247919 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,578 [Rank 0]: document indices in [0, 240234) total of 240234 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,580 [Rank 0]: > Tokens per epoch: 774529956 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,581 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,581 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,592 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.011126 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 240234 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 94547 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,597 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004907 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,598 [Rank 0]: > building shuffle index with split [0, 94547) and [94547, 94547) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,602 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004326 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,624 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_TRAIN_indexmap_19625ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,635 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_TRAIN_indexmap_19625ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,635 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_TRAIN_indexmap_19625ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,636 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,636 [Rank 0]: total number of samples: 94548 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,636 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,720 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,727 [Rank 0]: > finished creating indexed dataset in 0.007609 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: number of documents: 5368 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: document indices in [0, 5202) total of 5202 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,728 [Rank 0]: > Tokens per epoch: 3049652 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,729 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,729 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,732 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002340 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5202 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 372 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,734 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002822 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,735 [Rank 0]: > building shuffle index with split [0, 372) and [372, 372) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,737 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002442 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,788 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,792 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,792 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,794 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,794 [Rank 0]: total number of samples: 373 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,794 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,878 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,889 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,889 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,889 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,889 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: > finished creating indexed dataset in 0.011380 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: number of documents: 17554 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: document indices in [0, 17010) total of 17010 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:34,890 [Rank 0]: > Tokens per epoch: 31798875 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,892 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:34,892 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:34,895 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003489 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17010 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3881 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,898 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002290 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,898 [Rank 0]: > building shuffle index with split [0, 3881) and [3881, 3881) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:34,900 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002261 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,917 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_TRAIN_indexmap_881ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,923 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_TRAIN_indexmap_881ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,926 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_TRAIN_indexmap_881ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:34,928 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:34,928 [Rank 0]: total number of samples: 3882 +[ip-26-0-150-122:0]:2023-06-21 17:27:34,928 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,012 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,023 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,023 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,023 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,023 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: > finished creating indexed dataset in 0.011803 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: number of documents: 52838 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,024 [Rank 0]: document indices in [0, 51200) total of 51200 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,025 [Rank 0]: > Tokens per epoch: 122908675 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,025 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,026 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,030 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004827 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 51200 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 15003 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,034 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003850 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,034 [Rank 0]: > building shuffle index with split [0, 15003) and [15003, 15003) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,038 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003031 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,038 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_5158ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,045 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_5158ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,046 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_5158ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,046 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,046 [Rank 0]: total number of samples: 15004 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,046 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,130 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,145 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,145 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,145 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: > finished creating indexed dataset in 0.016123 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: number of documents: 928415 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,146 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,147 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,147 [Rank 0]: document indices in [0, 899634) total of 899634 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,152 [Rank 0]: > Tokens per epoch: 909176364 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,153 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,153 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,186 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.032443 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 899634 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 110983 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,193 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006793 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,193 [Rank 0]: > building shuffle index with split [0, 110983) and [110983, 110983) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,199 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005465 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,199 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_TRAIN_indexmap_46042ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,213 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_TRAIN_indexmap_46042ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,220 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_TRAIN_indexmap_46042ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,220 [Rank 0]: loaded indexed file in 0.021 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,221 [Rank 0]: total number of samples: 110984 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,221 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,304 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,313 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,313 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,313 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,313 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: > finished creating indexed dataset in 0.010065 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: number of documents: 58151 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,314 [Rank 0]: document indices in [0, 56348) total of 56348 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,315 [Rank 0]: > Tokens per epoch: 185296479 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,316 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,316 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,320 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004477 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 56348 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 22619 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,323 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002550 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,323 [Rank 0]: > building shuffle index with split [0, 22619) and [22619, 22619) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,326 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003331 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,367 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_TRAIN_indexmap_7045ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,373 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_TRAIN_indexmap_7045ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,373 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_TRAIN_indexmap_7045ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,375 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,375 [Rank 0]: total number of samples: 22620 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,375 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,460 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,470 [Rank 0]: > finished creating indexed dataset in 0.010700 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: number of documents: 5928 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: document indices in [0, 5744) total of 5744 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: > Tokens per epoch: 10076335 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,471 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,472 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,475 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003244 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5744 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1230 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,478 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003426 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,479 [Rank 0]: > building shuffle index with split [0, 1230) and [1230, 1230) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,481 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002004 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,483 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,487 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,487 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_TRAIN_indexmap_378ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,488 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,488 [Rank 0]: total number of samples: 1231 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,488 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,572 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,582 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,582 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,582 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,582 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: > finished creating indexed dataset in 0.011400 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: number of documents: 180 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: document indices in [0, 174) total of 174 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,584 [Rank 0]: > Tokens per epoch: 173017 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,585 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,585 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,588 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002756 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 174 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 21 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,591 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002442 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,591 [Rank 0]: > building shuffle index with split [0, 21) and [21, 21) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,592 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001762 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,597 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,601 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,601 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,602 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,602 [Rank 0]: total number of samples: 22 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,602 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,686 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,706 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,706 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,706 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,706 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: > finished creating indexed dataset in 0.020667 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: number of documents: 239568 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,707 [Rank 0]: document indices in [0, 232141) total of 232141 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,709 [Rank 0]: > Tokens per epoch: 91736699 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,709 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,709 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,720 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.010457 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 232141 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 11198 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,723 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003149 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,723 [Rank 0]: > building shuffle index with split [0, 11198) and [11198, 11198) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,725 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001997 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,726 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_TRAIN_indexmap_2894ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,736 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_TRAIN_indexmap_2894ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,737 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_TRAIN_indexmap_2894ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,737 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,737 [Rank 0]: total number of samples: 11199 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,737 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,821 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,829 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,829 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,829 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,829 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: > finished creating indexed dataset in 0.009548 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: number of documents: 4806 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: document indices in [0, 4657) total of 4657 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,831 [Rank 0]: > Tokens per epoch: 6417550 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,832 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,833 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,872 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.039575 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4657 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 783 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,876 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003780 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,876 [Rank 0]: > building shuffle index with split [0, 783) and [783, 783) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,883 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.006587 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,883 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,888 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,891 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_TRAIN_indexmap_252ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:35,893 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,893 [Rank 0]: total number of samples: 784 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,893 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,978 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,988 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,988 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,988 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,988 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: > finished creating indexed dataset in 0.011637 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: number of documents: 5429 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:35,990 [Rank 0]: document indices in [0, 5261) total of 5261 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:35,991 [Rank 0]: > Tokens per epoch: 5171243 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,992 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:35,992 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:35,995 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003284 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5261 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 631 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,999 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003735 +[ip-26-0-150-122:0]:2023-06-21 17:27:35,999 [Rank 0]: > building shuffle index with split [0, 631) and [631, 631) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,002 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002413 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,002 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,008 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,009 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,009 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,009 [Rank 0]: total number of samples: 632 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,009 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,093 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,109 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,109 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,109 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,109 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: > finished creating indexed dataset in 0.016389 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: number of documents: 1355788 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,110 [Rank 0]: document indices in [0, 1313759) total of 1313759 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,136 [Rank 0]: > Tokens per epoch: 1259346636 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,137 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,137 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,188 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.050271 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1313759 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 153728 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,198 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.009984 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,198 [Rank 0]: > building shuffle index with split [0, 153728) and [153728, 153728) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,204 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.006081 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,205 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_TRAIN_indexmap_58999ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,221 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_TRAIN_indexmap_58999ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,222 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_TRAIN_indexmap_58999ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,223 [Rank 0]: loaded indexed file in 0.018 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,223 [Rank 0]: total number of samples: 153729 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,223 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,306 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,317 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,317 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,317 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,317 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: > finished creating indexed dataset in 0.011732 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: number of documents: 49335 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,318 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,319 [Rank 0]: document indices in [0, 47806) total of 47806 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,319 [Rank 0]: > Tokens per epoch: 118964691 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,320 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,320 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,324 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004637 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 47806 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 14522 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,328 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003610 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,328 [Rank 0]: > building shuffle index with split [0, 14522) and [14522, 14522) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,333 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004287 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,333 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_TRAIN_indexmap_4403ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,341 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_TRAIN_indexmap_4403ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,341 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_TRAIN_indexmap_4403ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,341 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,342 [Rank 0]: total number of samples: 14523 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,342 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,425 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,432 [Rank 0]: > finished creating indexed dataset in 0.006905 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: number of documents: 24208 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: document indices in [0, 23458) total of 23458 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,433 [Rank 0]: > Tokens per epoch: 211084584 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,434 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,434 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,437 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003057 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 23458 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 25767 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,440 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003007 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,440 [Rank 0]: > building shuffle index with split [0, 25767) and [25767, 25767) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,443 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003457 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,448 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_TRAIN_indexmap_4152ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,453 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_TRAIN_indexmap_4152ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,454 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_TRAIN_indexmap_4152ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,455 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,455 [Rank 0]: total number of samples: 25768 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,455 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,539 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,547 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,547 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,547 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: > finished creating indexed dataset in 0.008639 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: number of documents: 4737 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: document indices in [0, 4590) total of 4590 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,548 [Rank 0]: > Tokens per epoch: 2509212 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,549 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,550 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,552 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002849 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4590 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 306 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,555 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002691 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,555 [Rank 0]: > building shuffle index with split [0, 306) and [306, 306) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,557 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002040 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,562 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,566 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,567 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,567 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,567 [Rank 0]: total number of samples: 307 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,567 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,652 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,670 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: > finished creating indexed dataset in 0.018015 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: number of documents: 2206327 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,671 [Rank 0]: document indices in [0, 2137931) total of 2137931 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,715 [Rank 0]: > Tokens per epoch: 1047952508 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,718 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,718 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,806 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.088241 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2137931 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 127923 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,821 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.014389 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,821 [Rank 0]: > building shuffle index with split [0, 127923) and [127923, 127923) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,827 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005672 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,827 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_TRAIN_indexmap_38872ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,844 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_TRAIN_indexmap_38872ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,852 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_TRAIN_indexmap_38872ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,852 [Rank 0]: loaded indexed file in 0.025 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,852 [Rank 0]: total number of samples: 127924 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,852 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,936 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,948 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,948 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,948 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,948 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,949 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,949 [Rank 0]: > finished creating indexed dataset in 0.013536 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,949 [Rank 0]: number of documents: 125163 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,950 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,950 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:36,950 [Rank 0]: document indices in [0, 121283) total of 121283 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:36,951 [Rank 0]: > Tokens per epoch: 130456741 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,952 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:36,952 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:36,959 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006978 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 121283 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 15924 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,963 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004428 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,963 [Rank 0]: > building shuffle index with split [0, 15924) and [15924, 15924) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:36,967 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003343 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,967 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_TRAIN_indexmap_5787ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,976 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_TRAIN_indexmap_5787ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,977 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_TRAIN_indexmap_5787ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:36,977 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:36,977 [Rank 0]: total number of samples: 15925 +[ip-26-0-150-122:0]:2023-06-21 17:27:36,977 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,061 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,070 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,070 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,070 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,070 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: > finished creating indexed dataset in 0.009922 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: number of documents: 41890 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,071 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,072 [Rank 0]: document indices in [0, 40591) total of 40591 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,072 [Rank 0]: > Tokens per epoch: 63430707 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,073 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,073 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,078 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004754 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 40591 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7743 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,081 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003213 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,081 [Rank 0]: > building shuffle index with split [0, 7743) and [7743, 7743) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,084 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002101 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,084 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_TRAIN_indexmap_2516ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,089 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_TRAIN_indexmap_2516ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,090 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_TRAIN_indexmap_2516ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,090 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,090 [Rank 0]: total number of samples: 7744 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,090 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,174 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,183 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,183 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,183 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,183 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: > finished creating indexed dataset in 0.009798 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: number of documents: 7917 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,184 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,185 [Rank 0]: document indices in [0, 7672) total of 7672 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,185 [Rank 0]: > Tokens per epoch: 16598658 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,186 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,186 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,190 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003758 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7672 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2026 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,194 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003965 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,194 [Rank 0]: > building shuffle index with split [0, 2026) and [2026, 2026) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,196 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002376 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,197 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,202 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,203 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,203 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,203 [Rank 0]: total number of samples: 2027 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,203 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,287 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,298 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,298 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,298 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,298 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: > finished creating indexed dataset in 0.011681 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: number of documents: 13716 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: document indices in [0, 13291) total of 13291 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,299 [Rank 0]: > Tokens per epoch: 15425176 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,300 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,300 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,303 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003304 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 13291 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1882 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,308 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004424 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,308 [Rank 0]: > building shuffle index with split [0, 1882) and [1882, 1882) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,310 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002266 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,351 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_TRAIN_indexmap_504ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,356 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_TRAIN_indexmap_504ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,357 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_TRAIN_indexmap_504ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,357 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,357 [Rank 0]: total number of samples: 1883 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,357 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,441 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,460 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,460 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,460 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,460 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: > finished creating indexed dataset in 0.019475 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: number of documents: 975420 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,461 [Rank 0]: document indices in [0, 945182) total of 945182 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,467 [Rank 0]: > Tokens per epoch: 5267734886 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,469 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,469 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,503 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.033926 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 945182 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 643034 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,518 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.015037 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,518 [Rank 0]: > building shuffle index with split [0, 643034) and [643034, 643034) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,538 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.019418 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,538 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_TRAIN_indexmap_139509ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,554 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_TRAIN_indexmap_139509ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,558 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_TRAIN_indexmap_139509ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,559 [Rank 0]: loaded indexed file in 0.021 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,559 [Rank 0]: total number of samples: 643035 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,559 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,642 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,648 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,648 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,648 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: > finished creating indexed dataset in 0.007028 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: number of documents: 167701 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,649 [Rank 0]: document indices in [0, 162502) total of 162502 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,651 [Rank 0]: > Tokens per epoch: 170250515 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,651 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,652 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,659 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007797 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 162502 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 20782 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,663 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003554 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,663 [Rank 0]: > building shuffle index with split [0, 20782) and [20782, 20782) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,666 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002317 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,686 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_TRAIN_indexmap_5032ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,696 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_TRAIN_indexmap_5032ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,697 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_TRAIN_indexmap_5032ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,697 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,697 [Rank 0]: total number of samples: 20783 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,697 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,781 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,789 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,790 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,790 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,790 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: > finished creating indexed dataset in 0.009533 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: number of documents: 62033 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: document indices in [0, 60110) total of 60110 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,791 [Rank 0]: > Tokens per epoch: 73268168 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,793 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,793 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,797 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004324 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 60110 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 8943 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,800 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003085 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,800 [Rank 0]: > building shuffle index with split [0, 8943) and [8943, 8943) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,803 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002478 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,842 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,847 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,847 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_TRAIN_indexmap_3774ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,847 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,848 [Rank 0]: total number of samples: 8944 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,848 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,932 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,949 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,949 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,949 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,949 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,950 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: > finished creating indexed dataset in 0.018653 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: number of documents: 571506 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:37,951 [Rank 0]: document indices in [0, 553789) total of 553789 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:37,954 [Rank 0]: > Tokens per epoch: 142265394 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,955 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:37,955 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:37,976 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.020984 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 553789 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17366 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,981 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004753 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,981 [Rank 0]: > building shuffle index with split [0, 17366) and [17366, 17366) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:37,985 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003595 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,985 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_TRAIN_indexmap_5284ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,991 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_TRAIN_indexmap_5284ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,993 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_TRAIN_indexmap_5284ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:37,994 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:37,994 [Rank 0]: total number of samples: 17367 +[ip-26-0-150-122:0]:2023-06-21 17:27:37,994 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,078 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,095 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,095 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,095 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,095 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: > finished creating indexed dataset in 0.017258 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: number of documents: 6353527 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,096 [Rank 0]: document indices in [0, 6156568) total of 6156568 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:38,177 [Rank 0]: > Tokens per epoch: 15680764197 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,178 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:38,179 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:38,481 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.302279 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6156568 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1914155 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,559 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.077728 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,559 [Rank 0]: > building shuffle index with split [0, 1914155) and [1914155, 1914155) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,612 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.052875 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,613 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_TRAIN_indexmap_615398ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,634 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_TRAIN_indexmap_615398ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,637 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_TRAIN_indexmap_615398ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,640 [Rank 0]: loaded indexed file in 0.027 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,640 [Rank 0]: total number of samples: 1914156 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,640 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,725 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: > finished creating indexed dataset in 0.003068 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: number of documents: 226209 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,728 [Rank 0]: document indices in [0, 219197) total of 219197 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:38,730 [Rank 0]: > Tokens per epoch: 179407601 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,731 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
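
The "total number of samples" reported for each dataset above is consistent with packing one epoch of tokens into fixed 8192-token sequences. A minimal sketch of that arithmetic, assuming the (num_epochs * tokens_per_epoch - 1) // seq_length rule; the rule and the helper name are inferred from the logged values, not quoted from the training code:

def packed_sample_count(tokens_per_epoch: int, seq_length: int, num_epochs: int = 1) -> int:
    # The "- 1" is presumably because each sample needs seq_length + 1 tokens
    # (inputs plus the shifted labels); this reproduces the counts in the log.
    return (num_epochs * tokens_per_epoch - 1) // seq_length

# (tokens per epoch, samples built) pairs copied from the log above:
for tokens, samples in [(1_047_952_508, 127_923), (15_680_764_197, 1_914_155)]:
    assert packed_sample_count(tokens, 8192) == samples

The same relation holds for the other blocks, e.g. 16,598,658 tokens per epoch at 8192 tokens per sample gives the 2,026 samples reported earlier.
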
+[ip-26-0-150-122:0]:2023-06-21 17:27:38,731 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:38,740 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.009260 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 219197 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 21900 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,744 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003982 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,744 [Rank 0]: > building shuffle index with split [0, 21900) and [21900, 21900) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,749 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004086 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,781 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_8051ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,791 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_8051ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,791 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_TRAIN_indexmap_8051ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,792 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,792 [Rank 0]: total number of samples: 21901 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,792 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,925 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,927 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: > finished creating indexed dataset in 0.001779 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: number of documents: 98733 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:38,928 [Rank 0]: document indices in [0, 95672) total of 95672 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:38,929 [Rank 0]: > Tokens per epoch: 476152050 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,931 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:38,931 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:38,939 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007621 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 95672 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 58124 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,943 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003995 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,943 [Rank 0]: > building shuffle index with split [0, 58124) and [58124, 58124) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:38,948 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004955 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,948 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_17612ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,956 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_17612ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,957 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_TRAIN_indexmap_17612ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:38,959 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:38,959 [Rank 0]: total number of samples: 58125 +[ip-26-0-150-122:0]:2023-06-21 17:27:38,959 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,075 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,077 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: > finished creating indexed dataset in 0.001933 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: number of documents: 281016 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,078 [Rank 0]: document indices in [0, 272305) total of 272305 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:39,080 [Rank 0]: > Tokens per epoch: 212250969 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,080 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:39,080 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:39,093 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.012500 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 272305 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 25909 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,097 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003708 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,097 [Rank 0]: > building shuffle index with split [0, 25909) and [25909, 25909) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,099 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002505 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,100 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_TRAIN_indexmap_8932ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,106 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_TRAIN_indexmap_8932ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,107 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_TRAIN_indexmap_8932ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,109 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,109 [Rank 0]: total number of samples: 25910 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,109 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,194 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: > finished creating indexed dataset in 0.016318 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: number of documents: 250834 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,210 [Rank 0]: document indices in [0, 243058) total of 243058 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:39,212 [Rank 0]: > Tokens per epoch: 222150396 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,213 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:39,213 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:39,224 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.011011 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 243058 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 27117 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,228 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004154 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,228 [Rank 0]: > building shuffle index with split [0, 27117) and [27117, 27117) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,231 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002729 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,265 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_TRAIN_indexmap_11448ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,275 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_TRAIN_indexmap_11448ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,276 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_TRAIN_indexmap_11448ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,276 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,276 [Rank 0]: total number of samples: 27118 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,276 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,361 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,373 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,373 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: > finished creating indexed dataset in 0.012663 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: number of documents: 3299965 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,374 [Rank 0]: document indices in [0, 3197666) total of 3197666 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:39,434 [Rank 0]: > Tokens per epoch: 9536019084 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,435 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:39,435 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:39,577 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.141893 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3197666 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1164064 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,615 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.037362 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,615 [Rank 0]: > building shuffle index with split [0, 1164064) and [1164064, 1164064) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,649 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.034142 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,650 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_TRAIN_indexmap_369339ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,670 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_TRAIN_indexmap_369339ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,674 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_TRAIN_indexmap_369339ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:39,676 [Rank 0]: loaded indexed file in 0.026 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,676 [Rank 0]: total number of samples: 1164065 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,676 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,759 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,762 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,763 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,763 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,763 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: > finished creating indexed dataset in 0.004384 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: number of documents: 20071773 +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:39,764 [Rank 0]: document indices in [0, 19449548) total of 19449548 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:40,012 [Rank 0]: > Tokens per epoch: 21964883896 +[ip-26-0-150-122:0]:2023-06-21 17:27:40,014 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:40,014 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:41,160 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 1.146600 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 19449548 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2681260 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,425 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.264485 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,425 [Rank 0]: > building shuffle index with split [0, 2681260) and [2681260, 2681260) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,508 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.082354 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,508 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_TRAIN_indexmap_1093676ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:41,535 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_TRAIN_indexmap_1093676ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:41,539 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_TRAIN_indexmap_1093676ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:41,541 [Rank 0]: loaded indexed file in 0.033 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:41,541 [Rank 0]: total number of samples: 2681261 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,541 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,625 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,643 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: > finished creating indexed dataset in 0.018318 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: number of documents: 19544285 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:41,644 [Rank 0]: document indices in [0, 18938412) total of 18938412 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:41,862 [Rank 0]: > Tokens per epoch: 18328788838 +[ip-26-0-150-122:0]:2023-06-21 17:27:41,863 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:41,863 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:42,981 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 1.117496 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 18938412 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2237400 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,231 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.249909 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,231 [Rank 0]: > building shuffle index with split [0, 2237400) and [2237400, 2237400) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,295 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.063840 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,296 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_TRAIN_indexmap_814030ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:43,328 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_TRAIN_indexmap_814030ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:43,332 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_TRAIN_indexmap_814030ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:43,334 [Rank 0]: loaded indexed file in 0.039 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:43,335 [Rank 0]: total number of samples: 2237401 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,335 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,421 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,437 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,437 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,437 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,437 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: > finished creating indexed dataset in 0.016427 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: number of documents: 21029287 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:43,438 [Rank 0]: document indices in [0, 20377379) total of 20377379 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:43,674 [Rank 0]: > Tokens per epoch: 24642614919 +[ip-26-0-150-122:0]:2023-06-21 17:27:43,675 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:43,675 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:44,910 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 1.234511 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 20377379 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3008131 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,186 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.276196 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,186 [Rank 0]: > building shuffle index with split [0, 3008131) and [3008131, 3008131) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,274 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.087909 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,275 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_TRAIN_indexmap_942595ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:45,282 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_TRAIN_indexmap_942595ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:45,285 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_TRAIN_indexmap_942595ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:45,287 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:45,288 [Rank 0]: total number of samples: 3008132 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,288 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,373 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: > finished creating indexed dataset in 0.023523 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: number of documents: 15683017 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,396 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:45,397 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:45,397 [Rank 0]: document indices in [0, 15196843) total of 15196843 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:45,570 [Rank 0]: > Tokens per epoch: 16296942573 +[ip-26-0-150-122:0]:2023-06-21 17:27:45,572 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:45,572 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:46,384 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.812244 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15196843 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1989372 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,573 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.188375 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,573 [Rank 0]: > building shuffle index with split [0, 1989372) and [1989372, 1989372) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,629 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.055413 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,629 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_TRAIN_indexmap_765976ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:46,638 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_TRAIN_indexmap_765976ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:46,640 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_TRAIN_indexmap_765976ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:46,642 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:46,642 [Rank 0]: total number of samples: 1989373 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,642 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,727 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,744 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,744 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,744 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,744 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: > finished creating indexed dataset in 0.017557 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: number of documents: 12866649 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:46,745 [Rank 0]: document indices in [0, 12467783) total of 12467783 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:46,898 [Rank 0]: > Tokens per epoch: 17087509450 +[ip-26-0-150-122:0]:2023-06-21 17:27:46,899 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:46,899 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:47,537 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.637768 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 12467783 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2085877 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,700 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.163215 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,700 [Rank 0]: > building shuffle index with split [0, 2085877) and [2085877, 2085877) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,761 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.060786 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,762 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_TRAIN_indexmap_759812ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:47,788 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_TRAIN_indexmap_759812ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:47,793 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_TRAIN_indexmap_759812ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:47,795 [Rank 0]: loaded indexed file in 0.033 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:47,795 [Rank 0]: total number of samples: 2085878 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,795 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,879 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,896 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: > finished creating indexed dataset in 0.017633 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: number of documents: 10547331 +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:47,897 [Rank 0]: document indices in [0, 10220364) total of 10220364 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:48,059 [Rank 0]: > Tokens per epoch: 7178711685 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,060 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:48,061 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:48,570 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.509525 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 10220364 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 876307 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,675 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.104243 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,675 [Rank 0]: > building shuffle index with split [0, 876307) and [876307, 876307) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,701 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.026336 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,702 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_TRAIN_indexmap_333613ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,727 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_TRAIN_indexmap_333613ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,730 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_TRAIN_indexmap_333613ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,732 [Rank 0]: loaded indexed file in 0.030 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:48,732 [Rank 0]: total number of samples: 876308 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,732 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,814 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: > finished creating indexed dataset in 0.000670 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: number of documents: 75 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:48,815 [Rank 0]: document indices in [0, 73) total of 73 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:48,816 [Rank 0]: > Tokens per epoch: 153326 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,817 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:48,817 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:48,820 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002213 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 73 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 18 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,825 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005342 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,825 [Rank 0]: > building shuffle index with split [0, 18) and [18, 18) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,827 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002082 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,868 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,872 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,872 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_TRAIN_indexmap_13ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:48,874 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:48,874 [Rank 0]: total number of samples: 19 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,874 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,959 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,973 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,973 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,973 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,973 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: > finished creating indexed dataset in 0.014628 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: number of documents: 161239 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:48,974 [Rank 0]: document indices in [0, 156241) total of 156241 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:48,976 [Rank 0]: > Tokens per epoch: 362410000 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,978 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:48,978 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:48,986 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007948 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 156241 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 44239 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,989 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003449 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,989 [Rank 0]: > building shuffle index with split [0, 44239) and [44239, 44239) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:48,992 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002924 +[ip-26-0-150-122:0]:2023-06-21 17:27:48,993 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_TRAIN_indexmap_17864ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,003 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_TRAIN_indexmap_17864ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,003 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_TRAIN_indexmap_17864ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,004 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,004 [Rank 0]: total number of samples: 44240 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,004 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,086 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,093 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,094 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,094 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,094 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: > finished creating indexed dataset in 0.008249 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: number of documents: 58208 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: document indices in [0, 56404) total of 56404 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,095 [Rank 0]: > Tokens per epoch: 366255320 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,097 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
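The "total number of samples" reported for each shard follows from its "Tokens per epoch" and the 8192-token sequence length: documents are packed back to back and cut into fixed windows, with one extra token reserved for the shifted labels. The exact formula below is an assumption, but it reproduces the values logged above for three shards:

def samples_per_epoch(tokens_per_epoch, seq_len=8192):
    # assumed: floor((tokens - 1) / seq_len); the -1 accounts for the label shift
    return (tokens_per_epoch - 1) // seq_len

assert samples_per_epoch(7_178_711_685) == 876_307  # typescript shard
assert samples_per_epoch(153_326) == 18              # verilog shard
assert samples_per_epoch(362_410_000) == 44_239      # visual-basic shard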
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,097 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,101 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004261 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 56404 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 44708 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,105 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004413 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,106 [Rank 0]: > building shuffle index with split [0, 44708) and [44708, 44708) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,109 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003122 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,146 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_TRAIN_indexmap_11825ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,154 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_TRAIN_indexmap_11825ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,154 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_TRAIN_indexmap_11825ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,155 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,155 [Rank 0]: total number of samples: 44709 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,155 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,239 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,246 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,247 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,247 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,247 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: > finished creating indexed dataset in 0.008633 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: number of documents: 4661 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: document indices in [0, 4517) total of 4517 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,248 [Rank 0]: > Tokens per epoch: 3469924 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,249 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,250 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,253 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003701 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4517 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 423 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,256 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003052 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,257 [Rank 0]: > building shuffle index with split [0, 423) and [423, 423) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,259 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002293 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,262 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,266 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,267 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_TRAIN_indexmap_126ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,268 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,268 [Rank 0]: total number of samples: 424 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,268 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,353 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,364 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: > finished creating indexed dataset in 0.011978 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: number of documents: 93 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: document indices in [0, 90) total of 90 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,365 [Rank 0]: > Tokens per epoch: 74220 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,366 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
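Messages such as "building shuffle index with split [0, 423) and [423, 423)" describe a two-part shuffle: the first range covers the samples of all complete epochs and the second covers a separately shuffled final epoch. For these single-epoch TRAIN shards the second range is empty, which is why both bounds coincide. A small sketch of that construction, written as assumed behaviour rather than the Megatron-LM source:

import numpy as np

def build_shuffle_idx(num_first, total, seed=1234):
    rng = np.random.RandomState(seed)
    first = np.arange(num_first, dtype=np.int64)   # complete-epoch samples, shuffled together
    rng.shuffle(first)
    if num_first == total:                          # no separate last epoch
        return first
    last = np.arange(num_first, total, dtype=np.int64)
    rng.shuffle(last)                               # tail shuffled on its own
    return np.concatenate((first, last))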
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,366 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,369 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002468 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 90 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 9 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,372 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003386 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,372 [Rank 0]: > building shuffle index with split [0, 9) and [9, 9) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,375 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002402 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,375 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_TRAIN_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,379 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_TRAIN_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,382 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_TRAIN_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,383 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,383 [Rank 0]: total number of samples: 10 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,383 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,467 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,475 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,475 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,475 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,475 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: > finished creating indexed dataset in 0.008526 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: number of documents: 7451 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,476 [Rank 0]: document indices in [0, 7220) total of 7220 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,477 [Rank 0]: > Tokens per epoch: 35201031 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,478 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,478 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,482 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004081 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7220 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4297 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,486 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003545 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,486 [Rank 0]: > building shuffle index with split [0, 4297) and [4297, 4297) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,488 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002185 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,488 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_TRAIN_indexmap_1384ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,493 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_TRAIN_indexmap_1384ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,496 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_TRAIN_indexmap_1384ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,498 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,498 [Rank 0]: total number of samples: 4298 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,499 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,583 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,593 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: > finished creating indexed dataset in 0.010884 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: number of documents: 15850 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,594 [Rank 0]: document indices in [0, 15359) total of 15359 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,595 [Rank 0]: > Tokens per epoch: 55447717 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,596 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
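A small off-by-one recurs in every shard: the build step reports e.g. "total number of samples: 4297" for yacc, while the post-load line says 4298. The usual reading is that the sample-idx array stores sample boundaries (document position, offset), so N usable samples need N + 1 rows and the loader prints the row count. A toy illustration of boundary packing; it is illustrative only, not the exact Megatron-LM algorithm:

import numpy as np

def build_sample_boundaries(doc_lengths, seq_len):
    # Pack documents back to back and record a boundary every seq_len tokens.
    boundaries = [(0, 0)]                  # (document position, offset) of sample 0
    doc, offset, remaining = 0, 0, seq_len
    while doc < len(doc_lengths):
        available = doc_lengths[doc] - offset
        if available > remaining:          # the current sample ends inside this document
            offset += remaining
            boundaries.append((doc, offset))
            remaining = seq_len
        else:                              # consume the rest of this document
            remaining -= available
            doc, offset = doc + 1, 0
    return np.array(boundaries, dtype=np.int64)

b = build_sample_boundaries([5, 7, 4], seq_len=4)
print(len(b) - 1, "samples from", b.shape[0], "boundary rows")   # 3 samples, 4 rows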
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,596 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,604 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007724 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15359 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6768 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,608 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003874 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,608 [Rank 0]: > building shuffle index with split [0, 6768) and [6768, 6768) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,616 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.008218 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,616 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_TRAIN_indexmap_2265ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,622 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_TRAIN_indexmap_2265ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,624 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_TRAIN_indexmap_2265ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,626 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,626 [Rank 0]: total number of samples: 6769 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,626 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,711 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,717 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,717 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,717 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: > finished creating indexed dataset in 0.007133 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: number of documents: 42103 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,718 [Rank 0]: document indices in [0, 40798) total of 40798 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,719 [Rank 0]: > Tokens per epoch: 136106399 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,720 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,720 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:49,724 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004394 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 40798 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 16614 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,728 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003517 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,728 [Rank 0]: > building shuffle index with split [0, 16614) and [16614, 16614) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,731 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002808 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,771 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,779 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,779 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_TRAIN_indexmap_629ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:49,780 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,780 [Rank 0]: total number of samples: 16615 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,780 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,864 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,878 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,878 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,878 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,878 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: > finished creating indexed dataset in 0.014641 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: number of documents: 4751547 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:49,879 [Rank 0]: document indices in [0, 4604249) total of 4604249 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:49,952 [Rank 0]: > Tokens per epoch: 2031305386 +[ip-26-0-150-122:0]:2023-06-21 17:27:49,954 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
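The "reading sizes ... reading pointers ... creating numpy buffer of mmap" sequence is the indexed dataset being opened without pulling the token stream into RAM: per-document sizes and byte offsets come from the index, and the token payload is exposed through a NumPy memmap. A generic sketch of that access pattern; the real Megatron .idx binary layout is not reproduced here, and the three file paths plus the uint16 dtype are assumptions:

import numpy as np

class MMapTokenDataset:
    def __init__(self, bin_path, sizes_path, pointers_path, dtype=np.uint16):
        self.sizes = np.load(sizes_path, mmap_mode="r")        # tokens per document
        self.pointers = np.load(pointers_path, mmap_mode="r")  # byte offset per document
        self.buffer = np.memmap(bin_path, dtype=dtype, mode="r")
        self.itemsize = np.dtype(dtype).itemsize

    def __len__(self):
        return len(self.sizes)

    def __getitem__(self, doc):
        start = self.pointers[doc] // self.itemsize
        return self.buffer[start:start + self.sizes[doc]]       # token ids of one document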
+[ip-26-0-150-122:0]:2023-06-21 17:27:49,954 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:50,150 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.196015 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4604249 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 247962 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,184 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.033755 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,184 [Rank 0]: > building shuffle index with split [0, 247962) and [247962, 247962) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,193 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.008546 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,193 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,213 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,214 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,215 [Rank 0]: loaded indexed file in 0.022 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:50,215 [Rank 0]: total number of samples: 247963 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,215 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,297 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: > finished creating indexed dataset in 0.009135 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:50,306 [Rank 0]: number of documents: 3995948 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,307 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:50,307 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:50,307 [Rank 0]: document indices in [0, 3872074) total of 3872074 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:50,382 [Rank 0]: > Tokens per epoch: 1165518004 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,384 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:50,384 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:50,549 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.165272 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3872074 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 142275 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,577 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.026964 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,577 [Rank 0]: > building shuffle index with split [0, 142275) and [142275, 142275) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,582 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005657 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,583 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,590 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,591 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_TRAIN_indexmap_12580ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:50,593 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:50,593 [Rank 0]: total number of samples: 142276 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,593 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,677 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,695 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,695 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,695 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,695 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: > finished creating indexed dataset in 0.018911 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: number of documents: 30982955 +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:50,696 [Rank 0]: document indices in [0, 30022483) total of 30022483 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:51,045 [Rank 0]: > Tokens per epoch: 17478333988 +[ip-26-0-150-122:0]:2023-06-21 17:27:51,046 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:51,047 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:52,985 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 1.938788 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 30022483 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2133585 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,391 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.405525 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,391 [Rank 0]: > building shuffle index with split [0, 2133585) and [2133585, 2133585) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,451 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.060010 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,452 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_TRAIN_indexmap_684334ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:53,496 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_TRAIN_indexmap_684334ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:53,500 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_TRAIN_indexmap_684334ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:53,503 [Rank 0]: loaded indexed file in 0.051 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:53,503 [Rank 0]: total number of samples: 2133586 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,503 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,587 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,603 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,603 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: > finished creating indexed dataset in 0.016830 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:53,604 [Rank 0]: number of documents: 7634718 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,605 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:53,605 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:53,605 [Rank 0]: document indices in [0, 7398042) total of 7398042 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:53,700 [Rank 0]: > Tokens per epoch: 15747857063 +[ip-26-0-150-122:0]:2023-06-21 17:27:53,701 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
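The doc-idx map that took 1.94 s to build for the 30M-document gh_issues shard is, in effect, a shuffled permutation of the shard's document ids, repeated once per epoch (a single epoch for all the large TRAIN shards here). A hedged sketch of that step, assuming the last epoch is not kept separate as the log states:

import numpy as np

def build_doc_idx(num_documents, num_epochs=1, seed=1234):
    # Repeat the document ids once per epoch, then shuffle globally.
    rng = np.random.RandomState(seed)
    doc_idx = np.tile(np.arange(num_documents, dtype=np.int32), num_epochs)
    rng.shuffle(doc_idx)
    return doc_idx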
+[ip-26-0-150-122:0]:2023-06-21 17:27:53,701 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:54,072 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.370953 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7398042 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1922345 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,181 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.108519 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,181 [Rank 0]: > building shuffle index with split [0, 1922345) and [1922345, 1922345) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,235 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.053852 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,235 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_TRAIN_indexmap_402550ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,258 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_TRAIN_indexmap_402550ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,260 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_TRAIN_indexmap_402550ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,262 [Rank 0]: loaded indexed file in 0.027 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,262 [Rank 0]: total number of samples: 1922346 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,262 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,348 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: > finished creating indexed dataset in 0.003456 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: number of documents: 914510 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:54,351 [Rank 0]: document indices in [0, 886160) total of 886160 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:54,358 [Rank 0]: > Tokens per epoch: 2392372458 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,359 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:54,360 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:54,396 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.036130 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 886160 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 292037 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,406 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.010301 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,406 [Rank 0]: > building shuffle index with split [0, 292037) and [292037, 292037) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,415 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.009125 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,416 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_TRAIN_indexmap_89568ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,429 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_TRAIN_indexmap_89568ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,430 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_TRAIN_indexmap_89568ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,431 [Rank 0]: loaded indexed file in 0.015 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,431 [Rank 0]: total number of samples: 292038 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,432 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,516 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,527 [Rank 0]: > finished creating indexed dataset in 0.011625 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,528 [Rank 0]: number of documents: 668743 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,528 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:54,528 [Rank 0]: TRAIN: +[ip-26-0-150-122:0]:2023-06-21 17:27:54,528 [Rank 0]: document indices in [0, 648012) total of 648012 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:54,532 [Rank 0]: > Tokens per epoch: 1927094062 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,533 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:54,533 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:54,557 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.024563 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 648012 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 235240 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,568 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.010390 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,568 [Rank 0]: > building shuffle index with split [0, 235240) and [235240, 235240) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:54,576 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.007555 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,576 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_TRAIN_indexmap_75478ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,588 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_TRAIN_indexmap_75478ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,589 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_TRAIN_indexmap_75478ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:54,590 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:54,590 [Rank 0]: total number of samples: 235241 +[ip-26-0-150-122:0]:2023-06-21 17:27:54,590 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:> building indices for blendable datasets ... 
+[ip-26-0-150-122:0]: > sample ratios: +[ip-26-0-150-122:0]: dataset 0, input: 0.00391159, achieved: 0.00391159 +[ip-26-0-150-122:0]: dataset 1, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 2, input: 0.0702651, achieved: 0.0702651 +[ip-26-0-150-122:0]: dataset 3, input: 0.00232087, achieved: 0.00232087 +[ip-26-0-150-122:0]: dataset 4, input: 0.00110828, achieved: 0.00110827 +[ip-26-0-150-122:0]: dataset 5, input: 0.00740594, achieved: 0.00740593 +[ip-26-0-150-122:0]: dataset 6, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 7, input: 0.00170806, achieved: 0.00170807 +[ip-26-0-150-122:0]: dataset 8, input: 0.00127778, achieved: 0.00127778 +[ip-26-0-150-122:0]: dataset 9, input: 0.000104309, achieved: 0.000104303 +[ip-26-0-150-122:0]: dataset 10, input: 3.91159e-05, achieved: 3.91303e-05 +[ip-26-0-150-122:0]: dataset 11, input: 0.000117348, achieved: 0.000117357 +[ip-26-0-150-122:0]: dataset 12, input: 0.00146033, achieved: 0.00146034 +[ip-26-0-150-122:0]: dataset 13, input: 0.0310058, achieved: 0.0310058 +[ip-26-0-150-122:0]: dataset 14, input: 0.000912704, achieved: 0.000912716 +[ip-26-0-150-122:0]: dataset 15, input: 0.000795356, achieved: 0.000795359 +[ip-26-0-150-122:0]: dataset 16, input: 0.000339004, achieved: 0.000339018 +[ip-26-0-150-122:0]: dataset 17, input: 0.00219049, achieved: 0.00219049 +[ip-26-0-150-122:0]: dataset 18, input: 0.00290761, achieved: 0.00290762 +[ip-26-0-150-122:0]: dataset 19, input: 0.000391159, achieved: 0.000391169 +[ip-26-0-150-122:0]: dataset 20, input: 0.000404197, achieved: 0.00040419 +[ip-26-0-150-122:0]: dataset 21, input: 0.000586738, achieved: 0.000586753 +[ip-26-0-150-122:0]: dataset 22, input: 0.000156463, achieved: 0.000156454 +[ip-26-0-150-122:0]: dataset 23, input: 0.0088793, achieved: 0.00887929 +[ip-26-0-150-122:0]: dataset 24, input: 0.0118782, achieved: 0.0118782 +[ip-26-0-150-122:0]: dataset 25, input: 7.82317e-05, achieved: 7.8227e-05 +[ip-26-0-150-122:0]: dataset 26, input: 0.0582305, achieved: 0.0582305 +[ip-26-0-150-122:0]: dataset 27, input: 0.00075624, achieved: 0.000756228 +[ip-26-0-150-122:0]: dataset 28, input: 0.00290761, achieved: 0.00290762 +[ip-26-0-150-122:0]: dataset 29, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 30, input: 0.00162983, achieved: 0.00162981 +[ip-26-0-150-122:0]: dataset 31, input: 0.00134298, achieved: 0.00134298 +[ip-26-0-150-122:0]: dataset 32, input: 0.00170806, achieved: 0.00170804 +[ip-26-0-150-122:0]: dataset 33, input: 0.00374208, achieved: 0.00374208 +[ip-26-0-150-122:0]: dataset 34, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 35, input: 6.51931e-05, achieved: 6.5206e-05 +[ip-26-0-150-122:0]: dataset 36, input: 0.00432882, achieved: 0.00432883 +[ip-26-0-150-122:0]: dataset 37, input: 3.91159e-05, achieved: 3.91303e-05 +[ip-26-0-150-122:0]: dataset 38, input: 0.000247734, achieved: 0.000247736 +[ip-26-0-150-122:0]: dataset 39, input: 0.000508506, achieved: 0.000508493 +[ip-26-0-150-122:0]: dataset 40, input: 0.00678008, achieved: 0.00678008 +[ip-26-0-150-122:0]: dataset 41, input: 2.60772e-05, achieved: 2.60757e-05 +[ip-26-0-150-122:0]: dataset 42, input: 0.00203403, achieved: 0.00203404 +[ip-26-0-150-122:0]: dataset 43, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 44, input: 9.12704e-05, achieved: 9.12817e-05 +[ip-26-0-150-122:0]: dataset 45, input: 0.000534584, achieved: 0.000534568 +[ip-26-0-150-122:0]: dataset 46, input: 0.00477214, achieved: 0.00477212 
+[ip-26-0-150-122:0]: dataset 47, input: 0.000730163, achieved: 0.000730153 +[ip-26-0-150-122:0]: dataset 48, input: 3.91159e-05, achieved: 3.91303e-05 +[ip-26-0-150-122:0]: dataset 49, input: 1.30386e-06, achieved: 1.3122e-06 +[ip-26-0-150-122:0]: dataset 50, input: 0.000299888, achieved: 0.000299887 +[ip-26-0-150-122:0]: dataset 51, input: 2.60772e-05, achieved: 2.60757e-05 +[ip-26-0-150-122:0]: dataset 52, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 53, input: 0.00611511, achieved: 0.0061151 +[ip-26-0-150-122:0]: dataset 54, input: 0.000456352, achieved: 0.000456341 +[ip-26-0-150-122:0]: dataset 55, input: 0.000430275, achieved: 0.000430266 +[ip-26-0-150-122:0]: dataset 56, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 57, input: 0.00402893, achieved: 0.00402895 +[ip-26-0-150-122:0]: dataset 58, input: 0.000599777, achieved: 0.000599774 +[ip-26-0-150-122:0]: dataset 59, input: 0.000260772, achieved: 0.000260757 +[ip-26-0-150-122:0]: dataset 60, input: 6.51931e-05, achieved: 6.5206e-05 +[ip-26-0-150-122:0]: dataset 61, input: 5.21545e-05, achieved: 5.21514e-05 +[ip-26-0-150-122:0]: dataset 62, input: 0.0144598, achieved: 0.0144598 +[ip-26-0-150-122:0]: dataset 63, input: 0.000521545, achieved: 0.000521547 +[ip-26-0-150-122:0]: dataset 64, input: 0.000391159, achieved: 0.000391169 +[ip-26-0-150-122:0]: dataset 65, input: 0.000547622, achieved: 0.000547623 +[ip-26-0-150-122:0]: dataset 66, input: 0.0637849, achieved: 0.0637849 +[ip-26-0-150-122:0]: dataset 67, input: 0.000834472, achieved: 0.000834455 +[ip-26-0-150-122:0]: dataset 68, input: 0.00182541, achieved: 0.0018254 +[ip-26-0-150-122:0]: dataset 69, input: 0.000925742, achieved: 0.000925737 +[ip-26-0-150-122:0]: dataset 70, input: 0.00118651, achieved: 0.00118653 +[ip-26-0-150-122:0]: dataset 71, input: 0.0382814, achieved: 0.0382814 +[ip-26-0-150-122:0]: dataset 72, input: 0.113358, achieved: 0.113358 +[ip-26-0-150-122:0]: dataset 73, input: 0.0843729, achieved: 0.0843729 +[ip-26-0-150-122:0]: dataset 74, input: 0.0976984, achieved: 0.0976984 +[ip-26-0-150-122:0]: dataset 75, input: 0.0793922, achieved: 0.0793922 +[ip-26-0-150-122:0]: dataset 76, input: 0.0787533, achieved: 0.0787533 +[ip-26-0-150-122:0]: dataset 77, input: 0.0345784, achieved: 0.0345784 +[ip-26-0-150-122:0]: dataset 78, input: 1.30386e-06, achieved: 1.3122e-06 +[ip-26-0-150-122:0]: dataset 79, input: 0.00185148, achieved: 0.00185147 +[ip-26-0-150-122:0]: dataset 80, input: 0.00122563, achieved: 0.00122562 +[ip-26-0-150-122:0]: dataset 81, input: 1.30386e-05, achieved: 1.30547e-05 +[ip-26-0-150-122:0]: dataset 82, input: 2.60772e-07, achieved: 2.69168e-07 +[ip-26-0-150-122:0]: dataset 83, input: 0.000143425, achieved: 0.000143433 +[ip-26-0-150-122:0]: dataset 84, input: 0.000234695, achieved: 0.000234681 +[ip-26-0-150-122:0]: dataset 85, input: 6.51931e-05, achieved: 6.5206e-05 +[ip-26-0-150-122:0]: dataset 86, input: 0.00130386, achieved: 0.00130385 +[ip-26-0-150-122:0]: dataset 87, input: 0.00130386, achieved: 0.00130385 +[ip-26-0-150-122:0]: dataset 88, input: 0.0709301, achieved: 0.0709301 +[ip-26-0-150-122:0]: dataset 89, input: 0.0417236, achieved: 0.0417236 +[ip-26-0-150-122:0]: dataset 90, input: 0.0092835, achieved: 0.00928348 +[ip-26-0-150-122:0]: dataset 91, input: 0.00782317, achieved: 0.00782318 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,965 [Rank 0]: > elapsed time for building blendable dataset indices: 3.29 (sec) +[ip-26-0-150-122:0]:2023-06-21 17:27:57,966 [Rank 0]: > building dataset index ... 
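With all 92 component datasets loaded, they are blended by weight: "input" is the requested sampling fraction and "achieved" is the fraction actually realised once every global sample index has been assigned to a concrete dataset, which is why the two columns agree to five or six significant digits. Below is a pure-Python sketch of one greedy assignment that behaves this way; Megatron-LM does this in a compiled helper (which is what finishes in 3.29 s above), so this version is only for illustration and the exact tie-breaking is an assumption:

import numpy as np

def build_blending_indices(weights, total_samples):
    weights = np.asarray(weights, dtype=np.float64)
    weights = weights / weights.sum()
    counts = np.zeros(len(weights), dtype=np.int64)
    dataset_index = np.empty(total_samples, dtype=np.int64)        # which dataset
    dataset_sample_index = np.empty(total_samples, dtype=np.int64) # index within it
    for i in range(total_samples):
        target = weights * max(i, 1)            # ideal cumulative count per dataset
        j = int(np.argmax(target - counts))     # most under-served dataset so far
        dataset_index[i] = j
        dataset_sample_index[i] = counts[j]
        counts[j] += 1
    achieved = counts / total_samples           # the "achieved" column
    return dataset_index, dataset_sample_index, achieved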
+[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: > finished creating indexed dataset in 0.003214 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: number of documents: 2721616 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: VALID_css: +[ip-26-0-150-122:0]:2023-06-21 17:27:57,969 [Rank 0]: document indices in [2637246, 2718894) total of 81648 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:57,970 [Rank 0]: > Tokens per epoch: 142752310 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,972 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,972 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:57,977 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005069 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 81648 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17425 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,980 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002764 +[ip-26-0-150-122:0]:2023-06-21 17:27:57,980 [Rank 0]: > building shuffle index with split [0, 17425) and [17425, 17425) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:57,983 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002703 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,165 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_css_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,174 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_css_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,174 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_css_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,175 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,175 [Rank 0]: total number of samples: 17426 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,175 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,259 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: creating memory view of numpy buffer... 
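The same machinery now runs for the per-language validation sets: each VALID_* split is a contiguous slice carved from near the tail of its shard's document index (for css, documents [2637246, 2718894) out of 2721616). A sketch of turning split fractions into such [start, end) ranges; the actual split string is not printed in this part of the log, but a 0.969/0.030/0.001 split reproduces every TRAIN and VALID range visible in this section, so it is used in the example:

def split_ranges(fractions, num_documents):
    # Cumulative fractions -> [start, end) document ranges, last bound clamped.
    fractions = [f / sum(fractions) for f in fractions]
    bounds, acc = [0], 0.0
    for f in fractions:
        acc += f
        bounds.append(int(round(acc * num_documents)))
    bounds[-1] = num_documents
    return [(bounds[i], bounds[i + 1]) for i in range(len(fractions))]

print(split_ranges([0.969, 0.030, 0.001], 2_721_616))
# [(0, 2637246), (2637246, 2718894), (2718894, 2721616)] -> 81648 css VALID documents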
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: > finished creating indexed dataset in 0.000721 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: number of documents: 968 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: VALID_prolog: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: document indices in [938, 967) total of 29 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,260 [Rank 0]: > Tokens per epoch: 55028 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,263 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,263 [Rank 0]: > last epoch number of samples (6) is larger than 80% of number of samples per epoch (6), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,266 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003192 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 29 +[ip-26-0-150-122:0]: number of epochs: 305 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,270 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004219 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,270 [Rank 0]: > building shuffle index with split [0, 2048) and [2048, 2048) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,273 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002822 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,324 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_prolog_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,328 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_prolog_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,329 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_prolog_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,331 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,331 [Rank 0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,331 [Rank 0]: total number of epochs: 305 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,415 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,417 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: > finished creating indexed dataset in 0.002199 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: number of documents: 8536791 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: VALID_c: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,418 [Rank 0]: document indices in [8272150, 8528254) total of 256104 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,420 [Rank 0]: > Tokens per epoch: 613576495 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,423 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,423 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,434 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.011330 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 256104 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 74899 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,439 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004647 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,439 [Rank 0]: > building shuffle index with split [0, 74899) and [74899, 74899) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,446 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.006539 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,458 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_c_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,467 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_c_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,468 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_c_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,468 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,469 [Rank 0]: total number of samples: 74900 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,469 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,551 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: > finished creating indexed dataset in 0.001810 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: number of documents: 158792 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,553 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,554 [Rank 0]: VALID_fortran: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,554 [Rank 0]: document indices in [153869, 158633) total of 4764 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,554 [Rank 0]: > Tokens per epoch: 18815887 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,556 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,556 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,560 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003936 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4764 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2296 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,563 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003311 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,563 [Rank 0]: > building shuffle index with split [0, 2296) and [2296, 2296) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,566 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003001 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,576 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_fortran_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,580 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_fortran_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,582 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_fortran_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,585 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,585 [Rank 0]: total number of samples: 2297 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,585 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,667 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,669 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: > finished creating indexed dataset in 0.002075 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: number of documents: 153194 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: VALID_solidity: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: document indices in [148445, 153041) total of 4596 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,670 [Rank 0]: > Tokens per epoch: 8220293 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,672 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,673 [Rank 0]: > last epoch number of samples (42) is smaller than 80% of number of samples per epoch (1003), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:58,676 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003494 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4596 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3010 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,683 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006305 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,683 [Rank 0]: > building shuffle index with split [0, 2006) and [2006, 3010) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,686 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003234 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,730 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_solidity_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,735 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_solidity_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,735 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_solidity_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,736 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,736 [Rank 0]: total number of samples: 3011 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,736 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,818 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,820 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,820 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,820 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,820 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: creating memory view of numpy buffer... 
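The VALID_solidity block shows the other branch of the separate_last_epoch decision: with 3 epochs, only 42 of the 2048 requested samples fall in the final epoch, well under 80% of the ~1003 samples a full epoch provides, so the last epoch is shuffled separately (compare VALID_prolog above, where 6 of 6 kept it at False). A minimal sketch of that rule, assuming the thresholds printed in the warning messages:

    # Sketch, assuming the decision rule implied by the log messages above.
    def separate_last_epoch(tokens_per_epoch, seq_length, num_samples, num_epochs):
        samples_per_epoch = (tokens_per_epoch - 1) // seq_length
        # samples already covered by the first (num_epochs - 1) complete epochs
        from_full_epochs = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_length
        last_epoch_samples = num_samples - from_full_epochs
        return last_epoch_samples < 0.80 * samples_per_epoch

    # VALID_solidity: 42 < 0.8 * 1003 -> True, hence the split shuffle ranges below
    print(separate_last_epoch(8220293, 8192, 2048, 3))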
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: > finished creating indexed dataset in 0.002482 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: number of documents: 2239354 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: VALID_kotlin: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,821 [Rank 0]: document indices in [2169934, 2237115) total of 67181 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,822 [Rank 0]: > Tokens per epoch: 43085225 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,824 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,824 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,829 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004974 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 67181 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 5259 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,832 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002326 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,832 [Rank 0]: > building shuffle index with split [0, 5259) and [5259, 5259) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,837 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004907 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,886 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_kotlin_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,892 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_kotlin_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,893 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_kotlin_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:58,894 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,894 [Rank 0]: total number of samples: 5260 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,894 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,976 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: > finished creating indexed dataset in 0.000769 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: number of documents: 523 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,977 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,978 [Rank 0]: VALID_literate-agda: +[ip-26-0-150-122:0]:2023-06-21 17:27:58,978 [Rank 0]: document indices in [507, 522) total of 15 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:58,978 [Rank 0]: > Tokens per epoch: 46791 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,980 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,980 [Rank 0]: > last epoch number of samples (4) is larger than 80% of number of samples per epoch (5), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:58,984 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003511 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15 +[ip-26-0-150-122:0]: number of epochs: 359 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2050 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,987 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003382 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,987 [Rank 0]: > building shuffle index with split [0, 2050) and [2050, 2050) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:58,990 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003127 +[ip-26-0-150-122:0]:2023-06-21 17:27:58,993 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_literate-agda_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,000 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_literate-agda_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,001 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_literate-agda_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,001 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,001 [Rank 0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,001 [Rank 0]: total number of epochs: 359 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,084 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,086 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: > finished creating indexed dataset in 0.002306 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: number of documents: 295364 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: VALID_julia: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: document indices in [286208, 295069) total of 8861 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,087 [Rank 0]: > Tokens per epoch: 13589070 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,090 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,090 [Rank 0]: > last epoch number of samples (390) is smaller than 80% of number of samples per epoch (1658), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,094 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003983 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8861 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3317 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,099 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004236 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,099 [Rank 0]: > building shuffle index with split [0, 1658) and [1658, 3317) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,101 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002714 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,102 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_julia_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,107 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_julia_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,107 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_julia_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,109 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,109 [Rank 0]: total number of samples: 3318 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,109 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,192 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,194 [Rank 0]: > finished creating indexed dataset in 0.002203 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: number of documents: 210816 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: VALID_java-server-pages: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: document indices in [204281, 210605) total of 6324 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,195 [Rank 0]: > Tokens per epoch: 8481384 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,198 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,198 [Rank 0]: > last epoch number of samples (1013) is larger than 80% of number of samples per epoch (1035), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:59,201 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002964 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6324 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2070 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,205 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003585 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,205 [Rank 0]: > building shuffle index with split [0, 2070) and [2070, 2070) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,207 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002521 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,213 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_java-server-pages_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,218 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_java-server-pages_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,218 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_java-server-pages_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,220 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,220 [Rank 0]: total number of samples: 2071 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,220 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,304 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,304 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,304 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,304 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: > finished creating indexed dataset in 0.000721 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: number of documents: 5001 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: VALID_isabelle: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: document indices in [4846, 4996) total of 150 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,305 [Rank 0]: > Tokens per epoch: 1014769 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,308 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,308 [Rank 0]: > last epoch number of samples (67) is smaller than 80% of number of samples per epoch (123), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,311 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003356 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 150 +[ip-26-0-150-122:0]: number of epochs: 17 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2105 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,314 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002580 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,314 [Rank 0]: > building shuffle index with split [0, 1981) and [1981, 2105) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,318 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003952 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,319 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_isabelle_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,326 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_isabelle_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,326 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_isabelle_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,327 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,327 [Rank 0]: total number of samples: 2106 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,327 [Rank 0]: total number of epochs: 17 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,409 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: creating memory view of numpy buffer... 
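Every split's cached maps follow the same naming pattern, visible in the isabelle paths above: the requested sample count, sequence length, and shuffle seed are baked into the filename, so a cached index is only reused when all three match. A small sketch of that pattern (the helper is hypothetical; the suffix format is taken directly from the log):

    # Sketch of the index-map naming pattern seen in the log paths.
    def index_map_paths(data_prefix, split_name, num_samples, seq_length, seed):
        stem = f"{data_prefix}_{split_name}_indexmap_{num_samples}ns_{seq_length}sl_{seed}s"
        return {kind: f"{stem}_{kind}_idx.npy" for kind in ("doc", "sample", "shuffle")}

    paths = index_map_paths(
        "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/"
        "gpt2-preprocessed_content_document",
        "VALID_isabelle", 2048, 8192, 1234)
    # paths["doc"] ends with "_VALID_isabelle_indexmap_2048ns_8192sl_1234s_doc_idx.npy"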
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: > finished creating indexed dataset in 0.000761 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: number of documents: 8042 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,410 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,411 [Rank 0]: VALID_idris: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,411 [Rank 0]: document indices in [7793, 8034) total of 241 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,411 [Rank 0]: > Tokens per epoch: 225513 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,414 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,414 [Rank 0]: > last epoch number of samples (11) is smaller than 80% of number of samples per epoch (27), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,418 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003664 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 241 +[ip-26-0-150-122:0]: number of epochs: 75 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2064 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,420 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002453 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,420 [Rank 0]: > building shuffle index with split [0, 2037) and [2037, 2064) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,423 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003065 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,424 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_idris_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,431 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_idris_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,431 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_idris_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,432 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,432 [Rank 0]: total number of samples: 2065 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,432 [Rank 0]: total number of epochs: 75 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,514 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: > finished creating indexed dataset in 0.000802 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: number of documents: 16870 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,515 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,516 [Rank 0]: VALID_lean: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,516 [Rank 0]: document indices in [16347, 16853) total of 506 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,516 [Rank 0]: > Tokens per epoch: 1042103 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,518 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,519 [Rank 0]: > last epoch number of samples (13) is smaller than 80% of number of samples per epoch (127), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,522 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003744 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 506 +[ip-26-0-150-122:0]: number of epochs: 17 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2162 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,526 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003023 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,526 [Rank 0]: > building shuffle index with split [0, 2035) and [2035, 2162) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,529 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003336 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,529 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_lean_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,534 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_lean_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,534 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_lean_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,535 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,535 [Rank 0]: total number of samples: 2163 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,535 [Rank 0]: total number of epochs: 17 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,618 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,620 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: > finished creating indexed dataset in 0.002193 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: number of documents: 267627 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: VALID_powershell: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: document indices in [259331, 267359) total of 8028 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,621 [Rank 0]: > Tokens per epoch: 8559847 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,624 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,624 [Rank 0]: > last epoch number of samples (1004) is larger than 80% of number of samples per epoch (1044), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:59,627 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002673 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8028 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2089 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,630 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003070 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,630 [Rank 0]: > building shuffle index with split [0, 2089) and [2089, 2089) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,636 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005995 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,636 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_powershell_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,641 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_powershell_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,643 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_powershell_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,646 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,646 [Rank 0]: total number of samples: 2090 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,646 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,729 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: > finished creating indexed dataset in 0.002337 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: number of documents: 4700526 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,731 [Rank 0]: VALID_go: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,732 [Rank 0]: document indices in [4554810, 4695825) total of 141015 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,733 [Rank 0]: > Tokens per epoch: 253353715 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,735 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,735 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:27:59,743 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007701 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 141015 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 30926 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,748 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004221 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,748 [Rank 0]: > building shuffle index with split [0, 30926) and [30926, 30926) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,752 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004390 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,753 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_go_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,758 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_go_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,758 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_go_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,760 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,760 [Rank 0]: total number of samples: 30927 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,760 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,842 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,843 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: creating memory view of numpy buffer... 
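VALID_go is the common single-epoch case: one pass over the 253M-token split already yields far more than the 2048 requested samples, so the whole epoch is indexed (30,926 samples) and no last-epoch handling is needed. The reloaded sample-idx then reports 30,927, presumably because it stores sample boundaries rather than samples (N sequences need N + 1 offsets). The arithmetic, as a sketch:

    # Sketch of the single-epoch sample count for VALID_go.
    tokens_per_epoch = 253_353_715   # "Tokens per epoch" above
    seq_length = 8_192
    num_samples = (tokens_per_epoch - 1) // seq_length
    print(num_samples)       # -> 30926, "total number of samples" as built
    print(num_samples + 1)   # -> 30927, as reported after loading the boundaries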
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: > finished creating indexed dataset in 0.001406 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: number of documents: 98447 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: VALID_erlang: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: document indices in [95395, 98349) total of 2954 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,844 [Rank 0]: > Tokens per epoch: 6597590 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,846 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,847 [Rank 0]: > last epoch number of samples (438) is smaller than 80% of number of samples per epoch (805), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,850 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003486 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2954 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2416 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,854 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003487 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,854 [Rank 0]: > building shuffle index with split [0, 1610) and [1610, 2416) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,857 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003391 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,864 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_erlang_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,868 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_erlang_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,870 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_erlang_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,872 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,873 [Rank 0]: total number of samples: 2417 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,873 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,956 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,957 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: > finished creating indexed dataset in 0.001523 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: number of documents: 124066 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: VALID_f-sharp: +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: document indices in [120220, 123942) total of 3722 documents +[ip-26-0-150-122:0]:2023-06-21 17:27:59,958 [Rank 0]: > Tokens per epoch: 4694260 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,961 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,961 [Rank 0]: > last epoch number of samples (329) is smaller than 80% of number of samples per epoch (573), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:27:59,964 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003069 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3722 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2292 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,968 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003844 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,968 [Rank 0]: > building shuffle index with split [0, 1719) and [1719, 2292) ... +[ip-26-0-150-122:0]:2023-06-21 17:27:59,970 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002148 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,971 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_f-sharp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,976 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_f-sharp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,978 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_f-sharp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:27:59,978 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:27:59,979 [Rank 0]: total number of samples: 2293 +[ip-26-0-150-122:0]:2023-06-21 17:27:59,979 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,061 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: > finished creating indexed dataset in 0.001108 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: number of documents: 30934 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: VALID_ada: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,062 [Rank 0]: document indices in [29975, 30903) total of 928 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,063 [Rank 0]: > Tokens per epoch: 2230554 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,066 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,066 [Rank 0]: > last epoch number of samples (143) is smaller than 80% of number of samples per epoch (272), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:00,070 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004065 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 928 +[ip-26-0-150-122:0]: number of epochs: 8 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2178 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,074 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004121 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,074 [Rank 0]: > building shuffle index with split [0, 1905) and [1905, 2178) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,077 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002964 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,122 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_ada_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_ada_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_ada_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: total number of samples: 2179 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,131 [Rank 0]: total number of epochs: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,215 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,216 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: > finished creating indexed dataset in 0.001481 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: number of documents: 110981 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: VALID_pascal: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: document indices in [107541, 110870) total of 3329 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,217 [Rank 0]: > Tokens per epoch: 21526929 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,219 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,219 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,222 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003074 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3329 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2627 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,225 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002768 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,225 [Rank 0]: > building shuffle index with split [0, 2627) and [2627, 2627) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,229 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004297 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,232 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_pascal_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,236 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_pascal_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,237 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_pascal_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,237 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,237 [Rank 0]: total number of samples: 2628 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,237 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,320 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,322 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,322 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,322 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,322 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: > finished creating indexed dataset in 0.002411 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: number of documents: 365491 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: VALID_perl: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: document indices in [354161, 365126) total of 10965 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,323 [Rank 0]: > Tokens per epoch: 25729670 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,325 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,325 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,329 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003576 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 10965 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3140 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,331 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002792 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,332 [Rank 0]: > building shuffle index with split [0, 3140) and [3140, 3140) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,335 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003522 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,341 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_perl_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,346 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_perl_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,346 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_perl_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,347 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,347 [Rank 0]: total number of samples: 3141 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,347 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,430 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,430 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,430 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: > finished creating indexed dataset in 0.000989 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: number of documents: 39042 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: VALID_r: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: document indices in [37832, 39003) total of 1171 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,431 [Rank 0]: > Tokens per epoch: 2880088 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,434 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,434 [Rank 0]: > last epoch number of samples (291) is larger than 80% of number of samples per epoch (351), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,438 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003234 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1171 +[ip-26-0-150-122:0]: number of epochs: 6 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2109 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,441 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003574 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,441 [Rank 0]: > building shuffle index with split [0, 2109) and [2109, 2109) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,444 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002730 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,447 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_r_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,453 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_r_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,458 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_r_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,459 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,459 [Rank 0]: total number of samples: 2110 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,459 [Rank 0]: total number of epochs: 6 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,542 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: > finished creating indexed dataset in 0.001363 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,543 [Rank 0]: number of documents: 97167 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,544 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,544 [Rank 0]: VALID_protocol-buffer: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,544 [Rank 0]: document indices in [94155, 97070) total of 2915 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,544 [Rank 0]: > Tokens per epoch: 2614634 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,547 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,547 [Rank 0]: > last epoch number of samples (133) is smaller than 80% of number of samples per epoch (319), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:00,551 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003422 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2915 +[ip-26-0-150-122:0]: number of epochs: 7 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2234 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,554 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003362 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,554 [Rank 0]: > building shuffle index with split [0, 1915) and [1915, 2234) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,557 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002918 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,557 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_protocol-buffer_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,562 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_protocol-buffer_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,562 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_protocol-buffer_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,563 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,563 [Rank 0]: total number of samples: 2235 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,563 [Rank 0]: total number of epochs: 7 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,646 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,647 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: creating memory view of numpy buffer... 
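When separate_last_epoch is True, the shuffle index itself is built in two pieces, as the "[0, 1915) and [1915, 2234)" message for VALID_protocol-buffer shows: the samples from the complete epochs and the samples from the final epoch are each permuted within their own range, so truncating to the requested count never mixes part of a partially used last epoch into the rest (when it is False, the second range is empty, e.g. "[2089, 2089)" for powershell). A minimal sketch of that construction; names and seed handling are assumptions:

    # Sketch: two independently shuffled ranges, concatenated.
    import numpy as np

    def build_shuffle_idx(num_first, total, seed=1234):
        rng = np.random.RandomState(seed)
        first = np.arange(num_first, dtype=np.int64)         # samples from full epochs
        rng.shuffle(first)
        last = np.arange(num_first, total, dtype=np.int64)   # last-epoch samples
        rng.shuffle(last)
        return np.concatenate((first, last))

    shuffle_idx = build_shuffle_idx(1915, 2234)  # ranges from the log line above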
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: > finished creating indexed dataset in 0.002081 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: number of documents: 186375 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: VALID_cmake: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: document indices in [180597, 186189) total of 5592 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,648 [Rank 0]: > Tokens per epoch: 4338734 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,651 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,651 [Rank 0]: > last epoch number of samples (460) is larger than 80% of number of samples per epoch (529), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,655 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003150 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5592 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2118 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,658 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003588 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,658 [Rank 0]: > building shuffle index with split [0, 2118) and [2118, 2118) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,661 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002840 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,666 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_cmake_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,671 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_cmake_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,672 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_cmake_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,673 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,673 [Rank 0]: total number of samples: 2119 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,673 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,757 [Rank 0]: > finished creating indexed dataset in 0.000735 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: number of documents: 9226 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: VALID_sas: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: document indices in [8940, 9217) total of 277 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,758 [Rank 0]: > Tokens per epoch: 1021218 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,761 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,761 [Rank 0]: > last epoch number of samples (54) is smaller than 80% of number of samples per epoch (124), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:00,764 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002904 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 277 +[ip-26-0-150-122:0]: number of epochs: 17 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2119 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,768 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003730 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,768 [Rank 0]: > building shuffle index with split [0, 1994) and [1994, 2119) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,771 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002562 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,771 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_sas_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,775 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_sas_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,776 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_sas_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,777 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,777 [Rank 0]: total number of samples: 2120 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,777 [Rank 0]: total number of epochs: 17 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,860 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,862 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,862 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,862 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: creating memory view of numpy buffer... 
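Every split writes and immediately reloads three cached arrays whose paths follow the same pattern. The sketch below spells out how the suffix seen in the load messages decodes, under the assumption (consistent with Megatron-LM's index-map naming, but not stated in this log itself) that 2048ns / 8192sl / 1234s stand for the requested number of samples, the sequence length and the shuffle seed; the helper name indexmap_prefix is illustrative only.

# Hypothetical helper reproducing the cache-file names visible above.
def indexmap_prefix(data_prefix, name, num_samples, seq_length, seed):
    return "{}_{}_indexmap_{}ns_{}sl_{}s".format(
        data_prefix, name, num_samples, seq_length, seed)

prefix = indexmap_prefix(
    "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/"
    "gpt2-preprocessed_content_document",
    "VALID_sas", 2048, 8192, 1234)
doc_idx_file = prefix + "_doc_idx.npy"        # document order for each epoch
sample_idx_file = prefix + "_sample_idx.npy"  # (document, offset) boundary per sample
shuffle_idx_file = prefix + "_shuffle_idx.npy"  # shuffled sample order

Because the three .npy files are built once on rank 0 and then reloaded, the "WARNING: could not find index map files" lines should only appear on the first run for a given combination of sample count, sequence length and seed; later runs with the same settings reuse the cached arrays.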
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: > finished creating indexed dataset in 0.002244 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: number of documents: 3390320 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: VALID_ruby: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,863 [Rank 0]: document indices in [3285220, 3386930) total of 101710 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,864 [Rank 0]: > Tokens per epoch: 61345928 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,867 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,867 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,872 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005407 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 101710 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7488 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,876 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003428 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,876 [Rank 0]: > building shuffle index with split [0, 7488) and [7488, 7488) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,879 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002558 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,879 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_ruby_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,888 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_ruby_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,888 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_ruby_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,889 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,889 [Rank 0]: total number of samples: 7489 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,889 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,971 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: > finished creating indexed dataset in 0.002114 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: number of documents: 1380468 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: VALID_rust: +[ip-26-0-150-122:0]:2023-06-21 17:28:00,973 [Rank 0]: document indices in [1337673, 1379088) total of 41415 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:00,974 [Rank 0]: > Tokens per epoch: 81845020 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,976 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,977 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:00,981 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004022 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 41415 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 9990 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,984 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003029 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,984 [Rank 0]: > building shuffle index with split [0, 9990) and [9990, 9990) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:00,988 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004048 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,988 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_rust_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,996 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_rust_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,997 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_rust_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:00,997 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:00,997 [Rank 0]: total number of samples: 9991 +[ip-26-0-150-122:0]:2023-06-21 17:28:00,997 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,079 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: > finished creating indexed dataset in 0.000731 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: number of documents: 5386 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,080 [Rank 0]: VALID_rmarkdown: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,081 [Rank 0]: document indices in [5219, 5381) total of 162 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,081 [Rank 0]: > Tokens per epoch: 626200 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,083 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,083 [Rank 0]: > last epoch number of samples (61) is larger than 80% of number of samples per epoch (76), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,087 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003994 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 162 +[ip-26-0-150-122:0]: number of epochs: 27 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2063 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,091 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003976 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,091 [Rank 0]: > building shuffle index with split [0, 2063) and [2063, 2063) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,094 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002503 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,094 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_rmarkdown_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,099 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_rmarkdown_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,100 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_rmarkdown_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,100 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,100 [Rank 0]: total number of samples: 2064 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,100 [Rank 0]: total number of epochs: 27 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,182 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,184 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,184 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: > finished creating indexed dataset in 0.002401 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: number of documents: 10801285 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: VALID_c-sharp: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,185 [Rank 0]: document indices in [10466445, 10790484) total of 324039 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,187 [Rank 0]: > Tokens per epoch: 318261515 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,190 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,190 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,204 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.013754 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 324039 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 38850 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,208 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004621 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,209 [Rank 0]: > building shuffle index with split [0, 38850) and [38850, 38850) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,213 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004475 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,213 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_c-sharp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,223 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_c-sharp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,224 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_c-sharp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,224 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,224 [Rank 0]: total number of samples: 38851 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,224 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,307 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: > finished creating indexed dataset in 0.002165 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: number of documents: 587748 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,309 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,310 [Rank 0]: VALID_smalltalk: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,310 [Rank 0]: document indices in [569528, 587160) total of 17632 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,310 [Rank 0]: > Tokens per epoch: 6393705 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,313 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,313 [Rank 0]: > last epoch number of samples (488) is smaller than 80% of number of samples per epoch (780), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:01,318 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005171 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17632 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2341 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,324 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005542 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,324 [Rank 0]: > building shuffle index with split [0, 1560) and [1560, 2341) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,328 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004037 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,329 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_smalltalk_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,334 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_smalltalk_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,335 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_smalltalk_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,335 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,335 [Rank 0]: total number of samples: 2342 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,335 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,418 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: > finished creating indexed dataset in 0.002215 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: number of documents: 541454 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: VALID_haskell: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,420 [Rank 0]: document indices in [524669, 540913) total of 16244 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,421 [Rank 0]: > Tokens per epoch: 19105324 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,423 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,423 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,427 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004004 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16244 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2332 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,431 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003378 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,431 [Rank 0]: > building shuffle index with split [0, 2332) and [2332, 2332) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,434 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002811 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,436 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_haskell_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,440 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_haskell_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,441 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_haskell_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,442 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,442 [Rank 0]: total number of samples: 2333 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,442 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,525 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: > finished creating indexed dataset in 0.000686 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: number of documents: 1152 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: VALID_maple: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: document indices in [1116, 1151) total of 35 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,526 [Rank 0]: > Tokens per epoch: 30587 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,529 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,529 [Rank 0]: > last epoch number of samples (2) is larger than 80% of number of samples per epoch (3), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,532 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003190 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 35 +[ip-26-0-150-122:0]: number of epochs: 549 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,535 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002869 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,536 [Rank 0]: > building shuffle index with split [0, 2049) and [2049, 2049) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,537 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001853 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,540 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_maple_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,545 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_maple_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,546 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_maple_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,548 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,548 [Rank 0]: total number of samples: 2050 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,548 [Rank 0]: total number of epochs: 549 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,631 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: creating memory view of numpy buffer... 
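The very large epoch counts on the smallest splits are the same arithmetic applied to tiny corpora: VALID_maple above holds only 30,587 tokens per epoch, and filling 2048 samples of 8,192 tokens takes roughly 2048 x 8192, about 16.8M tokens, so about 16.8M / 30,587, about 549 passes over the split, matching the logged "number of epochs: 549"; VALID_literate-coffeescript further down needs 426 passes for the same reason.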
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: > finished creating indexed dataset in 0.000886 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: number of documents: 22653 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: VALID_mathematica: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: document indices in [21951, 22630) total of 679 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,632 [Rank 0]: > Tokens per epoch: 16838913 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,635 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,635 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:01,637 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002029 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 679 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,639 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001919 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,639 [Rank 0]: > building shuffle index with split [0, 2055) and [2055, 2055) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,641 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001841 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,644 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_mathematica_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,648 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_mathematica_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,652 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_mathematica_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,654 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,654 [Rank 0]: total number of samples: 2056 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,654 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,737 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,738 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: > finished creating indexed dataset in 0.001762 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: number of documents: 158356 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: VALID_ocaml: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: document indices in [153447, 158198) total of 4751 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,739 [Rank 0]: > Tokens per epoch: 9867998 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,742 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,742 [Rank 0]: > last epoch number of samples (844) is smaller than 80% of number of samples per epoch (1204), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:01,745 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002709 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4751 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2409 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,748 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003420 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,749 [Rank 0]: > building shuffle index with split [0, 1204) and [1204, 2409) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,752 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003230 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,793 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_ocaml_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,798 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_ocaml_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,800 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_ocaml_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,802 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,802 [Rank 0]: total number of samples: 2410 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,802 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,885 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: > finished creating indexed dataset in 0.002210 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: number of documents: 657349 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,887 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,888 [Rank 0]: VALID_makefile: +[ip-26-0-150-122:0]:2023-06-21 17:28:01,888 [Rank 0]: document indices in [636971, 656692) total of 19721 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:01,888 [Rank 0]: > Tokens per epoch: 14806733 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,890 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,890 [Rank 0]: > last epoch number of samples (241) is smaller than 80% of number of samples per epoch (1807), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:01,894 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003898 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 19721 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3614 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,897 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002912 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,897 [Rank 0]: > building shuffle index with split [0, 1807) and [1807, 3614) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:01,899 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002130 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,945 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_makefile_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,953 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_makefile_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,953 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_makefile_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:01,954 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:01,954 [Rank 0]: total number of samples: 3615 +[ip-26-0-150-122:0]:2023-06-21 17:28:01,954 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,037 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,038 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,038 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,038 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,038 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: > finished creating indexed dataset in 0.001940 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: number of documents: 549459 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: VALID_lua: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: document indices in [532426, 548910) total of 16484 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,039 [Rank 0]: > Tokens per epoch: 29891276 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,042 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,042 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,046 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003458 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16484 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3648 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,049 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002739 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,049 [Rank 0]: > building shuffle index with split [0, 3648) and [3648, 3648) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,052 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002691 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,052 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_lua_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,059 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_lua_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,065 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_lua_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,068 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,068 [Rank 0]: total number of samples: 3649 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,068 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,151 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: > finished creating indexed dataset in 0.000729 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: number of documents: 1133 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: VALID_literate-coffeescript: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: document indices in [1098, 1132) total of 34 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,152 [Rank 0]: > Tokens per epoch: 39416 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,155 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,155 [Rank 0]: > last epoch number of samples (4) is larger than 80% of number of samples per epoch (4), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,158 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002805 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 34 +[ip-26-0-150-122:0]: number of epochs: 426 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,162 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003502 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,162 [Rank 0]: > building shuffle index with split [0, 2049) and [2049, 2049) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,165 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002926 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,165 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_literate-coffeescript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,172 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_literate-coffeescript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,172 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_literate-coffeescript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,172 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,173 [Rank 0]: total number of samples: 2050 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,173 [Rank 0]: total number of epochs: 426 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,256 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: > finished creating indexed dataset in 0.000713 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: number of documents: 6104 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: VALID_literate-haskell: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: document indices in [5915, 6098) total of 183 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,257 [Rank 0]: > Tokens per epoch: 518557 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,259 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,260 [Rank 0]: > last epoch number of samples (23) is smaller than 80% of number of samples per epoch (63), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:02,262 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002487 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 183 +[ip-26-0-150-122:0]: number of epochs: 33 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2088 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,265 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002833 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,265 [Rank 0]: > building shuffle index with split [0, 2025) and [2025, 2088) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,268 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002635 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,271 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_literate-haskell_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,277 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_literate-haskell_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,282 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_literate-haskell_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,283 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,283 [Rank 0]: total number of samples: 2089 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,283 [Rank 0]: total number of epochs: 33 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,366 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,367 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,367 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,367 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: > finished creating indexed dataset in 0.002059 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: number of documents: 896880 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: VALID_restructuredtext: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: document indices in [869077, 895983) total of 26906 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,368 [Rank 0]: > Tokens per epoch: 31882370 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,371 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,371 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,374 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003130 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 26906 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3891 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,378 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003741 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,378 [Rank 0]: > building shuffle index with split [0, 3891) and [3891, 3891) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,380 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001913 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,380 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_restructuredtext_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,384 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_restructuredtext_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,385 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_restructuredtext_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,387 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,387 [Rank 0]: total number of samples: 3892 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,387 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,470 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: > finished creating indexed dataset in 0.000714 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: number of documents: 3688 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: VALID_racket: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: document indices in [3574, 3684) total of 110 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,471 [Rank 0]: > Tokens per epoch: 233387 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,474 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,474 [Rank 0]: > last epoch number of samples (26) is larger than 80% of number of samples per epoch (28), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,477 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003119 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 110 +[ip-26-0-150-122:0]: number of epochs: 72 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,481 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003143 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,481 [Rank 0]: > building shuffle index with split [0, 2051) and [2051, 2051) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,484 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002894 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,485 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_racket_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,492 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_racket_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,492 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_racket_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,493 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,493 [Rank 0]: total number of samples: 2052 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,493 [Rank 0]: total number of epochs: 72 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,576 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,576 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,576 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,576 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: > finished creating indexed dataset in 0.000879 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: number of documents: 19630 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: VALID_standard-ml: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: document indices in [19021, 19610) total of 589 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,577 [Rank 0]: > Tokens per epoch: 2060914 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,580 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,580 [Rank 0]: > last epoch number of samples (36) is smaller than 80% of number of samples per epoch (251), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:02,583 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002579 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 589 +[ip-26-0-150-122:0]: number of epochs: 9 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2264 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,585 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002657 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,586 [Rank 0]: > building shuffle index with split [0, 2012) and [2012, 2264) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,588 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002530 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,589 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_standard-ml_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,595 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_standard-ml_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,595 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_standard-ml_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,596 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,596 [Rank 0]: total number of samples: 2265 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,596 [Rank 0]: total number of epochs: 9 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,679 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: > finished creating indexed dataset in 0.001124 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: number of documents: 46270 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: VALID_systemverilog: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: document indices in [44836, 46224) total of 1388 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,680 [Rank 0]: > Tokens per epoch: 4206961 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,682 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,682 [Rank 0]: > last epoch number of samples (508) is larger than 80% of number of samples per epoch (513), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,685 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002397 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1388 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,687 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002331 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,688 [Rank 0]: > building shuffle index with split [0, 2054) and [2054, 2054) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,689 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001858 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,693 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_systemverilog_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,698 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_systemverilog_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,698 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_systemverilog_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,699 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,699 [Rank 0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,699 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,781 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,783 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,783 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,783 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,783 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: > finished creating indexed dataset in 0.002254 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: number of documents: 522778 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: VALID_tex: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: document indices in [506572, 522255) total of 15683 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,784 [Rank 0]: > Tokens per epoch: 56256264 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,786 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,786 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,789 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002800 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15683 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6867 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,791 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002120 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,791 [Rank 0]: > building shuffle index with split [0, 6867) and [6867, 6867) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,793 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002073 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,798 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_tex_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,802 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_tex_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,803 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_tex_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,803 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,803 [Rank 0]: total number of samples: 6868 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,803 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,886 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: > finished creating indexed dataset in 0.000829 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: number of documents: 10289 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: VALID_awk: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: document indices in [9970, 10279) total of 309 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,887 [Rank 0]: > Tokens per epoch: 224077 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,889 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,889 [Rank 0]: > last epoch number of samples (24) is larger than 80% of number of samples per epoch (27), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,892 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002709 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 309 +[ip-26-0-150-122:0]: number of epochs: 75 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,894 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002155 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,895 [Rank 0]: > building shuffle index with split [0, 2051) and [2051, 2051) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,898 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003301 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,901 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_awk_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,908 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_awk_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,908 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_awk_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:02,909 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,909 [Rank 0]: total number of samples: 2052 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,909 [Rank 0]: total number of epochs: 75 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,991 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: > finished creating indexed dataset in 0.002246 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: number of documents: 247919 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: VALID_assembly: +[ip-26-0-150-122:0]:2023-06-21 17:28:02,993 [Rank 0]: document indices in [240234, 247671) total of 7437 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:02,994 [Rank 0]: > Tokens per epoch: 23244839 +[ip-26-0-150-122:0]:2023-06-21 17:28:02,996 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:02,996 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:02,999 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002417 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7437 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2837 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,002 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003104 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,002 [Rank 0]: > building shuffle index with split [0, 2837) and [2837, 2837) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,005 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003012 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,008 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_assembly_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,013 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_assembly_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,013 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_assembly_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,013 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,014 [Rank 0]: total number of samples: 2838 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,014 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,096 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,096 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,096 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: > finished creating indexed dataset in 0.000723 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: number of documents: 5368 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: VALID_alloy: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: document indices in [5202, 5363) total of 161 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,097 [Rank 0]: > Tokens per epoch: 60505 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,100 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,100 [Rank 0]: > last epoch number of samples (3) is smaller than 80% of number of samples per epoch (7), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,104 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003553 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 161 +[ip-26-0-150-122:0]: number of epochs: 278 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2053 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,106 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002137 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,106 [Rank 0]: > building shuffle index with split [0, 2045) and [2045, 2053) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,109 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003112 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,116 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_alloy_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,123 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_alloy_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,124 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_alloy_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,124 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,124 [Rank 0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,124 [Rank 0]: total number of epochs: 278 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,207 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,207 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: > finished creating indexed dataset in 0.000798 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: number of documents: 17554 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: VALID_agda: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: document indices in [17010, 17536) total of 526 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,208 [Rank 0]: > Tokens per epoch: 791611 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,210 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,210 [Rank 0]: > last epoch number of samples (19) is smaller than 80% of number of samples per epoch (96), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,214 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003379 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 526 +[ip-26-0-150-122:0]: number of epochs: 22 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2125 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,217 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003273 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,217 [Rank 0]: > building shuffle index with split [0, 2029) and [2029, 2125) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,221 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003384 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,221 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_agda_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,227 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_agda_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,228 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_agda_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,231 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,231 [Rank 0]: total number of samples: 2126 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,231 [Rank 0]: total number of epochs: 22 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,313 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: > finished creating indexed dataset in 0.001096 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,314 [Rank 0]: number of documents: 52838 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,315 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,315 [Rank 0]: VALID_emacs-lisp: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,315 [Rank 0]: document indices in [51200, 52785) total of 1585 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,315 [Rank 0]: > Tokens per epoch: 3599819 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,317 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,317 [Rank 0]: > last epoch number of samples (291) is smaller than 80% of number of samples per epoch (439), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,320 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002774 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1585 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2197 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,323 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002762 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,323 [Rank 0]: > building shuffle index with split [0, 1757) and [1757, 2197) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,326 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002636 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,326 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_emacs-lisp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,331 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_emacs-lisp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,331 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_emacs-lisp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,332 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,332 [Rank 0]: total number of samples: 2198 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,332 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,414 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: > finished creating indexed dataset in 0.002377 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: number of documents: 928415 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: VALID_dart: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,417 [Rank 0]: document indices in [899634, 927487) total of 27853 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,418 [Rank 0]: > Tokens per epoch: 27319085 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,420 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,420 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:03,423 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003215 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 27853 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3334 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,426 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002939 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,427 [Rank 0]: > building shuffle index with split [0, 3334) and [3334, 3334) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,429 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002031 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,433 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_dart_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,438 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_dart_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,439 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_dart_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,440 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,440 [Rank 0]: total number of samples: 3335 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,440 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,523 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: > finished creating indexed dataset in 0.001148 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: number of documents: 58151 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: VALID_cuda: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: document indices in [56348, 58093) total of 1745 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,524 [Rank 0]: > Tokens per epoch: 5481832 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,527 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,527 [Rank 0]: > last epoch number of samples (41) is smaller than 80% of number of samples per epoch (669), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,530 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002944 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1745 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2676 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,533 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002695 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,533 [Rank 0]: > building shuffle index with split [0, 2007) and [2007, 2676) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,535 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002061 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,537 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_cuda_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,545 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_cuda_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,545 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_cuda_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,545 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,546 [Rank 0]: total number of samples: 2677 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,546 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,628 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,628 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,628 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: > finished creating indexed dataset in 0.000731 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: number of documents: 5928 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: VALID_bluespec: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: document indices in [5744, 5922) total of 178 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,629 [Rank 0]: > Tokens per epoch: 389178 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,631 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,631 [Rank 0]: > last epoch number of samples (6) is smaller than 80% of number of samples per epoch (47), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,634 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002778 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 178 +[ip-26-0-150-122:0]: number of epochs: 44 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2090 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,636 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002164 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,636 [Rank 0]: > building shuffle index with split [0, 2042) and [2042, 2090) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,639 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003067 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,642 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_bluespec_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,646 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_bluespec_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,647 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_bluespec_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,647 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,647 [Rank 0]: total number of samples: 2091 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,647 [Rank 0]: total number of epochs: 44 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,730 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,730 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,730 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,730 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: > finished creating indexed dataset in 0.000697 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: number of documents: 180 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: VALID_augeas: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: document indices in [174, 180) total of 6 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,731 [Rank 0]: > Tokens per epoch: 7815 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,735 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,735 [Rank 0]: > last epoch number of samples (1) is larger than 80% of number of samples per epoch (0), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:03,738 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003106 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6 +[ip-26-0-150-122:0]: number of epochs: 2147 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,742 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003817 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,742 [Rank 0]: > building shuffle index with split [0, 2048) and [2048, 2048) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,744 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001868 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,745 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_augeas_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,752 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_augeas_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,756 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_augeas_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,757 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,757 [Rank 0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,757 [Rank 0]: total number of epochs: 2147 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,840 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: > finished creating indexed dataset in 0.002311 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: number of documents: 239568 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,842 [Rank 0]: VALID_batchfile: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,843 [Rank 0]: document indices in [232141, 239328) total of 7187 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,843 [Rank 0]: > Tokens per epoch: 3729565 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,845 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,846 [Rank 0]: > last epoch number of samples (227) is smaller than 80% of number of samples per epoch (455), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,849 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003623 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7187 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2276 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,852 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003036 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,853 [Rank 0]: > building shuffle index with split [0, 1821) and [1821, 2276) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,855 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002944 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,856 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_batchfile_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,863 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_batchfile_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,868 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_batchfile_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,868 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,868 [Rank 0]: total number of samples: 2277 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,868 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,951 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: > finished creating indexed dataset in 0.000754 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: number of documents: 4806 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: VALID_tcsh: +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: document indices in [4657, 4801) total of 144 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:03,952 [Rank 0]: > Tokens per epoch: 118601 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,955 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,955 [Rank 0]: > last epoch number of samples (7) is smaller than 80% of number of samples per epoch (14), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:03,959 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003267 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 144 +[ip-26-0-150-122:0]: number of epochs: 142 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,962 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003060 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,962 [Rank 0]: > building shuffle index with split [0, 2041) and [2041, 2055) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:03,965 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003306 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,966 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_tcsh_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,973 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_tcsh_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,979 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_tcsh_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:03,985 [Rank 0]: loaded indexed file in 0.019 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:03,985 [Rank 0]: total number of samples: 2056 +[ip-26-0-150-122:0]:2023-06-21 17:28:03,985 [Rank 0]: total number of epochs: 142 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,068 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,068 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: > finished creating indexed dataset in 0.000722 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: number of documents: 5429 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: VALID_stan: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: document indices in [5261, 5424) total of 163 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,069 [Rank 0]: > Tokens per epoch: 146349 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,071 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,071 [Rank 0]: > last epoch number of samples (12) is smaller than 80% of number of samples per epoch (17), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,075 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003399 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 163 +[ip-26-0-150-122:0]: number of epochs: 115 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,079 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003582 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,079 [Rank 0]: > building shuffle index with split [0, 2036) and [2036, 2054) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,081 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001931 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,081 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_stan_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,086 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_stan_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,086 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_stan_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,088 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,088 [Rank 0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,088 [Rank 0]: total number of epochs: 115 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,171 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,173 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: > finished creating indexed dataset in 0.002231 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: number of documents: 1355788 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: VALID_scala: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,174 [Rank 0]: document indices in [1313759, 1354432) total of 40673 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,175 [Rank 0]: > Tokens per epoch: 38836780 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,176 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,177 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:04,180 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003434 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 40673 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4740 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,184 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003736 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,184 [Rank 0]: > building shuffle index with split [0, 4740) and [4740, 4740) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,187 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003390 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,188 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_scala_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,196 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_scala_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,197 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_scala_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,197 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,197 [Rank 0]: total number of samples: 4741 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,197 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,280 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: > finished creating indexed dataset in 0.001043 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: number of documents: 49335 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: VALID_tcl: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: document indices in [47806, 49286) total of 1480 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,281 [Rank 0]: > Tokens per epoch: 3611088 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,283 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,284 [Rank 0]: > last epoch number of samples (285) is smaller than 80% of number of samples per epoch (440), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,287 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002987 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1480 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2204 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,290 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003655 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,291 [Rank 0]: > building shuffle index with split [0, 1763) and [1763, 2204) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,294 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003071 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,296 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_tcl_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,301 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_tcl_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,301 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_tcl_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,302 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,302 [Rank 0]: total number of samples: 2205 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,302 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,384 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: > finished creating indexed dataset in 0.000846 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: number of documents: 24208 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: VALID_stata: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,385 [Rank 0]: document indices in [23458, 24184) total of 726 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,386 [Rank 0]: > Tokens per epoch: 5577566 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,388 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,388 [Rank 0]: > last epoch number of samples (6) is smaller than 80% of number of samples per epoch (680), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,390 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002375 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 726 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2723 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,394 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003521 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,394 [Rank 0]: > building shuffle index with split [0, 2042) and [2042, 2723) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,398 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003567 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,403 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_stata_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,410 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_stata_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,415 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_stata_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,415 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,415 [Rank 0]: total number of samples: 2724 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,415 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,498 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,498 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: > finished creating indexed dataset in 0.000723 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: number of documents: 4737 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: VALID_applescript: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: document indices in [4590, 4732) total of 142 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,499 [Rank 0]: > Tokens per epoch: 63420 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,502 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,502 [Rank 0]: > last epoch number of samples (5) is larger than 80% of number of samples per epoch (7), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:04,506 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003609 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 142 +[ip-26-0-150-122:0]: number of epochs: 265 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,509 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002748 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,509 [Rank 0]: > building shuffle index with split [0, 2051) and [2051, 2051) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,511 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002564 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,512 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_applescript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,519 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_applescript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,519 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_applescript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,520 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,520 [Rank 0]: total number of samples: 2052 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,520 [Rank 0]: total number of epochs: 265 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,603 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: > finished creating indexed dataset in 0.002281 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: number of documents: 2206327 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: VALID_shell: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,605 [Rank 0]: document indices in [2137931, 2204121) total of 66190 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,606 [Rank 0]: > Tokens per epoch: 31891052 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,608 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,608 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:04,613 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004554 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 66190 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3892 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,616 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003262 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,616 [Rank 0]: > building shuffle index with split [0, 3892) and [3892, 3892) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,620 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003659 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,620 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_shell_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,626 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_shell_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,626 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_shell_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,627 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,627 [Rank 0]: total number of samples: 3893 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,627 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,709 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: > finished creating indexed dataset in 0.001496 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: number of documents: 125163 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: VALID_clojure: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: document indices in [121283, 125038) total of 3755 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,711 [Rank 0]: > Tokens per epoch: 3837021 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,714 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,714 [Rank 0]: > last epoch number of samples (175) is smaller than 80% of number of samples per epoch (468), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,718 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003604 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3755 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2341 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,721 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002620 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,721 [Rank 0]: > building shuffle index with split [0, 1873) and [1873, 2341) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,723 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002258 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,775 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_clojure_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,782 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_clojure_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,783 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_clojure_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,783 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,783 [Rank 0]: total number of samples: 2342 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,783 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,866 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: > finished creating indexed dataset in 0.000991 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: number of documents: 41890 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: VALID_scheme: +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: document indices in [40591, 41848) total of 1257 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:04,867 [Rank 0]: > Tokens per epoch: 2017219 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,870 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,870 [Rank 0]: > last epoch number of samples (79) is smaller than 80% of number of samples per epoch (246), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:04,873 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003488 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1257 +[ip-26-0-150-122:0]: number of epochs: 9 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2216 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,876 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002666 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,876 [Rank 0]: > building shuffle index with split [0, 1969) and [1969, 2216) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:04,879 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002155 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,927 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_scheme_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,932 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_scheme_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,936 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_scheme_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:04,939 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:04,939 [Rank 0]: total number of samples: 2217 +[ip-26-0-150-122:0]:2023-06-21 17:28:04,939 [Rank 0]: total number of epochs: 9 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,022 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: > finished creating indexed dataset in 0.000759 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: number of documents: 7917 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: VALID_antlr: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: document indices in [7672, 7909) total of 237 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,023 [Rank 0]: > Tokens per epoch: 1102148 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,026 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,026 [Rank 0]: > last epoch number of samples (30) is smaller than 80% of number of samples per epoch (134), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,030 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003216 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 237 +[ip-26-0-150-122:0]: number of epochs: 16 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2152 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,032 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002051 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,032 [Rank 0]: > building shuffle index with split [0, 2018) and [2018, 2152) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,036 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003610 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,067 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_antlr_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,075 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_antlr_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,075 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_antlr_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,076 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,076 [Rank 0]: total number of samples: 2153 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,076 [Rank 0]: total number of epochs: 16 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,159 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,159 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,159 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,159 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: > finished creating indexed dataset in 0.000803 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: number of documents: 13716 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: VALID_sparql: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: document indices in [13291, 13702) total of 411 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,160 [Rank 0]: > Tokens per epoch: 465467 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,162 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,163 [Rank 0]: > last epoch number of samples (3) is smaller than 80% of number of samples per epoch (56), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,166 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003404 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 411 +[ip-26-0-150-122:0]: number of epochs: 37 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2102 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,169 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002594 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,169 [Rank 0]: > building shuffle index with split [0, 2045) and [2045, 2102) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,172 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002611 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,172 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_sparql_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,177 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_sparql_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,220 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_sparql_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,225 [Rank 0]: loaded indexed file in 0.053 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,225 [Rank 0]: total number of samples: 2103 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,225 [Rank 0]: total number of epochs: 37 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,300 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,301 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: > finished creating indexed dataset in 0.001526 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: number of documents: 975420 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: VALID_sql: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: document indices in [945182, 974445) total of 29263 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,302 [Rank 0]: > Tokens per epoch: 164859090 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,305 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,305 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:05,308 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003180 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 29263 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 20124 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,311 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002947 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,311 [Rank 0]: > building shuffle index with split [0, 20124) and [20124, 20124) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,315 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003353 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,315 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_sql_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,322 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_sql_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,323 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_sql_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,323 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,323 [Rank 0]: total number of samples: 20125 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,323 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,405 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: creating memory view of numpy buffer... 
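Note: VALID_sql is the opposite case: one pass over the split already contains more full 8192-token sequences than the 2048 requested, so a single epoch suffices and no separate last-epoch shuffling is needed. A quick check of the numbers reported above, using the same simplified arithmetic as before:

```python
# VALID_sql numbers from the log.
tokens_per_epoch, seq_length, requested = 164_859_090, 8192, 2048

samples_in_one_pass = (tokens_per_epoch - 1) // seq_length
print(samples_in_one_pass)               # 20124 -> matches "total number of samples: 20124"
print(samples_in_one_pass >= requested)  # True  -> "only one epoch required"
```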
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: > finished creating indexed dataset in 0.001858 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: number of documents: 167701 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: VALID_glsl: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: document indices in [162502, 167533) total of 5031 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,407 [Rank 0]: > Tokens per epoch: 5272081 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,410 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,410 [Rank 0]: > last epoch number of samples (118) is smaller than 80% of number of samples per epoch (643), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,413 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003568 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5031 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2574 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,418 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004110 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,418 [Rank 0]: > building shuffle index with split [0, 1930) and [1930, 2574) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,423 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004795 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,423 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_glsl_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,428 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_glsl_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,429 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_glsl_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,429 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,429 [Rank 0]: total number of samples: 2575 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,429 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,512 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: > finished creating indexed dataset in 0.001139 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: number of documents: 62033 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: VALID_elm: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,513 [Rank 0]: document indices in [60110, 61971) total of 1861 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,514 [Rank 0]: > Tokens per epoch: 2205938 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,516 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,516 [Rank 0]: > last epoch number of samples (164) is smaller than 80% of number of samples per epoch (269), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,519 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003202 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1861 +[ip-26-0-150-122:0]: number of epochs: 8 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2154 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,523 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003124 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,523 [Rank 0]: > building shuffle index with split [0, 1884) and [1884, 2154) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,525 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002546 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,529 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_elm_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,537 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_elm_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,537 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_elm_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,538 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,538 [Rank 0]: total number of samples: 2155 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,538 [Rank 0]: total number of epochs: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,620 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,622 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: > finished creating indexed dataset in 0.001971 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: number of documents: 571506 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: VALID_dockerfile: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: document indices in [553789, 570934) total of 17145 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,623 [Rank 0]: > Tokens per epoch: 4375164 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,625 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,625 [Rank 0]: > last epoch number of samples (446) is larger than 80% of number of samples per epoch (534), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:05,630 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004376 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17145 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2136 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,632 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002386 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,633 [Rank 0]: > building shuffle index with split [0, 2136) and [2136, 2136) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,636 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003057 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,636 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_dockerfile_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,641 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_dockerfile_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,642 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_dockerfile_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,642 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,642 [Rank 0]: total number of samples: 2137 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,642 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,726 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: creating memory view of numpy buffer... 
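Note: VALID_dockerfile shows the third variant of the heuristic: four passes are needed, but the final pass still contributes 446 of the 534 samples a full pass would give, which is above the 80% threshold, so separate_last_epoch stays False and the shuffle index covers a single range [0, 2136). Re-deriving those figures with the same simplified sketch as above:

```python
# VALID_dockerfile numbers from the log.
tokens_per_epoch, seq_length, requested = 4_375_164, 8192, 2048

epochs = -(-(requested * seq_length + 1) // tokens_per_epoch)                    # ceil division -> 4
samples_per_epoch = (tokens_per_epoch - 1) // seq_length                         # 534
samples_from_first_epochs = ((epochs - 1) * tokens_per_epoch - 1) // seq_length  # 1602
last_epoch_samples = requested - samples_from_first_epochs                       # 446
print(last_epoch_samples < int(0.80 * samples_per_epoch))                        # False -> one shuffle range
```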
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: > finished creating indexed dataset in 0.002331 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: number of documents: 6353527 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,728 [Rank 0]: VALID_cpp: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,729 [Rank 0]: document indices in [6156568, 6347173) total of 190605 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,730 [Rank 0]: > Tokens per epoch: 476705041 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,732 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,733 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:05,741 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007958 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 190605 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 58191 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,745 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004300 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,745 [Rank 0]: > building shuffle index with split [0, 58191) and [58191, 58191) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,750 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005185 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,751 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_cpp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,759 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_cpp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,760 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_cpp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,760 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,761 [Rank 0]: total number of samples: 58192 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,761 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,844 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,845 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,845 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: > finished creating indexed dataset in 0.001933 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: number of documents: 226209 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: VALID_coffeescript: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: document indices in [219197, 225983) total of 6786 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,846 [Rank 0]: > Tokens per epoch: 5560129 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,849 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,849 [Rank 0]: > last epoch number of samples (12) is smaller than 80% of number of samples per epoch (678), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:05,853 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003625 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6786 +[ip-26-0-150-122:0]: number of epochs: 4 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2714 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,856 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002911 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,856 [Rank 0]: > building shuffle index with split [0, 2036) and [2036, 2714) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,858 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002097 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,861 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_coffeescript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,870 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_coffeescript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,871 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_coffeescript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:05,871 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,871 [Rank 0]: total number of samples: 2715 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,871 [Rank 0]: total number of epochs: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,954 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,955 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,955 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,955 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: > finished creating indexed dataset in 0.001300 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: number of documents: 98733 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: VALID_common-lisp: +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: document indices in [95672, 98634) total of 2962 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:05,956 [Rank 0]: > Tokens per epoch: 16829467 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,958 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,958 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:05,960 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002123 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2962 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,962 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001976 +[ip-26-0-150-122:0]:2023-06-21 17:28:05,963 [Rank 0]: > building shuffle index with split [0, 2054) and [2054, 2054) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:05,965 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002257 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,014 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_common-lisp_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,019 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_common-lisp_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,019 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_common-lisp_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,021 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,021 [Rank 0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,021 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,104 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,106 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,106 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: > finished creating indexed dataset in 0.002294 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: number of documents: 281016 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: VALID_elixir: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: document indices in [272305, 280735) total of 8430 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,107 [Rank 0]: > Tokens per epoch: 7046176 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,110 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,110 [Rank 0]: > last epoch number of samples (328) is smaller than 80% of number of samples per epoch (860), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:06,114 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003461 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8430 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2580 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,117 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003002 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,117 [Rank 0]: > building shuffle index with split [0, 1720) and [1720, 2580) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,119 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002223 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,120 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_elixir_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,125 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_elixir_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,126 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_elixir_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,126 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,126 [Rank 0]: total number of samples: 2581 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,126 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,210 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,211 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: > finished creating indexed dataset in 0.002243 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: number of documents: 250834 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: VALID_groovy: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: document indices in [243058, 250583) total of 7525 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,212 [Rank 0]: > Tokens per epoch: 7066083 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,215 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,215 [Rank 0]: > last epoch number of samples (323) is smaller than 80% of number of samples per epoch (862), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:06,219 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003078 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7525 +[ip-26-0-150-122:0]: number of epochs: 3 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2587 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,222 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002899 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,222 [Rank 0]: > building shuffle index with split [0, 1725) and [1725, 2587) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,225 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003019 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,227 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_groovy_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,232 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_groovy_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,232 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_groovy_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,232 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,233 [Rank 0]: total number of samples: 2588 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,233 [Rank 0]: total number of epochs: 3 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,315 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,317 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,317 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,317 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,317 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: > finished creating indexed dataset in 0.002011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: number of documents: 3299965 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: VALID_html: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,318 [Rank 0]: document indices in [3197666, 3296665) total of 98999 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,319 [Rank 0]: > Tokens per epoch: 293479485 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,322 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,322 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,328 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005949 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 98999 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 35825 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,331 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003183 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,331 [Rank 0]: > building shuffle index with split [0, 35825) and [35825, 35825) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,335 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003513 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,380 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_html_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,390 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_html_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,390 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_html_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,391 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,391 [Rank 0]: total number of samples: 35826 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,391 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,474 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,476 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,476 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,476 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,476 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: creating memory view of numpy buffer... 
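Note: each split caches its three index maps as .npy files whose names encode the request, e.g. ..._VALID_html_indexmap_2048ns_8192sl_1234s_doc_idx.npy means 2048 samples, sequence length 8192, shuffle seed 1234. Below is a small helper reconstructing that naming scheme from the paths observed in this log; it is an inferred convention, not code taken from the training repository:

```python
def index_map_paths(data_prefix: str, split_name: str, num_samples: int,
                    seq_length: int, seed: int) -> dict:
    # <data_prefix>_<split>_indexmap_<N>ns_<L>sl_<seed>s_{doc,sample,shuffle}_idx.npy
    stem = f"{data_prefix}_{split_name}_indexmap_{num_samples}ns_{seq_length}sl_{seed}s"
    return {kind: f"{stem}_{kind}_idx.npy" for kind in ("doc", "sample", "shuffle")}


paths = index_map_paths(
    "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document",
    "VALID_html", 2048, 8192, 1234)
print(paths["doc"])  # matches the doc-idx path loaded in the VALID_html block above
```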
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: > finished creating indexed dataset in 0.002271 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: number of documents: 20071773 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: VALID_java: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,477 [Rank 0]: document indices in [19449548, 20051701) total of 602153 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,481 [Rank 0]: > Tokens per epoch: 679829501 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,483 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,483 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,508 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.024745 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 602153 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 82986 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,514 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005701 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,514 [Rank 0]: > building shuffle index with split [0, 82986) and [82986, 82986) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,518 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004076 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,543 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_java_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,555 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_java_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,559 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_java_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,559 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,559 [Rank 0]: total number of samples: 82987 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,559 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,642 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,644 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: > finished creating indexed dataset in 0.002342 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: number of documents: 19544285 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: VALID_javascript: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,645 [Rank 0]: document indices in [18938412, 19524741) total of 586329 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,649 [Rank 0]: > Tokens per epoch: 565628573 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,652 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,652 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,674 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.021904 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 586329 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 69046 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,680 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005604 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,680 [Rank 0]: > building shuffle index with split [0, 69046) and [69046, 69046) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,683 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003501 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,710 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_javascript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,722 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_javascript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,722 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_javascript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,723 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,723 [Rank 0]: total number of samples: 69047 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,723 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,805 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: > finished creating indexed dataset in 0.002514 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: number of documents: 21029287 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: VALID_markdown: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,808 [Rank 0]: document indices in [20377379, 21008258) total of 630879 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,812 [Rank 0]: > Tokens per epoch: 765105610 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,815 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,815 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,838 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.022965 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 630879 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 93396 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,845 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006653 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,845 [Rank 0]: > building shuffle index with split [0, 93396) and [93396, 93396) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,850 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.005421 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,870 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_markdown_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,883 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_markdown_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,883 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_markdown_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:06,884 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,884 [Rank 0]: total number of samples: 93397 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,884 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,967 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: > finished creating indexed dataset in 0.002205 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:06,969 [Rank 0]: number of documents: 15683017 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,970 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,970 [Rank 0]: VALID_php: +[ip-26-0-150-122:0]:2023-06-21 17:28:06,970 [Rank 0]: document indices in [15196843, 15667334) total of 470491 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:06,973 [Rank 0]: > Tokens per epoch: 512566580 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,976 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:06,976 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:06,994 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.017977 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 470491 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 62569 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,998 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004299 +[ip-26-0-150-122:0]:2023-06-21 17:28:06,998 [Rank 0]: > building shuffle index with split [0, 62569) and [62569, 62569) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,002 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004163 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,006 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_php_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,018 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_php_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,019 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_php_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,020 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,020 [Rank 0]: total number of samples: 62570 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,020 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,103 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: > finished creating indexed dataset in 0.002205 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: number of documents: 12866649 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: VALID_python: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,105 [Rank 0]: document indices in [12467783, 12853782) total of 385999 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,108 [Rank 0]: > Tokens per epoch: 529606827 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,111 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,111 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,126 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.015276 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 385999 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 64649 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,131 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004516 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,131 [Rank 0]: > building shuffle index with split [0, 64649) and [64649, 64649) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,134 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003384 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,135 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_python_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,145 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_python_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,146 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_python_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,146 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,146 [Rank 0]: total number of samples: 64650 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,146 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,230 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: > finished creating indexed dataset in 0.002286 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: number of documents: 10547331 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: VALID_typescript: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,232 [Rank 0]: document indices in [10220364, 10536784) total of 316420 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,234 [Rank 0]: > Tokens per epoch: 222078157 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,237 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,237 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,250 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.012563 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 316420 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 27109 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,254 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003621 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,254 [Rank 0]: > building shuffle index with split [0, 27109) and [27109, 27109) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,258 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003985 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,258 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_typescript_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,268 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_typescript_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,273 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_typescript_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,278 [Rank 0]: loaded indexed file in 0.020 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,278 [Rank 0]: total number of samples: 27110 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,278 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,361 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: > finished creating indexed dataset in 0.000719 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: number of documents: 75 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: VALID_verilog: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,362 [Rank 0]: document indices in [73, 75) total of 2 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,363 [Rank 0]: > Tokens per epoch: 5184 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,365 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,366 [Rank 0]: > last epoch number of samples (1) is larger than 80% of number of samples per epoch (0), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,368 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002391 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2 +[ip-26-0-150-122:0]: number of epochs: 3237 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,370 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001837 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,370 [Rank 0]: > building shuffle index with split [0, 2048) and [2048, 2048) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,373 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002986 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,373 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_verilog_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,378 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_verilog_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,378 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_verilog_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,379 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,379 [Rank 0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,379 [Rank 0]: total number of epochs: 3237 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,462 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,463 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: creating memory view of numpy buffer... 
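Note: VALID_verilog is the degenerate end of the spectrum: the split holds only 2 documents (5,184 tokens), so a single pass yields zero full 8192-token samples and 3,237 passes are concatenated before 2048 samples exist; since 80% of 0 is 0, the last epoch is never treated separately. The same simplified arithmetic applied to these numbers:

```python
# VALID_verilog numbers from the log.
tokens_per_epoch, seq_length, requested = 5_184, 8192, 2048

epochs = -(-(requested * seq_length + 1) // tokens_per_epoch)  # ceil -> 3237
samples_per_epoch = (tokens_per_epoch - 1) // seq_length       # 0 full samples per pass
print(epochs, samples_per_epoch)                               # 3237 0
```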
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: > finished creating indexed dataset in 0.001833 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: number of documents: 161239 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: VALID_visual-basic: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: document indices in [156241, 161078) total of 4837 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,464 [Rank 0]: > Tokens per epoch: 11401469 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,467 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,467 [Rank 0]: > last epoch number of samples (657) is smaller than 80% of number of samples per epoch (1391), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:07,471 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002988 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4837 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2783 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,473 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002198 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,473 [Rank 0]: > building shuffle index with split [0, 1391) and [1391, 2783) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,476 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002656 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,526 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_visual-basic_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,530 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_visual-basic_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,531 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_visual-basic_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,531 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,532 [Rank 0]: total number of samples: 2784 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,532 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,615 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: > finished creating indexed dataset in 0.001218 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: number of documents: 58208 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: VALID_vhdl: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,616 [Rank 0]: document indices in [56404, 58150) total of 1746 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,617 [Rank 0]: > Tokens per epoch: 12008501 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,619 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,619 [Rank 0]: > last epoch number of samples (583) is smaller than 80% of number of samples per epoch (1465), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:07,622 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002933 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1746 +[ip-26-0-150-122:0]: number of epochs: 2 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2931 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,625 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002862 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,625 [Rank 0]: > building shuffle index with split [0, 1465) and [1465, 2931) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,627 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001572 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,632 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_vhdl_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,638 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_vhdl_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,639 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_vhdl_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,639 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,639 [Rank 0]: total number of samples: 2932 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,639 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,722 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: creating memory view of numpy buffer... 
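The "larger/smaller than 80% of number of samples per epoch" messages decide whether the final epoch's samples are shuffled separately from the full epochs, presumably so that a small tail is not over-represented. The sketch below reproduces the logged decisions; the 80% threshold and the quantities are read off the log messages, while the function itself is my reconstruction:

    def separate_last_epoch(tokens_per_epoch: int, num_epochs: int,
                            num_samples: int, seq_len: int = 8192) -> bool:
        if num_epochs == 1:
            return False  # "only one epoch required" in the log
        samples_per_epoch = (tokens_per_epoch - 1) // seq_len
        samples_from_full_epochs = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_len
        last_epoch_samples = num_samples - samples_from_full_epochs
        # Shuffle the tail separately only when it is well under a full epoch.
        return last_epoch_samples < 0.80 * samples_per_epoch

    # VALID_visual-basic above: 657 < 80% of 1391 -> True;  VALID_vhdl above: 583 < 80% of 1465 -> True.
    assert separate_last_epoch(11_401_469, 2, 2_048) is True
    assert separate_last_epoch(12_008_501, 2, 2_048) is True
    # VALID_yacc further down: 120 >= 80% of 137 -> False.
    assert separate_last_epoch(1_128_407, 15, 2_048) is False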
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: > finished creating indexed dataset in 0.000704 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: number of documents: 4661 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: VALID_thrift: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,723 [Rank 0]: document indices in [4517, 4656) total of 139 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,724 [Rank 0]: > Tokens per epoch: 98302 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,726 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,727 [Rank 0]: > last epoch number of samples (9) is larger than 80% of number of samples per epoch (11), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,730 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003812 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 139 +[ip-26-0-150-122:0]: number of epochs: 171 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2051 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,733 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002584 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,733 [Rank 0]: > building shuffle index with split [0, 2051) and [2051, 2051) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,735 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002088 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,784 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_thrift_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,790 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_thrift_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,790 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_thrift_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,791 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,791 [Rank 0]: total number of samples: 2052 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,791 [Rank 0]: total number of epochs: 171 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,875 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: > finished creating indexed dataset in 0.000680 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: number of documents: 93 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: VALID_matlab: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: document indices in [90, 93) total of 3 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,876 [Rank 0]: > Tokens per epoch: 4277 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,879 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,880 [Rank 0]: > last epoch number of samples (1) is larger than 80% of number of samples per epoch (0), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,883 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003370 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3 +[ip-26-0-150-122:0]: number of epochs: 3923 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2048 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,886 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003336 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,887 [Rank 0]: > building shuffle index with split [0, 2048) and [2048, 2048) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,889 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002569 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,890 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_matlab_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,895 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_matlab_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,895 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_matlab_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:07,896 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,896 [Rank 0]: total number of samples: 2049 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,896 [Rank 0]: total number of epochs: 3923 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,979 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,979 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: > finished creating indexed dataset in 0.000772 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: number of documents: 7451 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: VALID_yacc: +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: document indices in [7220, 7444) total of 224 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:07,980 [Rank 0]: > Tokens per epoch: 1128407 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,982 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,982 [Rank 0]: > last epoch number of samples (120) is larger than 80% of number of samples per epoch (137), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:07,985 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002760 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 224 +[ip-26-0-150-122:0]: number of epochs: 15 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2066 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,989 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003140 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,989 [Rank 0]: > building shuffle index with split [0, 2066) and [2066, 2066) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:07,991 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002217 +[ip-26-0-150-122:0]:2023-06-21 17:28:07,995 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_yacc_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,002 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_yacc_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,005 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_yacc_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,006 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,006 [Rank 0]: total number of samples: 2067 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,006 [Rank 0]: total number of epochs: 15 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,089 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: > finished creating indexed dataset in 0.000813 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: number of documents: 15850 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,090 [Rank 0]: VALID_zig: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,091 [Rank 0]: document indices in [15359, 15834) total of 475 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,091 [Rank 0]: > Tokens per epoch: 2144189 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,093 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,093 [Rank 0]: > last epoch number of samples (216) is larger than 80% of number of samples per epoch (261), setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,095 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002125 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 475 +[ip-26-0-150-122:0]: number of epochs: 8 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2093 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,099 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003680 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,099 [Rank 0]: > building shuffle index with split [0, 2093) and [2093, 2093) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,103 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003473 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,103 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_zig_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,110 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_zig_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,110 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_zig_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,111 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,111 [Rank 0]: total number of samples: 2094 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,111 [Rank 0]: total number of epochs: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,194 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: > finished creating indexed dataset in 0.000978 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: number of documents: 42103 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,195 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,196 [Rank 0]: VALID_xslt: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,196 [Rank 0]: document indices in [40798, 42061) total of 1263 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,196 [Rank 0]: > Tokens per epoch: 4166294 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,198 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,199 [Rank 0]: > last epoch number of samples (14) is smaller than 80% of number of samples per epoch (508), setting separate_last_epoch to True +[ip-26-0-150-122:0]:2023-06-21 17:28:08,201 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002378 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1263 +[ip-26-0-150-122:0]: number of epochs: 5 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2542 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,204 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002669 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,204 [Rank 0]: > building shuffle index with split [0, 2034) and [2034, 2542) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,207 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002728 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,208 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_xslt_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,213 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_xslt_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,213 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_xslt_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,214 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,214 [Rank 0]: total number of samples: 2543 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,214 [Rank 0]: total number of epochs: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,297 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,299 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,299 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,299 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,299 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: creating memory view of numpy buffer... 
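The "building shuffle index with split [a, b) and [b, c)" lines reflect the same decision: when separate_last_epoch is False the second range is empty (e.g. [0, 2066) and [2066, 2066) for yacc), and when it is True the samples from the full epochs and the 508 samples of the final epoch are shuffled as two independent ranges (the [0, 2034) and [2034, 2542) split for VALID_xslt above, of which only 14 last-epoch samples are actually needed to reach 2048). A minimal sketch of such a two-range shuffle, written from the logged ranges rather than copied from the training code (names and the seed default are mine; the cached file names do record seed 1234):

    import numpy as np

    def build_shuffle_idx(first_range_size: int, total_size: int, seed: int = 1234) -> np.ndarray:
        """Shuffle [0, first_range_size) and [first_range_size, total_size) independently."""
        rng = np.random.RandomState(seed)
        first = np.arange(first_range_size, dtype=np.uint32)
        rng.shuffle(first)
        if first_range_size == total_size:  # no separate last epoch
            return first
        last = np.arange(first_range_size, total_size, dtype=np.uint32)
        rng.shuffle(last)
        return np.concatenate((first, last))

    # VALID_xslt: full-epoch samples stay within [0, 2034), last-epoch samples within [2034, 2542).
    shuffle_idx = build_shuffle_idx(2_034, 2_542)
    assert sorted(shuffle_idx[:2_034]) == list(range(2_034))
    assert sorted(shuffle_idx[2_034:]) == list(range(2_034, 2_542))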
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: > finished creating indexed dataset in 0.002198 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: number of documents: 4751547 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: VALID_json: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,300 [Rank 0]: document indices in [4604249, 4746795) total of 142546 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,301 [Rank 0]: > Tokens per epoch: 62884447 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,303 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,304 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,311 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007176 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 142546 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7676 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,315 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003801 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,315 [Rank 0]: > building shuffle index with split [0, 7676) and [7676, 7676) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,318 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003295 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,366 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_json_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,375 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_json_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,376 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_json_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,376 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,376 [Rank 0]: total number of samples: 7677 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,376 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,460 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,461 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: > finished creating indexed dataset in 0.002057 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: number of documents: 3995948 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: VALID_yaml: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,462 [Rank 0]: document indices in [3872074, 3991952) total of 119878 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,463 [Rank 0]: > Tokens per epoch: 35974762 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,466 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,466 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,472 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005821 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 119878 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4391 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,475 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003133 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,475 [Rank 0]: > building shuffle index with split [0, 4391) and [4391, 4391) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,478 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002653 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,527 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_yaml_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,536 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_yaml_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,536 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_yaml_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,537 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,537 [Rank 0]: total number of samples: 4392 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,537 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,619 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: > finished creating indexed dataset in 0.002074 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: number of documents: 30982955 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: VALID_gh_issues: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,621 [Rank 0]: document indices in [30022483, 30951972) total of 929489 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,627 [Rank 0]: > Tokens per epoch: 538755961 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,630 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,630 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,666 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.035955 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 929489 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 65766 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,673 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.007118 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,673 [Rank 0]: > building shuffle index with split [0, 65766) and [65766, 65766) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,678 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004275 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,736 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_gh_issues_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,751 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_gh_issues_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,752 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_gh_issues_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,752 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,752 [Rank 0]: total number of samples: 65767 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,752 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,836 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,837 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: > finished creating indexed dataset in 0.002246 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: number of documents: 7634718 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: VALID_gh_commits: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,838 [Rank 0]: document indices in [7398042, 7627083) total of 229041 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,840 [Rank 0]: > Tokens per epoch: 483498380 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,842 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,842 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,852 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.009787 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 229041 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 59020 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,857 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004542 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,857 [Rank 0]: > building shuffle index with split [0, 59020) and [59020, 59020) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,862 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004894 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,862 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_gh_commits_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,874 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_gh_commits_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,875 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_gh_commits_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,875 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,875 [Rank 0]: total number of samples: 59021 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,875 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,959 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,960 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,960 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,960 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: > finished creating indexed dataset in 0.001894 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: number of documents: 914510 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: VALID_notebook_scripts: +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: document indices in [886160, 913595) total of 27435 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:08,961 [Rank 0]: > Tokens per epoch: 73709652 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,964 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,964 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:08,968 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003803 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 27435 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 8997 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,971 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003204 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,971 [Rank 0]: > building shuffle index with split [0, 8997) and [8997, 8997) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:08,974 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002797 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,977 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_notebook_scripts_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,985 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_notebook_scripts_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,985 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_notebook_scripts_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:08,986 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:08,986 [Rank 0]: total number of samples: 8998 +[ip-26-0-150-122:0]:2023-06-21 17:28:08,986 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,067 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,068 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,068 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,068 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,068 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: > finished creating indexed dataset in 0.001880 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: number of documents: 668743 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: VALID_notebook_structured: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: document indices in [648012, 668074) total of 20062 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,069 [Rank 0]: > Tokens per epoch: 56156688 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,071 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,072 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,075 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003791 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 20062 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6855 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,078 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002285 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,078 [Rank 0]: > building shuffle index with split [0, 6855) and [6855, 6855) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,080 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002088 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,139 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_notebook_structured_indexmap_2048ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,146 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_notebook_structured_indexmap_2048ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,148 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_notebook_structured_indexmap_2048ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,150 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,150 [Rank 0]: total number of samples: 6856 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,150 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,235 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: > finished creating indexed dataset in 0.001864 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: number of documents: 2721616 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,237 [Rank 0]: document indices in [2637246, 2718894) total of 81648 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,238 [Rank 0]: > Tokens per epoch: 142752310 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,241 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,241 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,246 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005119 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 81648 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 17425 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,250 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003681 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,250 [Rank 0]: > building shuffle index with split [0, 17425) and [17425, 17425) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,252 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002301 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,253 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,260 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,261 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/css/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,261 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,261 [Rank 0]: total number of samples: 17426 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,261 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,345 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,345 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,345 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,345 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: > finished creating indexed dataset in 0.000685 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: number of documents: 968 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: document indices in [938, 967) total of 29 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,346 [Rank 0]: > Tokens per epoch: 55028 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,358 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,362 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,364 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,367 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,367 [Rank 0]: total number of samples: 7 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,367 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,450 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,452 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,452 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,452 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,452 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: > finished creating indexed dataset in 0.002246 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: number of documents: 8536791 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,453 [Rank 0]: document indices in [8272150, 8528254) total of 256104 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,455 [Rank 0]: > Tokens per epoch: 613576495 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,456 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,456 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,467 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.010761 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 256104 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 74899 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,471 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003652 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,471 [Rank 0]: > building shuffle index with split [0, 74899) and [74899, 74899) ... 
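All of the .npy paths in this log follow one naming pattern: <tokenized-data prefix>_<split name>_indexmap_<requested samples>ns_<sequence length>sl_<seed>s_{doc,sample,shuffle}_idx.npy. The per-language VALID_* splits each request 2048 samples, while the VALID_all_sources_weighted blocks request much smaller counts that presumably reflect each source's weight in the blended validation set (9 for css and 1 for prolog above, 145 for c and 16 for kotlin further down). When files with the matching name already exist, a block skips the "WARNING: could not find index map files" build step and goes straight to loading, as the prolog entry above does. A small helper that reproduces the naming convention as inferred from the logged paths (the function is illustrative, not the training code):

    def index_map_path(prefix: str, split_name: str, num_samples: int,
                       seq_len: int, seed: int, kind: str) -> str:
        """kind is one of "doc", "sample", "shuffle"."""
        return f"{prefix}_{split_name}_indexmap_{num_samples}ns_{seq_len}sl_{seed}s_{kind}_idx.npy"

    # Reconstructs the prolog path loaded above.
    assert index_map_path(
        "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/gpt2-preprocessed_content_document",
        "VALID_all_sources_weighted", 1, 8192, 1234, "doc",
    ) == ("/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/prolog/"
          "gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy")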
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,475 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003861 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,475 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_145ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,487 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_145ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,488 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_145ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,489 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,489 [Rank 0]: total number of samples: 74900 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,489 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,572 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,573 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: > finished creating indexed dataset in 0.001657 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: number of documents: 158792 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: document indices in [153869, 158633) total of 4764 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,574 [Rank 0]: > Tokens per epoch: 18815887 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,577 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,577 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,580 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002379 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4764 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2296 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,583 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002757 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,583 [Rank 0]: > building shuffle index with split [0, 2296) and [2296, 2296) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,586 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003100 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,589 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,594 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,594 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/fortran/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,594 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,594 [Rank 0]: total number of samples: 2297 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,595 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,678 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,679 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,679 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: > finished creating indexed dataset in 0.001779 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: number of documents: 153194 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: document indices in [148445, 153041) total of 4596 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,680 [Rank 0]: > Tokens per epoch: 8220293 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,682 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,682 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,684 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002250 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4596 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1003 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,688 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003712 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,688 [Rank 0]: > building shuffle index with split [0, 1003) and [1003, 1003) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,691 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002539 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,697 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,701 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,703 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/solidity/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,705 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,705 [Rank 0]: total number of samples: 1004 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,705 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,788 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,790 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,790 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,790 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,790 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: > finished creating indexed dataset in 0.002346 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: number of documents: 2239354 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,791 [Rank 0]: document indices in [2169934, 2237115) total of 67181 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,792 [Rank 0]: > Tokens per epoch: 43085225 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,793 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,793 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:09,798 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004282 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 67181 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 5259 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,801 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002722 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,801 [Rank 0]: > building shuffle index with split [0, 5259) and [5259, 5259) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:09,804 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003165 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,805 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_16ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,811 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_16ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,811 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/kotlin/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_16ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,812 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,812 [Rank 0]: total number of samples: 5260 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,812 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,895 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: > finished creating indexed dataset in 0.000727 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: number of documents: 523 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:09,896 [Rank 0]: document indices in [507, 522) total of 15 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:09,897 [Rank 0]: > Tokens per epoch: 46791 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,912 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,916 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,919 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:09,923 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:09,923 [Rank 0]: total number of samples: 6 +[ip-26-0-150-122:0]:2023-06-21 17:28:09,923 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,007 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: reading document index... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: > finished creating indexed dataset in 0.002274 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: number of documents: 295364 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,009 [Rank 0]: document indices in [286208, 295069) total of 8861 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,010 [Rank 0]: > Tokens per epoch: 13589070 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,011 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,012 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,015 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002999 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8861 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1658 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,017 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002250 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,017 [Rank 0]: > building shuffle index with split [0, 1658) and [1658, 1658) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,019 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002191 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,020 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,027 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,032 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/julia/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,033 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,033 [Rank 0]: total number of samples: 1659 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,033 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,116 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: > finished creating indexed dataset in 0.001998 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: number of documents: 210816 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,118 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,119 [Rank 0]: document indices in [204281, 210605) total of 6324 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,119 [Rank 0]: > Tokens per epoch: 8481384 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,122 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,122 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,125 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003184 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6324 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1035 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,127 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002203 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,127 [Rank 0]: > building shuffle index with split [0, 1035) and [1035, 1035) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,130 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002909 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,131 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,136 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,136 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java-server-pages/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,137 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,137 [Rank 0]: total number of samples: 1036 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,137 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,220 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,220 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,220 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: > finished creating indexed dataset in 0.000698 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: number of documents: 5001 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: document indices in [4846, 4996) total of 150 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,221 [Rank 0]: > Tokens per epoch: 1014769 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,222 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,223 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,225 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002061 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 150 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 123 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,227 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002050 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,227 [Rank 0]: > building shuffle index with split [0, 123) and [123, 123) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,229 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002345 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,235 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,239 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,239 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/isabelle/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,241 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,242 [Rank 0]: total number of samples: 124 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,242 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,325 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: > finished creating indexed dataset in 0.000789 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: number of documents: 8042 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: document indices in [7793, 8034) total of 241 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,326 [Rank 0]: > Tokens per epoch: 225513 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,329 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,329 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,332 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002602 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 241 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 27 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,336 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004295 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,336 [Rank 0]: > building shuffle index with split [0, 27) and [27, 27) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,338 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001874 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,338 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,343 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,343 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/idris/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,346 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,346 [Rank 0]: total number of samples: 28 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,346 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,430 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,430 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,430 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,430 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: creating memory view of numpy buffer... 
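Each "> building shuffle index with split [0, N) and [N, N) ..." entry above (e.g. [0, 27) and [27, 27) for the 27-sample split) has an empty second range, which matches the preceding "only one epoch required, setting separate_last_epoch to False" message: with a single epoch there is no separately shuffled last epoch. A rough sketch of what such a shuffle index amounts to, assuming a seeded permutation of the sample range rather than quoting the actual implementation:

import numpy as np

def build_shuffle_idx(num_samples, seed=1234):
    # Deterministic permutation of [0, num_samples); sample i of the epoch
    # is then read through shuffle_idx[i].
    rng = np.random.RandomState(seed)
    shuffle_idx = np.arange(num_samples, dtype=np.int64)
    rng.shuffle(shuffle_idx)
    return shuffle_idx

print(build_shuffle_idx(27))  # e.g. the 27-sample split from the entry above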
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: > finished creating indexed dataset in 0.000812 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: number of documents: 16870 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: document indices in [16347, 16853) total of 506 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,431 [Rank 0]: > Tokens per epoch: 1042103 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,433 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,433 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,435 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002311 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 506 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 127 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,438 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002538 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,438 [Rank 0]: > building shuffle index with split [0, 127) and [127, 127) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,441 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003080 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,441 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,446 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,446 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lean/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,448 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,448 [Rank 0]: total number of samples: 128 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,448 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,532 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: > finished creating indexed dataset in 0.002170 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: number of documents: 267627 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,534 [Rank 0]: document indices in [259331, 267359) total of 8028 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,535 [Rank 0]: > Tokens per epoch: 8559847 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,537 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,537 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,539 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002263 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8028 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1044 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,542 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002454 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,542 [Rank 0]: > building shuffle index with split [0, 1044) and [1044, 1044) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,544 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001797 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,544 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,551 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,552 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/powershell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,552 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,552 [Rank 0]: total number of samples: 1045 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,552 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,636 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,637 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: > finished creating indexed dataset in 0.002060 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: number of documents: 4700526 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,638 [Rank 0]: document indices in [4554810, 4695825) total of 141015 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,639 [Rank 0]: > Tokens per epoch: 253353715 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,642 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,642 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,649 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.007241 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 141015 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 30926 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,653 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003915 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,653 [Rank 0]: > building shuffle index with split [0, 30926) and [30926, 30926) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,656 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002367 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,656 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_64ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,667 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_64ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,667 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/go/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_64ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,668 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,668 [Rank 0]: total number of samples: 30927 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,668 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,751 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,752 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: > finished creating indexed dataset in 0.001324 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: number of documents: 98447 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: document indices in [95395, 98349) total of 2954 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,753 [Rank 0]: > Tokens per epoch: 6597590 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,756 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,756 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,759 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002831 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2954 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 805 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,762 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003247 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,762 [Rank 0]: > building shuffle index with split [0, 805) and [805, 805) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,765 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002427 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,765 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,772 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,772 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/erlang/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,773 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,773 [Rank 0]: total number of samples: 806 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,773 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,856 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: > finished creating indexed dataset in 0.001475 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: number of documents: 124066 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,858 [Rank 0]: document indices in [120220, 123942) total of 3722 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,859 [Rank 0]: > Tokens per epoch: 4694260 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,861 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,861 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,864 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002899 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3722 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 573 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,868 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003971 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,868 [Rank 0]: > building shuffle index with split [0, 573) and [573, 573) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,871 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003098 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,872 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,879 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,880 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/f-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,880 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,880 [Rank 0]: total number of samples: 574 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,880 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,964 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,964 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,964 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: > finished creating indexed dataset in 0.000930 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: number of documents: 30934 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: document indices in [29975, 30903) total of 928 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:10,965 [Rank 0]: > Tokens per epoch: 2230554 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,967 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,967 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:10,969 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002251 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 928 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 272 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,971 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002059 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,972 [Rank 0]: > building shuffle index with split [0, 272) and [272, 272) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:10,975 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003396 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,975 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,980 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,980 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ada/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:10,981 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:10,981 [Rank 0]: total number of samples: 273 +[ip-26-0-150-122:0]:2023-06-21 17:28:10,981 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,065 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: > finished creating indexed dataset in 0.001506 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: number of documents: 110981 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,066 [Rank 0]: document indices in [107541, 110870) total of 3329 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,067 [Rank 0]: > Tokens per epoch: 21526929 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,070 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,070 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,072 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002216 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3329 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2627 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,076 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003878 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,076 [Rank 0]: > building shuffle index with split [0, 2627) and [2627, 2627) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,078 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002243 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,079 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,086 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,086 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/pascal/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,087 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,087 [Rank 0]: total number of samples: 2628 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,087 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,170 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: > finished creating indexed dataset in 0.002155 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: number of documents: 365491 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,172 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,173 [Rank 0]: document indices in [354161, 365126) total of 10965 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,173 [Rank 0]: > Tokens per epoch: 25729670 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,175 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,175 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,178 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003171 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 10965 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3140 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,181 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002608 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,181 [Rank 0]: > building shuffle index with split [0, 3140) and [3140, 3140) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,183 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002510 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,185 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,192 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,193 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/perl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,193 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,193 [Rank 0]: total number of samples: 3141 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,193 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,277 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: > finished creating indexed dataset in 0.001016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: number of documents: 39042 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: document indices in [37832, 39003) total of 1171 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,278 [Rank 0]: > Tokens per epoch: 2880088 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,281 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,281 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,285 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003717 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1171 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 351 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,288 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003302 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,288 [Rank 0]: > building shuffle index with split [0, 351) and [351, 351) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,291 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002865 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,291 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,299 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,299 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/r/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,300 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,300 [Rank 0]: total number of samples: 352 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,300 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,383 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: > finished creating indexed dataset in 0.001383 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: number of documents: 97167 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: document indices in [94155, 97070) total of 2915 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,385 [Rank 0]: > Tokens per epoch: 2614634 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,388 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,388 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,391 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002748 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2915 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 319 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,394 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003080 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,394 [Rank 0]: > building shuffle index with split [0, 319) and [319, 319) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,396 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002230 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,397 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,404 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,405 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/protocol-buffer/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,405 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,405 [Rank 0]: total number of samples: 320 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,405 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,489 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,490 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,490 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: > finished creating indexed dataset in 0.001998 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: number of documents: 186375 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: document indices in [180597, 186189) total of 5592 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,491 [Rank 0]: > Tokens per epoch: 4338734 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,494 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,494 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,497 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003543 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5592 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 529 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,500 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002967 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,501 [Rank 0]: > building shuffle index with split [0, 529) and [529, 529) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,502 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001862 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,503 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,508 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,510 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cmake/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,512 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,513 [Rank 0]: total number of samples: 530 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,513 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,597 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: > finished creating indexed dataset in 0.000777 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: number of documents: 9226 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: document indices in [8940, 9217) total of 277 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,598 [Rank 0]: > Tokens per epoch: 1021218 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,600 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,600 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,603 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002658 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 277 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 124 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,606 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003479 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,606 [Rank 0]: > building shuffle index with split [0, 124) and [124, 124) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,608 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001835 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,611 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,615 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,619 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,623 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,623 [Rank 0]: total number of samples: 125 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,623 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,707 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: > finished creating indexed dataset in 0.002208 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: number of documents: 3390320 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,709 [Rank 0]: document indices in [3285220, 3386930) total of 101710 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,710 [Rank 0]: > Tokens per epoch: 61345928 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,712 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,712 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,718 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005851 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 101710 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7488 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,721 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002472 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,721 [Rank 0]: > building shuffle index with split [0, 7488) and [7488, 7488) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,723 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002289 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,724 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_19ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,733 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_19ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,733 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ruby/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_19ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,734 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,734 [Rank 0]: total number of samples: 7489 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,734 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,817 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: > finished creating indexed dataset in 0.002295 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: number of documents: 1380468 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,819 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,820 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,820 [Rank 0]: document indices in [1337673, 1379088) total of 41415 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,820 [Rank 0]: > Tokens per epoch: 81845020 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,823 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,823 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,826 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003589 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 41415 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 9990 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,829 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002887 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,829 [Rank 0]: > building shuffle index with split [0, 9990) and [9990, 9990) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,832 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002872 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,833 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_25ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,838 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_25ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,838 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rust/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_25ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,841 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,841 [Rank 0]: total number of samples: 9991 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,841 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,925 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: > finished creating indexed dataset in 0.000769 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: number of documents: 5386 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: document indices in [5219, 5381) total of 162 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:11,926 [Rank 0]: > Tokens per epoch: 626200 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,928 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,928 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:11,931 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002702 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 162 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 76 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,934 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003063 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,935 [Rank 0]: > building shuffle index with split [0, 76) and [76, 76) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:11,937 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002100 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,940 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,944 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,945 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/rmarkdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:11,947 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:11,947 [Rank 0]: total number of samples: 77 +[ip-26-0-150-122:0]:2023-06-21 17:28:11,947 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,031 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: > finished creating indexed dataset in 0.002331 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: number of documents: 10801285 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,033 [Rank 0]: document indices in [10466445, 10790484) total of 324039 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,036 [Rank 0]: > Tokens per epoch: 318261515 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,037 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,037 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,051 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.013492 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 324039 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 38850 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,055 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003927 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,055 [Rank 0]: > building shuffle index with split [0, 38850) and [38850, 38850) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,059 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004215 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,060 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_120ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,070 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_120ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,071 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/c-sharp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_120ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,071 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,071 [Rank 0]: total number of samples: 38851 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,071 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,155 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: creating memory view of numpy buffer... 
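[editor's note] The contiguous document ranges printed for each VALID split (for c-sharp above, [10466445, 10790484) out of 10801285 documents) are consistent with fractional train/valid/test weights of roughly 96.9/3/0.1. The sketch below is written in the spirit of Megatron's split helper but is not copied from it, and the 0.969/0.03/0.001 split string is an assumption (the actual --split value is not visible in this part of the log):

# Hedged sketch: derive contiguous split ranges from assumed fractional weights.
def split_ranges(num_docs, weights):
    total = sum(weights)
    bounds = [0]
    for w in weights:
        bounds.append(bounds[-1] + int(round(w / total * num_docs)))
    bounds[-1] = num_docs  # absorb rounding drift in the last boundary
    return list(zip(bounds[:-1], bounds[1:]))

train, valid, test = split_ranges(10_801_285, [0.969, 0.03, 0.001])
print(valid)  # -> (10466445, 10790484), matching the c-sharp VALID range above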
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: > finished creating indexed dataset in 0.001766 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: number of documents: 587748 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,157 [Rank 0]: document indices in [569528, 587160) total of 17632 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,158 [Rank 0]: > Tokens per epoch: 6393705 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,160 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,160 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,163 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003214 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17632 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 780 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,167 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003279 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,167 [Rank 0]: > building shuffle index with split [0, 780) and [780, 780) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,169 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002647 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,173 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,178 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,178 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/smalltalk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,178 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,179 [Rank 0]: total number of samples: 781 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,179 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,263 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: > finished creating indexed dataset in 0.002192 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: number of documents: 541454 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,265 [Rank 0]: document indices in [524669, 540913) total of 16244 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,266 [Rank 0]: > Tokens per epoch: 19105324 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,266 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,267 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,270 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003258 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16244 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2332 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,273 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002468 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,273 [Rank 0]: > building shuffle index with split [0, 2332) and [2332, 2332) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,275 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002231 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,278 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,283 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,284 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_6ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,284 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,284 [Rank 0]: total number of samples: 2333 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,284 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,368 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: creating memory view of numpy buffer... 
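[editor's note] Each split's three mapping files are looked up under a name that encodes the requested sample count, sequence length and seed (e.g. "..._indexmap_6ns_8192sl_1234s_doc_idx.npy" for the haskell split above). A small helper that rebuilds such a path from its parts, as an illustration of the naming pattern visible in the log rather than a function taken from Megatron:

def indexmap_filename(data_prefix, split_name, num_samples, seq_length, seed, kind):
    # kind is one of "doc", "sample", "shuffle"
    return (f"{data_prefix}_{split_name}_indexmap_"
            f"{num_samples}ns_{seq_length}sl_{seed}s_{kind}_idx.npy")

prefix = ("/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/haskell/"
          "gpt2-preprocessed_content_document")
print(indexmap_filename(prefix, "VALID_all_sources_weighted", 6, 8192, 1234, "doc"))
# -> the doc-idx path loaded for the haskell split above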
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: > finished creating indexed dataset in 0.000680 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: number of documents: 1152 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: document indices in [1116, 1151) total of 35 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,369 [Rank 0]: > Tokens per epoch: 30587 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,385 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,388 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,391 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/maple/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,395 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,395 [Rank 0]: total number of samples: 4 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,395 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,482 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: > finished creating indexed dataset in 0.000777 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: number of documents: 22653 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,483 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,484 [Rank 0]: document indices in [21951, 22630) total of 679 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,484 [Rank 0]: > Tokens per epoch: 16838913 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,485 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,485 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,487 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002180 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 679 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,490 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003326 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,490 [Rank 0]: > building shuffle index with split [0, 2055) and [2055, 2055) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,493 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002627 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,493 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,498 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,498 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/mathematica/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,500 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,500 [Rank 0]: total number of samples: 2056 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,500 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,584 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,585 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: > finished creating indexed dataset in 0.001711 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: number of documents: 158356 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: document indices in [153447, 158198) total of 4751 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,586 [Rank 0]: > Tokens per epoch: 9867998 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,588 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,589 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,591 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002278 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4751 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1204 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,594 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002527 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,594 [Rank 0]: > building shuffle index with split [0, 1204) and [1204, 1204) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,596 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002608 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,599 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,603 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,603 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/ocaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,604 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,604 [Rank 0]: total number of samples: 1205 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,604 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,688 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,690 [Rank 0]: > finished creating indexed dataset in 0.002049 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: number of documents: 657349 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: document indices in [636971, 656692) total of 19721 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,691 [Rank 0]: > Tokens per epoch: 14806733 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,694 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,694 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,697 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003486 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 19721 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1807 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,700 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002162 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,700 [Rank 0]: > building shuffle index with split [0, 1807) and [1807, 1807) ... 
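[editor's note] The "building shuffle index with split [0, N) and [N, N)" messages suggest the shuffle index is a permutation built in two chunks, with the second chunk empty whenever only one epoch is needed. A rough sketch of that construction, seeded with the 1234 that appears in the index-map filenames; it mirrors the shape of what the log describes but is not Megatron's exact code:

import numpy as np

def build_shuffle_idx(num_samples, total_size, rng):
    # first chunk: samples from complete epochs, shuffled on their own
    first = np.arange(num_samples, dtype=np.uint32)
    rng.shuffle(first)
    if num_samples == total_size:   # the "[N, N)" case above: no partial epoch left
        return first
    # second chunk: samples of a partial last epoch, shuffled separately
    last = np.arange(num_samples, total_size, dtype=np.uint32)
    rng.shuffle(last)
    return np.concatenate((first, last))

rng = np.random.RandomState(seed=1234)
print(build_shuffle_idx(1807, 1807, rng).shape)  # (1807,) -- split [0, 1807) and [1807, 1807)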
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,703 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002931 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,708 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,714 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,714 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/makefile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,715 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,715 [Rank 0]: total number of samples: 1808 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,715 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,799 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: > finished creating indexed dataset in 0.002136 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: number of documents: 549459 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,801 [Rank 0]: document indices in [532426, 548910) total of 16484 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,802 [Rank 0]: > Tokens per epoch: 29891276 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,804 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,804 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:12,807 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002825 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 16484 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3648 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,809 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002457 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,809 [Rank 0]: > building shuffle index with split [0, 3648) and [3648, 3648) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:12,812 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002820 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,817 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_8ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,823 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_8ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,823 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/lua/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_8ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,823 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,824 [Rank 0]: total number of samples: 3649 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,824 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,907 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: > finished creating indexed dataset in 0.000729 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: number of documents: 1133 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,908 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,909 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:12,909 [Rank 0]: document indices in [1098, 1132) total of 34 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:12,909 [Rank 0]: > Tokens per epoch: 39416 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,925 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,929 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,930 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:12,933 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:12,933 [Rank 0]: total number of samples: 5 +[ip-26-0-150-122:0]:2023-06-21 17:28:12,933 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,017 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,017 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: reading pointers... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: > finished creating indexed dataset in 0.000767 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: number of documents: 6104 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: document indices in [5915, 6098) total of 183 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,018 [Rank 0]: > Tokens per epoch: 518557 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,020 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,020 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,023 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002899 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 183 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 63 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,026 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003113 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,026 [Rank 0]: > building shuffle index with split [0, 63) and [63, 63) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,028 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001994 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,028 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,033 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,033 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/literate-haskell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,035 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,035 [Rank 0]: total number of samples: 64 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,035 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,119 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,121 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: > finished creating indexed dataset in 0.002302 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: number of documents: 896880 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: document indices in [869077, 895983) total of 26906 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,122 [Rank 0]: > Tokens per epoch: 31882370 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,125 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,125 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,128 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003160 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 26906 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3891 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,131 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002524 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,131 [Rank 0]: > building shuffle index with split [0, 3891) and [3891, 3891) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,133 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002579 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,134 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,141 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,141 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/restructuredtext/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,142 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,142 [Rank 0]: total number of samples: 3892 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,142 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: creating memory view of numpy buffer... 
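[editor's note] The per-split .npy mappings are reported as loaded in under 10 ms even when they index tens of thousands of samples, which is consistent with memory-mapped loading rather than an eager read. A self-contained toy of that step (the real files live under /fsx on the training cluster, so a throwaway array stands in for the restructuredtext sample-idx file; treating the extra row as a boundary entry is an assumption):

import os, tempfile
import numpy as np

tmpdir = tempfile.mkdtemp()
path = os.path.join(tmpdir, "toy_sample_idx.npy")
np.save(path, np.zeros((3892, 2), dtype=np.int64))  # num_samples + 1 rows, as reported above

sample_idx = np.load(path, mmap_mode="r")   # memory-map instead of reading eagerly
print(sample_idx.shape[0] - 1)              # -> 3891, the "total number of samples" built above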
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: > finished creating indexed dataset in 0.000706 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,226 [Rank 0]: number of documents: 3688 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,227 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,227 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,227 [Rank 0]: document indices in [3574, 3684) total of 110 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,227 [Rank 0]: > Tokens per epoch: 233387 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,229 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,229 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,231 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002137 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 110 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 28 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,233 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001910 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,233 [Rank 0]: > building shuffle index with split [0, 28) and [28, 28) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,236 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002616 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,239 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,243 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,247 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/racket/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,250 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,250 [Rank 0]: total number of samples: 29 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,251 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,335 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,335 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,335 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: > finished creating indexed dataset in 0.000871 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: number of documents: 19630 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: document indices in [19021, 19610) total of 589 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,336 [Rank 0]: > Tokens per epoch: 2060914 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,337 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,337 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,339 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.001882 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 589 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 251 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,342 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002619 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,342 [Rank 0]: > building shuffle index with split [0, 251) and [251, 251) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,344 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002281 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,345 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,349 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,349 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/standard-ml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,351 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,351 [Rank 0]: total number of samples: 252 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,352 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,436 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: > finished creating indexed dataset in 0.001044 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: number of documents: 46270 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,437 [Rank 0]: document indices in [44836, 46224) total of 1388 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,438 [Rank 0]: > Tokens per epoch: 4206961 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,438 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,438 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,441 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002783 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1388 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 513 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,443 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001830 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,443 [Rank 0]: > building shuffle index with split [0, 513) and [513, 513) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,446 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003026 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,447 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,455 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,455 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/systemverilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,455 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,455 [Rank 0]: total number of samples: 514 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,456 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,539 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,541 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,541 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,541 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: creating memory view of numpy buffer... 
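[editor's note] The repeated "reading sizes... reading pointers... creating numpy buffer of mmap..." steps describe opening an indexed binary dataset: a flat token file plus per-document sizes and byte offsets, accessed through a memory map. A deliberately simplified toy of that layout (the on-disk format Megatron actually uses also carries a header and document index, which are omitted here):

import os, tempfile
import numpy as np

tmpdir = tempfile.mkdtemp()
bin_path = os.path.join(tmpdir, "toy_content_document.bin")

docs = [np.array([5, 6, 7], dtype=np.uint16),
        np.array([8, 9], dtype=np.uint16)]
np.concatenate(docs).tofile(bin_path)                        # flat stream of token ids

sizes = np.array([len(d) for d in docs], dtype=np.int32)     # "reading sizes..."
pointers = np.concatenate(([0], np.cumsum(sizes[:-1]) * 2))  # "reading pointers..." (byte offsets, 2 bytes/token)
buffer = np.memmap(bin_path, dtype=np.uint16, mode="r")      # "creating numpy buffer of mmap..."

def get_document(i):
    start = pointers[i] // 2
    return buffer[start:start + sizes[i]]

print(get_document(1))   # -> [8 9]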
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: > finished creating indexed dataset in 0.002116 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: number of documents: 522778 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: document indices in [506572, 522255) total of 15683 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,542 [Rank 0]: > Tokens per epoch: 56256264 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,544 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,544 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,548 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003553 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 15683 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6867 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,551 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003154 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,551 [Rank 0]: > building shuffle index with split [0, 6867) and [6867, 6867) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,553 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001761 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,553 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_14ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,560 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_14ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,561 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tex/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_14ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,561 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,561 [Rank 0]: total number of samples: 6868 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,561 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,645 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: > finished creating indexed dataset in 0.000779 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: number of documents: 10289 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,646 [Rank 0]: document indices in [9970, 10279) total of 309 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,647 [Rank 0]: > Tokens per epoch: 224077 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,657 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,662 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,663 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/awk/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,666 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,666 [Rank 0]: total number of samples: 28 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,666 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,750 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,752 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: > finished creating indexed dataset in 0.002376 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: number of documents: 247919 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: document indices in [240234, 247671) total of 7437 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,753 [Rank 0]: > Tokens per epoch: 23244839 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,754 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,754 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,757 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002579 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7437 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2837 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,760 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002428 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,760 [Rank 0]: > building shuffle index with split [0, 2837) and [2837, 2837) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,762 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002113 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,762 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,767 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,768 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/assembly/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_5ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,769 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,769 [Rank 0]: total number of samples: 2838 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,770 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,854 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: > finished creating indexed dataset in 0.000720 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: number of documents: 5368 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: document indices in [5202, 5363) total of 161 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,855 [Rank 0]: > Tokens per epoch: 60505 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,866 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,870 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,874 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/alloy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,877 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,878 [Rank 0]: total number of samples: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,878 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,962 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,962 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,962 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,962 [Rank 0]: reading document index... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: > finished creating indexed dataset in 0.000803 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: number of documents: 17554 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: document indices in [17010, 17536) total of 526 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:13,963 [Rank 0]: > Tokens per epoch: 791611 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,964 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,964 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:13,967 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002345 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 526 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 96 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,970 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003401 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,970 [Rank 0]: > building shuffle index with split [0, 96) and [96, 96) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:13,972 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001927 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,972 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,977 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,977 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/agda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:13,979 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:13,980 [Rank 0]: total number of samples: 97 +[ip-26-0-150-122:0]:2023-06-21 17:28:13,980 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,064 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: > finished creating indexed dataset in 0.001090 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: number of documents: 52838 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,065 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,066 [Rank 0]: document indices in [51200, 52785) total of 1585 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,066 [Rank 0]: > Tokens per epoch: 3599819 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,067 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,067 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,069 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002339 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1585 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 439 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,071 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001771 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,071 [Rank 0]: > building shuffle index with split [0, 439) and [439, 439) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,073 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001737 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,075 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,082 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,083 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/emacs-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,083 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,083 [Rank 0]: total number of samples: 440 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,083 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,167 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,169 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,169 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,169 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: creating memory view of numpy buffer... 
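[editor's note] The doc-idx mapping that is built and saved for each split appears to be the split's document ids repeated for the required number of epochs (always one epoch here) and then shuffled. A sketch under that assumption, again seeded with the 1234 from the filenames and using the emacs-lisp VALID range above:

import numpy as np

def build_doc_idx(documents, num_epochs, rng):
    # tile the split's document ids across epochs, then shuffle the whole order
    doc_idx = np.tile(np.asarray(documents, dtype=np.int32), num_epochs)
    rng.shuffle(doc_idx)
    return doc_idx

rng = np.random.RandomState(seed=1234)
docs = np.arange(51_200, 52_785)          # emacs-lisp VALID range above: [51200, 52785)
print(build_doc_idx(docs, 1, rng).shape)  # -> (1585,) documents in shuffled order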
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: > finished creating indexed dataset in 0.002252 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: number of documents: 928415 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: document indices in [899634, 927487) total of 27853 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,170 [Rank 0]: > Tokens per epoch: 27319085 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,172 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,172 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,176 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003651 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 27853 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3334 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,178 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002606 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,178 [Rank 0]: > building shuffle index with split [0, 3334) and [3334, 3334) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,182 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003556 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,182 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_10ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,190 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_10ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,191 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dart/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_10ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,191 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,191 [Rank 0]: total number of samples: 3335 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,191 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,275 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,276 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,276 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,276 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,276 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: > finished creating indexed dataset in 0.001104 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: number of documents: 58151 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: document indices in [56348, 58093) total of 1745 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,277 [Rank 0]: > Tokens per epoch: 5481832 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,278 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,278 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,281 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002538 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1745 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 669 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,283 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002462 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,283 [Rank 0]: > building shuffle index with split [0, 669) and [669, 669) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,286 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002933 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,289 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,297 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,305 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cuda/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,305 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,305 [Rank 0]: total number of samples: 670 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,305 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,391 [Rank 0]: > finished creating indexed dataset in 0.000700 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: number of documents: 5928 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: document indices in [5744, 5922) total of 178 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,392 [Rank 0]: > Tokens per epoch: 389178 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,394 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,394 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,396 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002733 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 178 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 47 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,399 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002642 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,399 [Rank 0]: > building shuffle index with split [0, 47) and [47, 47) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,402 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002614 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,402 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,407 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,407 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/bluespec/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,408 [Rank 0]: loaded indexed file in 0.005 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,408 [Rank 0]: total number of samples: 48 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,408 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,492 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,492 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,492 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,492 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: > finished creating indexed dataset in 0.000684 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: number of documents: 180 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: document indices in [174, 180) total of 6 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,493 [Rank 0]: > Tokens per epoch: 7815 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,502 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,507 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,510 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/augeas/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,511 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,511 [Rank 0]: total number of samples: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,511 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,595 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: > finished creating indexed dataset in 0.002417 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: number of documents: 239568 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,597 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,598 [Rank 0]: document indices in [232141, 239328) total of 7187 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,598 [Rank 0]: > Tokens per epoch: 3729565 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,600 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,600 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,604 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003165 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7187 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 455 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,606 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002676 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,606 [Rank 0]: > building shuffle index with split [0, 455) and [455, 455) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,610 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003518 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,610 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,618 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,619 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/batchfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,619 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,619 [Rank 0]: total number of samples: 456 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,619 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,703 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: > finished creating indexed dataset in 0.000751 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: number of documents: 4806 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,704 [Rank 0]: document indices in [4657, 4801) total of 144 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,705 [Rank 0]: > Tokens per epoch: 118601 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,713 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,718 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,721 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcsh/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,726 [Rank 0]: loaded indexed file in 0.013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,726 [Rank 0]: total number of samples: 15 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,726 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,811 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: reading document index... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: > finished creating indexed dataset in 0.000733 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: number of documents: 5429 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,812 [Rank 0]: document indices in [5261, 5424) total of 163 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,813 [Rank 0]: > Tokens per epoch: 146349 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,816 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,820 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,824 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stan/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,826 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,826 [Rank 0]: total number of samples: 18 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,826 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,912 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: > finished creating indexed dataset in 0.001934 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: number of documents: 1355788 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:14,914 [Rank 0]: document indices in [1313759, 1354432) total of 40673 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:14,915 [Rank 0]: > Tokens per epoch: 38836780 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,917 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:14,917 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:14,921 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003928 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 40673 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4740 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,924 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002658 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,924 [Rank 0]: > building shuffle index with split [0, 4740) and [4740, 4740) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:14,927 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002755 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,927 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_13ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,933 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_13ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,934 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scala/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_13ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:14,936 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:14,936 [Rank 0]: total number of samples: 4741 +[ip-26-0-150-122:0]:2023-06-21 17:28:14,936 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,022 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,023 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,023 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,023 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,023 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: > finished creating indexed dataset in 0.000984 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: number of documents: 49335 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: document indices in [47806, 49286) total of 1480 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,024 [Rank 0]: > Tokens per epoch: 3611088 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,026 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,026 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,028 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002204 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1480 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 440 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,031 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002141 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,031 [Rank 0]: > building shuffle index with split [0, 440) and [440, 440) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,033 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002876 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,079 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,083 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,085 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/tcl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,087 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,087 [Rank 0]: total number of samples: 441 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,087 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,171 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,172 [Rank 0]: > finished creating indexed dataset in 0.000877 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: number of documents: 24208 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: document indices in [23458, 24184) total of 726 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,173 [Rank 0]: > Tokens per epoch: 5577566 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,174 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,174 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,176 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002153 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 726 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 680 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,179 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002290 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,179 [Rank 0]: > building shuffle index with split [0, 680) and [680, 680) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,181 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002041 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,209 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,213 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,213 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/stata/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,215 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,215 [Rank 0]: total number of samples: 681 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,215 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,299 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: > finished creating indexed dataset in 0.000756 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: number of documents: 4737 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,300 [Rank 0]: document indices in [4590, 4732) total of 142 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,301 [Rank 0]: > Tokens per epoch: 63420 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,311 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,315 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,319 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/applescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,321 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,321 [Rank 0]: total number of samples: 8 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,321 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,405 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,407 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: > finished creating indexed dataset in 0.002352 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: number of documents: 2206327 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,408 [Rank 0]: document indices in [2137931, 2204121) total of 66190 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,409 [Rank 0]: > Tokens per epoch: 31891052 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,410 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,411 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,415 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.004590 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 66190 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 3892 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,419 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003354 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,419 [Rank 0]: > building shuffle index with split [0, 3892) and [3892, 3892) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,421 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002542 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,422 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,428 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,429 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/shell/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_9ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,429 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,429 [Rank 0]: total number of samples: 3893 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,429 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,514 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: > finished creating indexed dataset in 0.001544 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: number of documents: 125163 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,515 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,516 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,516 [Rank 0]: document indices in [121283, 125038) total of 3755 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,516 [Rank 0]: > Tokens per epoch: 3837021 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,517 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,517 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,519 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002499 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 3755 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 468 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,523 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003726 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,523 [Rank 0]: > building shuffle index with split [0, 468) and [468, 468) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,526 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003039 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,531 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,537 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,538 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/clojure/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,538 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,538 [Rank 0]: total number of samples: 469 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,538 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,623 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,623 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,623 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,623 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: > finished creating indexed dataset in 0.000978 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: number of documents: 41890 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: document indices in [40591, 41848) total of 1257 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,624 [Rank 0]: > Tokens per epoch: 2017219 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,626 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,626 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,628 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002203 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1257 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 246 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,631 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002365 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,631 [Rank 0]: > building shuffle index with split [0, 246) and [246, 246) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,633 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002468 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,676 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,684 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,685 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/scheme/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,685 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,685 [Rank 0]: total number of samples: 247 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,685 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,770 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: > finished creating indexed dataset in 0.000719 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: number of documents: 7917 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,771 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,772 [Rank 0]: document indices in [7672, 7909) total of 237 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,772 [Rank 0]: > Tokens per epoch: 1102148 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,772 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,773 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,775 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002410 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 237 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 134 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,778 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002763 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,778 [Rank 0]: > building shuffle index with split [0, 134) and [134, 134) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,780 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001769 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,780 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,785 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,785 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/antlr/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,787 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,787 [Rank 0]: total number of samples: 135 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,787 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,872 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,872 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: > finished creating indexed dataset in 0.000787 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: number of documents: 13716 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: document indices in [13291, 13702) total of 411 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:15,873 [Rank 0]: > Tokens per epoch: 465467 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,875 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:15,875 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:15,878 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002776 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 411 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 56 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,880 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002538 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,880 [Rank 0]: > building shuffle index with split [0, 56) and [56, 56) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:15,882 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002017 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,923 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,927 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,928 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sparql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:15,930 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:15,930 [Rank 0]: total number of samples: 57 +[ip-26-0-150-122:0]:2023-06-21 17:28:15,930 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,015 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: > finished creating indexed dataset in 0.002290 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: number of documents: 975420 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,017 [Rank 0]: document indices in [945182, 974445) total of 29263 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,018 [Rank 0]: > Tokens per epoch: 164859090 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,018 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,019 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,022 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003657 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 29263 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 20124 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,025 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002956 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,025 [Rank 0]: > building shuffle index with split [0, 20124) and [20124, 20124) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,028 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002082 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,034 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_30ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,043 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_30ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,043 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/sql/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_30ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,044 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,044 [Rank 0]: total number of samples: 20125 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,044 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,127 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: > finished creating indexed dataset in 0.001750 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: number of documents: 167701 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,129 [Rank 0]: document indices in [162502, 167533) total of 5031 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,130 [Rank 0]: > Tokens per epoch: 5272081 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,130 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,131 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,134 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003043 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 5031 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 643 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,136 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002185 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,136 [Rank 0]: > building shuffle index with split [0, 643) and [643, 643) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,138 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002272 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,142 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,149 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,149 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/glsl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,150 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,150 [Rank 0]: total number of samples: 644 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,150 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,234 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: > finished creating indexed dataset in 0.001125 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: number of documents: 62033 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,235 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,236 [Rank 0]: document indices in [60110, 61971) total of 1861 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,236 [Rank 0]: > Tokens per epoch: 2205938 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,238 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,238 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,241 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003110 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1861 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 269 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,243 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001885 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,243 [Rank 0]: > building shuffle index with split [0, 269) and [269, 269) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,246 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003120 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,247 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,252 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,252 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elm/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,254 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,254 [Rank 0]: total number of samples: 270 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,254 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,339 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: > finished creating indexed dataset in 0.002170 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: number of documents: 571506 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,341 [Rank 0]: document indices in [553789, 570934) total of 17145 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,342 [Rank 0]: > Tokens per epoch: 4375164 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,342 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,343 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,346 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003622 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 17145 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 534 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,350 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003655 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,350 [Rank 0]: > building shuffle index with split [0, 534) and [534, 534) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,353 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003345 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,354 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,360 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,360 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/dockerfile/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,361 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,361 [Rank 0]: total number of samples: 535 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,361 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,445 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,447 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,447 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,447 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,447 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: > finished creating indexed dataset in 0.002257 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: number of documents: 6353527 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,448 [Rank 0]: document indices in [6156568, 6347173) total of 190605 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,449 [Rank 0]: > Tokens per epoch: 476705041 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,450 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,450 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,458 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.008151 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 190605 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 58191 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,462 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003879 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,463 [Rank 0]: > building shuffle index with split [0, 58191) and [58191, 58191) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,466 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003294 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,466 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_132ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,477 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_132ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,477 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/cpp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_132ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,478 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,478 [Rank 0]: total number of samples: 58192 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,478 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,562 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,563 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: > finished creating indexed dataset in 0.001764 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: number of documents: 226209 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: document indices in [219197, 225983) total of 6786 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,564 [Rank 0]: > Tokens per epoch: 5560129 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,566 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,566 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,570 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003138 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 6786 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 678 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,573 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002916 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,573 [Rank 0]: > building shuffle index with split [0, 678) and [678, 678) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,574 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001654 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,581 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,589 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,595 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/coffeescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,595 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,595 [Rank 0]: total number of samples: 679 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,595 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,679 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,680 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,680 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,680 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,680 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: > finished creating indexed dataset in 0.001380 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: number of documents: 98733 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: document indices in [95672, 98634) total of 2962 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,681 [Rank 0]: > Tokens per epoch: 16829467 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,682 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,682 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,685 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002942 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 2962 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 2054 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,688 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003067 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,689 [Rank 0]: > building shuffle index with split [0, 2054) and [2054, 2054) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,691 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002519 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,691 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,699 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,700 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/common-lisp/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,700 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,700 [Rank 0]: total number of samples: 2055 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,700 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,784 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,786 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,786 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,786 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,786 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: > finished creating indexed dataset in 0.002117 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: number of documents: 281016 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: document indices in [272305, 280735) total of 8430 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,787 [Rank 0]: > Tokens per epoch: 7046176 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,788 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,789 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,792 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003131 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 8430 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 860 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,795 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003006 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,795 [Rank 0]: > building shuffle index with split [0, 860) and [860, 860) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,797 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001766 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,799 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,804 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,804 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/elixir/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_2ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,804 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,805 [Rank 0]: total number of samples: 861 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,805 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,889 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: > finished creating indexed dataset in 0.002249 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: number of documents: 250834 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:16,891 [Rank 0]: document indices in [243058, 250583) total of 7525 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:16,892 [Rank 0]: > Tokens per epoch: 7066083 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,893 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,893 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:16,896 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002884 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 7525 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 862 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,898 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002220 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,899 [Rank 0]: > building shuffle index with split [0, 862) and [862, 862) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:16,901 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002579 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,904 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,908 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,910 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/groovy/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:16,912 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,912 [Rank 0]: total number of samples: 863 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,912 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,997 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: > finished creating indexed dataset in 0.002316 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: number of documents: 3299965 +[ip-26-0-150-122:0]:2023-06-21 17:28:16,999 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,000 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,000 [Rank 0]: document indices in [3197666, 3296665) total of 98999 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,000 [Rank 0]: > Tokens per epoch: 293479485 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,002 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,002 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,007 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005542 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 98999 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 35825 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,010 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002775 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,010 [Rank 0]: > building shuffle index with split [0, 35825) and [35825, 35825) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,013 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003100 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,056 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_79ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,065 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_79ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,066 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/html/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_79ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,066 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,066 [Rank 0]: total number of samples: 35826 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,066 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,151 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,152 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: > finished creating indexed dataset in 0.002153 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: number of documents: 20071773 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,153 [Rank 0]: document indices in [19449548, 20051701) total of 602153 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,157 [Rank 0]: > Tokens per epoch: 679829501 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,158 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,158 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,182 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.023708 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 602153 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 82986 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,187 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005039 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,187 [Rank 0]: > building shuffle index with split [0, 82986) and [82986, 82986) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,192 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004693 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,217 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_234ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,228 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_234ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,231 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/java/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_234ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,231 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,231 [Rank 0]: total number of samples: 82987 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,231 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,315 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,317 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: > finished creating indexed dataset in 0.002495 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: number of documents: 19544285 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,318 [Rank 0]: document indices in [18938412, 19524741) total of 586329 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,322 [Rank 0]: > Tokens per epoch: 565628573 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,324 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,324 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,346 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.021432 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 586329 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 69046 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,352 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006332 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,352 [Rank 0]: > building shuffle index with split [0, 69046) and [69046, 69046) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,356 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003602 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,356 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_174ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,367 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_174ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,368 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/javascript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_174ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,368 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,368 [Rank 0]: total number of samples: 69047 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,368 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,452 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,454 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,454 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,454 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,454 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: > finished creating indexed dataset in 0.002519 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: number of documents: 21029287 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,455 [Rank 0]: document indices in [20377379, 21008258) total of 630879 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,459 [Rank 0]: > Tokens per epoch: 765105610 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,460 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,460 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,484 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.023816 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 630879 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 93396 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,489 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005210 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,489 [Rank 0]: > building shuffle index with split [0, 93396) and [93396, 93396) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,494 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004117 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,524 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_202ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,536 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_202ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,537 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/markdown/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_202ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,538 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,538 [Rank 0]: total number of samples: 93397 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,538 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,622 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,624 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,624 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,624 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,624 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: > finished creating indexed dataset in 0.002314 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: number of documents: 15683017 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,625 [Rank 0]: document indices in [15196843, 15667334) total of 470491 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,629 [Rank 0]: > Tokens per epoch: 512566580 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,630 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,631 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,648 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.017646 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 470491 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 62569 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,652 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004050 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,653 [Rank 0]: > building shuffle index with split [0, 62569) and [62569, 62569) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,657 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.004143 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,657 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_164ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,670 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_164ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,672 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/php/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_164ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,673 [Rank 0]: loaded indexed file in 0.015 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,673 [Rank 0]: total number of samples: 62570 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,673 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,757 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,759 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: > finished creating indexed dataset in 0.002110 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: number of documents: 12866649 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,760 [Rank 0]: document indices in [12467783, 12853782) total of 385999 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,762 [Rank 0]: > Tokens per epoch: 529606827 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,764 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,764 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,780 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.015376 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 385999 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 64649 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,785 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005202 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,785 [Rank 0]: > building shuffle index with split [0, 64649) and [64649, 64649) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,789 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003803 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,790 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_163ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,801 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_163ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,808 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/python/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_163ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,808 [Rank 0]: loaded indexed file in 0.019 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,808 [Rank 0]: total number of samples: 64650 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,809 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,892 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,893 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,893 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,893 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,893 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: > finished creating indexed dataset in 0.001840 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: number of documents: 10547331 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:17,894 [Rank 0]: document indices in [10220364, 10536784) total of 316420 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:17,896 [Rank 0]: > Tokens per epoch: 222078157 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,898 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:17,898 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:17,910 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.012172 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 316420 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 27109 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,914 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.004088 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,914 [Rank 0]: > building shuffle index with split [0, 27109) and [27109, 27109) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:17,917 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002453 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,917 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_72ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,926 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_72ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,926 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/typescript/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_72ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:17,927 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:17,927 [Rank 0]: total number of samples: 27110 +[ip-26-0-150-122:0]:2023-06-21 17:28:17,927 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,011 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: > finished creating indexed dataset in 0.000731 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: number of documents: 75 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,012 [Rank 0]: document indices in [73, 75) total of 2 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,013 [Rank 0]: > Tokens per epoch: 5184 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,026 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,030 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,031 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/verilog/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,035 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,035 [Rank 0]: total number of samples: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,035 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,120 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,121 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: reading document index... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: > finished creating indexed dataset in 0.001868 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: number of documents: 161239 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: document indices in [156241, 161078) total of 4837 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,122 [Rank 0]: > Tokens per epoch: 11401469 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,124 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,124 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,127 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002941 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 4837 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1391 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,129 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.001961 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,129 [Rank 0]: > building shuffle index with split [0, 1391) and [1391, 1391) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,132 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002429 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,132 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,140 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,143 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/visual-basic/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_4ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,144 [Rank 0]: loaded indexed file in 0.012 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,144 [Rank 0]: total number of samples: 1392 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,144 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,229 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,229 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: > finished creating indexed dataset in 0.001112 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: number of documents: 58208 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: document indices in [56404, 58150) total of 1746 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,230 [Rank 0]: > Tokens per epoch: 12008501 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,231 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,231 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,234 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002831 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1746 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 1465 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,240 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.005459 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,240 [Rank 0]: > building shuffle index with split [0, 1465) and [1465, 1465) ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,243 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003084 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,244 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,252 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,257 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/vhdl/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,257 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,258 [Rank 0]: total number of samples: 1466 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,258 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,342 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,342 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: > finished creating indexed dataset in 0.000728 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: number of documents: 4661 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: document indices in [4517, 4656) total of 139 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,343 [Rank 0]: > Tokens per epoch: 98302 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,357 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,361 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,363 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/thrift/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,366 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,367 [Rank 0]: total number of samples: 12 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,367 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,451 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: creating memory view of numpy buffer... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: > finished creating indexed dataset in 0.000742 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: number of documents: 93 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: document indices in [90, 93) total of 3 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,452 [Rank 0]: > Tokens per epoch: 4277 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,456 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,459 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,460 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/matlab/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,463 [Rank 0]: loaded indexed file in 0.007 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,463 [Rank 0]: total number of samples: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,463 [Rank 0]: total number of epochs: 2 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,547 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: > finished creating indexed dataset in 0.000765 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: number of documents: 7451 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,548 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,549 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,549 [Rank 0]: document indices in [7220, 7444) total of 224 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,549 [Rank 0]: > Tokens per epoch: 1128407 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,550 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,550 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,553 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002814 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 224 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 137 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,555 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002108 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,555 [Rank 0]: > building shuffle index with split [0, 137) and [137, 137) ... 
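The two-epoch entries in this section (the verilog and matlab validation splits, 5,184 and 4,277 tokens per epoch respectively) are the splits too small to fill even a single 8192-token sample, so their data has to be traversed twice to satisfy the one sample requested in their index-map filenames (the "_1ns_" component). A hedged sketch of that loop, illustrative rather than copied from Megatron:

    # Sketch only: why tiny splits in this log report "total number of epochs: 2".
    def epochs_needed(tokens_per_epoch: int, num_samples: int,
                      seq_length: int = 8192) -> int:
        epochs, total_tokens = 0, 0
        while True:
            epochs += 1
            total_tokens += tokens_per_epoch
            # stop once the accumulated tokens cover num_samples full sequences
            if (total_tokens - 1) // seq_length >= num_samples:
                return epochs

    # matlab split above: 4,277 tokens per epoch, 1 sample requested -> 2 epochs
    assert epochs_needed(4_277, 1) == 2
    # verilog split earlier in the log: 5,184 tokens per epoch -> likewise 2 epochs
    assert epochs_needed(5_184, 1) == 2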
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,557 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001781 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,557 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,561 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,565 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yacc/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,568 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,568 [Rank 0]: total number of samples: 138 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,568 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,653 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: > finished creating indexed dataset in 0.000821 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: number of documents: 15850 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: document indices in [15359, 15834) total of 475 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,654 [Rank 0]: > Tokens per epoch: 2144189 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,655 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,655 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,658 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003053 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 475 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 261 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,661 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002448 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,661 [Rank 0]: > building shuffle index with split [0, 261) and [261, 261) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,663 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002231 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,663 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,668 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,672 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/zig/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,675 [Rank 0]: loaded indexed file in 0.011 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,675 [Rank 0]: total number of samples: 262 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,675 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,760 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,760 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,760 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: > finished creating indexed dataset in 0.001013 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: number of documents: 42103 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: document indices in [40798, 42061) total of 1263 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,761 [Rank 0]: > Tokens per epoch: 4166294 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,762 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,762 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,764 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002029 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 1263 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 508 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,766 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002079 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,767 [Rank 0]: > building shuffle index with split [0, 508) and [508, 508) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,768 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001777 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,769 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,776 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,776 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/xslt/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_1ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,777 [Rank 0]: loaded indexed file in 0.008 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,777 [Rank 0]: total number of samples: 509 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,777 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,861 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,863 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,863 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,863 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,863 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: > finished creating indexed dataset in 0.002177 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: number of documents: 4751547 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,864 [Rank 0]: document indices in [4604249, 4746795) total of 142546 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,865 [Rank 0]: > Tokens per epoch: 62884447 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,866 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,866 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,872 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.006435 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 142546 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 7676 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,875 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002373 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,875 [Rank 0]: > building shuffle index with split [0, 7676) and [7676, 7676) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,877 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.001934 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,877 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,885 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,886 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/json/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,886 [Rank 0]: loaded indexed file in 0.009 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,886 [Rank 0]: total number of samples: 7677 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,886 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,971 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: > finished creating indexed dataset in 0.002288 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: number of documents: 3995948 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,973 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,974 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:18,974 [Rank 0]: document indices in [3872074, 3991952) total of 119878 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:18,975 [Rank 0]: > Tokens per epoch: 35974762 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,976 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:18,976 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:18,982 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.005884 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 119878 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 4391 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,985 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002445 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,985 [Rank 0]: > building shuffle index with split [0, 4391) and [4391, 4391) ... 
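In every "> dataset split:" block the VALID_all_sources_weighted range is a contiguous slice near the end of the document index; for the 3,995,948-document dataset above, [3872074, 3991952) covers exactly 119,878 documents, i.e. roughly the 96.9%-99.9% portion of the corpus. That is consistent with a train/valid/test split on the order of 969/30/1, although the actual --split argument is not visible in this excerpt. A small sketch of the bookkeeping, with the split fractions treated as an assumption:

# Sketch of how a VALID document range like [3872074, 3991952) can arise.
# The 0.969/0.030/0.001 fractions are an assumption for illustration; the
# real --split setting is not shown in this log excerpt.
num_documents = 3_995_948                # "number of documents: 3995948"
splits = [0.969, 0.030, 0.001]           # assumed train/valid/test fractions

bounds, acc = [0], 0.0
for frac in splits:
    acc += frac
    bounds.append(round(acc * num_documents))

valid_range = (bounds[1], bounds[2])
print(valid_range, valid_range[1] - valid_range[0])
# -> (3872074, 3991952) 119878, matching "document indices in [3872074, 3991952)"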
+[ip-26-0-150-122:0]:2023-06-21 17:28:18,987 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002585 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,988 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,997 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,998 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/code/yaml/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_3ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:18,998 [Rank 0]: loaded indexed file in 0.010 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:18,998 [Rank 0]: total number of samples: 4392 +[ip-26-0-150-122:0]:2023-06-21 17:28:18,998 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,081 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,083 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,083 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,083 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,083 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: > finished creating indexed dataset in 0.002353 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: number of documents: 30982955 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,084 [Rank 0]: document indices in [30022483, 30951972) total of 929489 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:19,090 [Rank 0]: > Tokens per epoch: 538755961 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,091 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,091 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:19,125 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.033432 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 929489 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 65766 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,131 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.006360 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,131 [Rank 0]: > building shuffle index with split [0, 65766) and [65766, 65766) ... 
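The yaml split loaded at the top of this block was requested with only 3 samples (the "3ns" in its index-map filenames), while a single epoch of that split already yields 4391 sequences (its "Tokens per epoch: 35974762" build lines appear just before it), so one pass suffices and separate_last_epoch stays False. A hedged sketch of that decision; the real code applies an extra last-epoch heuristic when more than one epoch would be needed.

# Sketch of the "only one epoch required" decision for the yaml validation
# split (illustrative; Megatron-LM's logic is more involved when several
# epochs are needed).
import math

requested_samples = 3          # the "3ns" in the yaml index-map filenames
tokens_per_epoch = 35_974_762  # "> Tokens per epoch: 35974762" for that split
seq_length = 8_192

samples_per_epoch = tokens_per_epoch // seq_length                     # 4391
num_epochs = max(1, math.ceil(requested_samples / samples_per_epoch))
separate_last_epoch = num_epochs > 1                                    # simplified rule
print(num_epochs, separate_last_epoch)                                  # 1 False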
+[ip-26-0-150-122:0]:2023-06-21 17:28:19,135 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003373 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,159 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_146ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,174 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_146ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,175 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_issues/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_146ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,175 [Rank 0]: loaded indexed file in 0.016 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,176 [Rank 0]: total number of samples: 65767 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,176 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,258 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: > finished creating indexed dataset in 0.002677 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: number of documents: 7634718 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,261 [Rank 0]: document indices in [7398042, 7627083) total of 229041 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:19,263 [Rank 0]: > Tokens per epoch: 483498380 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,265 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,265 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:19,276 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.010390 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 229041 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 59020 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,279 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003644 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,279 [Rank 0]: > building shuffle index with split [0, 59020) and [59020, 59020) ... 
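Each "> building dataset index ..." block above walks the same steps: read the per-document sizes and byte pointers, then memory-map the token file and take a memory view over it so individual documents can be sliced without loading the whole corpus into RAM. The snippet below imitates that access pattern on a tiny fake file; the on-disk layout, dtype, and filename are assumptions for illustration, not the real Megatron-LM indexed-dataset format.

# Toy imitation of the "reading sizes / reading pointers / creating numpy
# buffer of mmap / creating memory view" steps in the log. The file layout
# here (bare uint16 token ids, no .idx header) is an assumption.
import numpy as np

tokens = np.array([10, 11, 12, 13, 14, 20, 21, 22], dtype=np.uint16)
tokens.tofile("toy_content_document.bin")                    # hypothetical file

sizes = np.array([5, 3], dtype=np.int32)                     # tokens per document
pointers = np.concatenate(([0], np.cumsum(sizes[:-1]))) * 2  # byte offsets, 2 bytes/token

buffer = np.memmap("toy_content_document.bin", dtype=np.uint16, mode="r")
view = memoryview(buffer)                                    # "creating memory view of numpy buffer"
doc1 = np.frombuffer(view, dtype=np.uint16, count=int(sizes[1]), offset=int(pointers[1]))
print(doc1)                                                  # [20 21 22]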
+[ip-26-0-150-122:0]:2023-06-21 17:28:19,283 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.003726 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,284 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_86ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,296 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_86ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,297 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/gh_commits/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_86ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,297 [Rank 0]: loaded indexed file in 0.014 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,298 [Rank 0]: total number of samples: 59021 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,298 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,380 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,382 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,382 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,382 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,382 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: > finished creating indexed dataset in 0.002234 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: number of documents: 914510 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: document indices in [886160, 913595) total of 27435 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:19,383 [Rank 0]: > Tokens per epoch: 73709652 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,384 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,384 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:19,388 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.003489 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 27435 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 8997 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,391 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.002764 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,391 [Rank 0]: > building shuffle index with split [0, 8997) and [8997, 8997) ... 
+[ip-26-0-150-122:0]:2023-06-21 17:28:19,393 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002206 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,399 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_20ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,405 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_20ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,405 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_20ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,406 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,406 [Rank 0]: total number of samples: 8998 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,406 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,488 [Rank 0]: > building dataset index ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: reading sizes... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: reading pointers... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: reading document index... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: creating numpy buffer of mmap... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: creating memory view of numpy buffer... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: > finished creating indexed dataset in 0.002633 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: number of documents: 668743 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,491 [Rank 0]: > dataset split: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,492 [Rank 0]: VALID_all_sources_weighted: +[ip-26-0-150-122:0]:2023-06-21 17:28:19,492 [Rank 0]: document indices in [648012, 668074) total of 20062 documents +[ip-26-0-150-122:0]:2023-06-21 17:28:19,492 [Rank 0]: > Tokens per epoch: 56156688 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,493 [Rank 0]: > WARNING: could not find index map files, building the indices on rank 0 ... +[ip-26-0-150-122:0]:2023-06-21 17:28:19,493 [Rank 0]: > only one epoch required, setting separate_last_epoch to False +[ip-26-0-150-122:0]:2023-06-21 17:28:19,495 [Rank 0]: > elasped time to build and save doc-idx mapping (seconds): 0.002499 +[ip-26-0-150-122:0]: using: +[ip-26-0-150-122:0]: number of documents: 20062 +[ip-26-0-150-122:0]: number of epochs: 1 +[ip-26-0-150-122:0]: sequence length: 8192 +[ip-26-0-150-122:0]: total number of samples: 6855 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,499 [Rank 0]: > elasped time to build and save sample-idx mapping (seconds): 0.003009 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,499 [Rank 0]: > building shuffle index with split [0, 6855) and [6855, 6855) ... 
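All of the .npy paths loaded in these blocks follow one naming scheme that is visible in the log itself: <data prefix>_<split name>_indexmap_<requested samples>ns_<sequence length>sl_<seed>s_<doc|sample|shuffle>_idx.npy. The helper below just reassembles that pattern; it is inferred from the printed paths, and its name and signature are made up here rather than taken from Megatron-LM.

# Rebuild the index-map cache filenames seen above (a sketch inferred from
# the printed paths; the helper is hypothetical).
def index_map_paths(data_prefix, split_name, num_samples, seq_length, seed):
    stem = (f"{data_prefix}_{split_name}_indexmap"
            f"_{num_samples}ns_{seq_length}sl_{seed}s")
    return {kind: f"{stem}_{kind}_idx.npy" for kind in ("doc", "sample", "shuffle")}

paths = index_map_paths(
    "/fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_scripts/"
    "gpt2-preprocessed_content_document",
    "VALID_all_sources_weighted",
    num_samples=20, seq_length=8192, seed=1234,
)
print(paths["doc"])   # matches the doc-idx path loaded at the start of this block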
+[ip-26-0-150-122:0]:2023-06-21 17:28:19,501 [Rank 0]: > elasped time to build and save shuffle-idx mapping (seconds): 0.002398 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,508 [Rank 0]: > loading doc-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_17ns_8192sl_1234s_doc_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,513 [Rank 0]: > loading sample-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_17ns_8192sl_1234s_sample_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,513 [Rank 0]: > loading shuffle-idx mapping from /fsx/bigcode/bigcode-training/tokenized_stack_no_pii/jupyter_structured/gpt2-preprocessed_content_document_VALID_all_sources_weighted_indexmap_17ns_8192sl_1234s_shuffle_idx.npy +[ip-26-0-150-122:0]:2023-06-21 17:28:19,514 [Rank 0]: loaded indexed file in 0.006 seconds +[ip-26-0-150-122:0]:2023-06-21 17:28:19,514 [Rank 0]: total number of samples: 6856 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,514 [Rank 0]: total number of epochs: 1 +[ip-26-0-150-122:0]:> building indices for blendable datasets ... +[ip-26-0-150-122:0]: > sample ratios: +[ip-26-0-150-122:0]: dataset 0, input: 0.00391159, achieved: 0.00391165 +[ip-26-0-150-122:0]: dataset 1, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 2, input: 0.0702651, achieved: 0.0702651 +[ip-26-0-150-122:0]: dataset 3, input: 0.00232087, achieved: 0.00232023 +[ip-26-0-150-122:0]: dataset 4, input: 0.00110828, achieved: 0.00110845 +[ip-26-0-150-122:0]: dataset 5, input: 0.00740594, achieved: 0.0074056 +[ip-26-0-150-122:0]: dataset 6, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 7, input: 0.00170806, achieved: 0.00170781 +[ip-26-0-150-122:0]: dataset 8, input: 0.00127778, achieved: 0.00127814 +[ip-26-0-150-122:0]: dataset 9, input: 0.000104309, achieved: 0.000104427 +[ip-26-0-150-122:0]: dataset 10, input: 3.91159e-05, achieved: 3.91601e-05 +[ip-26-0-150-122:0]: dataset 11, input: 0.000117348, achieved: 0.00011748 +[ip-26-0-150-122:0]: dataset 12, input: 0.00146033, achieved: 0.0014598 +[ip-26-0-150-122:0]: dataset 13, input: 0.0310058, achieved: 0.0310061 +[ip-26-0-150-122:0]: dataset 14, input: 0.000912704, achieved: 0.000912647 +[ip-26-0-150-122:0]: dataset 15, input: 0.000795356, achieved: 0.000795167 +[ip-26-0-150-122:0]: dataset 16, input: 0.000339004, achieved: 0.000339387 +[ip-26-0-150-122:0]: dataset 17, input: 0.00219049, achieved: 0.00219079 +[ip-26-0-150-122:0]: dataset 18, input: 0.00290761, achieved: 0.00290763 +[ip-26-0-150-122:0]: dataset 19, input: 0.000391159, achieved: 0.000391601 +[ip-26-0-150-122:0]: dataset 20, input: 0.000404197, achieved: 0.000404654 +[ip-26-0-150-122:0]: dataset 21, input: 0.000586738, achieved: 0.000586313 +[ip-26-0-150-122:0]: dataset 22, input: 0.000156463, achieved: 0.00015664 +[ip-26-0-150-122:0]: dataset 23, input: 0.0088793, achieved: 0.00887954 +[ip-26-0-150-122:0]: dataset 24, input: 0.0118782, achieved: 0.0118786 +[ip-26-0-150-122:0]: dataset 25, input: 7.82317e-05, achieved: 7.83201e-05 +[ip-26-0-150-122:0]: dataset 26, input: 0.0582305, achieved: 0.0582299 +[ip-26-0-150-122:0]: dataset 27, input: 0.00075624, achieved: 0.000756007 +[ip-26-0-150-122:0]: dataset 28, input: 0.00290761, achieved: 0.00290763 +[ip-26-0-150-122:0]: dataset 29, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 30, input: 
0.00162983, achieved: 0.00162949 +[ip-26-0-150-122:0]: dataset 31, input: 0.00134298, achieved: 0.00134341 +[ip-26-0-150-122:0]: dataset 32, input: 0.00170806, achieved: 0.00170781 +[ip-26-0-150-122:0]: dataset 33, input: 0.00374208, achieved: 0.00374196 +[ip-26-0-150-122:0]: dataset 34, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 35, input: 6.51931e-05, achieved: 6.52668e-05 +[ip-26-0-150-122:0]: dataset 36, input: 0.00432882, achieved: 0.00432827 +[ip-26-0-150-122:0]: dataset 37, input: 3.91159e-05, achieved: 3.91601e-05 +[ip-26-0-150-122:0]: dataset 38, input: 0.000247734, achieved: 0.000248014 +[ip-26-0-150-122:0]: dataset 39, input: 0.000508506, achieved: 0.000507993 +[ip-26-0-150-122:0]: dataset 40, input: 0.00678008, achieved: 0.00678013 +[ip-26-0-150-122:0]: dataset 41, input: 2.60772e-05, achieved: 2.61067e-05 +[ip-26-0-150-122:0]: dataset 42, input: 0.00203403, achieved: 0.00203415 +[ip-26-0-150-122:0]: dataset 43, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 44, input: 9.12704e-05, achieved: 9.13735e-05 +[ip-26-0-150-122:0]: dataset 45, input: 0.000534584, achieved: 0.0005341 +[ip-26-0-150-122:0]: dataset 46, input: 0.00477214, achieved: 0.00477209 +[ip-26-0-150-122:0]: dataset 47, input: 0.000730163, achieved: 0.0007299 +[ip-26-0-150-122:0]: dataset 48, input: 3.91159e-05, achieved: 3.91601e-05 +[ip-26-0-150-122:0]: dataset 49, input: 1.30386e-06, achieved: 2.17556e-06 +[ip-26-0-150-122:0]: dataset 50, input: 0.000299888, achieved: 0.000300227 +[ip-26-0-150-122:0]: dataset 51, input: 2.60772e-05, achieved: 2.61067e-05 +[ip-26-0-150-122:0]: dataset 52, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 53, input: 0.00611511, achieved: 0.0061155 +[ip-26-0-150-122:0]: dataset 54, input: 0.000456352, achieved: 0.00045578 +[ip-26-0-150-122:0]: dataset 55, input: 0.000430275, achieved: 0.000430761 +[ip-26-0-150-122:0]: dataset 56, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 57, input: 0.00402893, achieved: 0.00402914 +[ip-26-0-150-122:0]: dataset 58, input: 0.000599777, achieved: 0.000599366 +[ip-26-0-150-122:0]: dataset 59, input: 0.000260772, achieved: 0.000261067 +[ip-26-0-150-122:0]: dataset 60, input: 6.51931e-05, achieved: 6.52668e-05 +[ip-26-0-150-122:0]: dataset 61, input: 5.21545e-05, achieved: 5.22134e-05 +[ip-26-0-150-122:0]: dataset 62, input: 0.0144598, achieved: 0.0144599 +[ip-26-0-150-122:0]: dataset 63, input: 0.000521545, achieved: 0.000521046 +[ip-26-0-150-122:0]: dataset 64, input: 0.000391159, achieved: 0.000391601 +[ip-26-0-150-122:0]: dataset 65, input: 0.000547622, achieved: 0.000547153 +[ip-26-0-150-122:0]: dataset 66, input: 0.0637849, achieved: 0.0637852 +[ip-26-0-150-122:0]: dataset 67, input: 0.000834472, achieved: 0.000834327 +[ip-26-0-150-122:0]: dataset 68, input: 0.00182541, achieved: 0.00182529 +[ip-26-0-150-122:0]: dataset 69, input: 0.000925742, achieved: 0.0009257 +[ip-26-0-150-122:0]: dataset 70, input: 0.00118651, achieved: 0.00118677 +[ip-26-0-150-122:0]: dataset 71, input: 0.0382814, achieved: 0.0382811 +[ip-26-0-150-122:0]: dataset 72, input: 0.113358, achieved: 0.113357 +[ip-26-0-150-122:0]: dataset 73, input: 0.0843729, achieved: 0.0843725 +[ip-26-0-150-122:0]: dataset 74, input: 0.0976984, achieved: 0.0976978 +[ip-26-0-150-122:0]: dataset 75, input: 0.0793922, achieved: 0.0793916 +[ip-26-0-150-122:0]: dataset 76, input: 0.0787533, achieved: 0.0787531 +[ip-26-0-150-122:0]: dataset 77, input: 0.0345784, achieved: 0.0345783 
+[ip-26-0-150-122:0]: dataset 78, input: 1.30386e-06, achieved: 1.08778e-06 +[ip-26-0-150-122:0]: dataset 79, input: 0.00185148, achieved: 0.0018514 +[ip-26-0-150-122:0]: dataset 80, input: 0.00122563, achieved: 0.00122593 +[ip-26-0-150-122:0]: dataset 81, input: 1.30386e-05, achieved: 1.30534e-05 +[ip-26-0-150-122:0]: dataset 82, input: 2.60772e-07, achieved: 1.08778e-06 +[ip-26-0-150-122:0]: dataset 83, input: 0.000143425, achieved: 0.000143587 +[ip-26-0-150-122:0]: dataset 84, input: 0.000234695, achieved: 0.00023496 +[ip-26-0-150-122:0]: dataset 85, input: 6.51931e-05, achieved: 6.52668e-05 +[ip-26-0-150-122:0]: dataset 86, input: 0.00130386, achieved: 0.00130425 +[ip-26-0-150-122:0]: dataset 87, input: 0.00130386, achieved: 0.00130425 +[ip-26-0-150-122:0]: dataset 88, input: 0.0709301, achieved: 0.0709297 +[ip-26-0-150-122:0]: dataset 89, input: 0.0417236, achieved: 0.041724 +[ip-26-0-150-122:0]: dataset 90, input: 0.0092835, achieved: 0.00928311 +[ip-26-0-150-122:0]: dataset 91, input: 0.00782317, achieved: 0.00782331 +[ip-26-0-150-122:0]:2023-06-21 17:28:19,705 [Rank 0]: > elapsed time for building blendable dataset indices: 0.11 (sec) +[ip-26-0-150-122:0]:2023-06-21 17:28:19,705 [Rank 0]: > finished creating GPT datasets ... +[ip-26-0-155-69:7]:2023-06-21 17:28:20,378 [Rank 63]: time (ms) | model-and-optimizer-setup: 691.12 | train/valid/test-data-iterators-setup: 57399.51 +[ip-26-0-150-122:0]:2023-06-21 17:28:20,375 [Rank 0]: [after dataloaders are built] datetime: 2023-06-21 17:28:20 +[ip-26-0-150-122:0]:2023-06-21 17:28:20,375 [Rank 0]: done with setup ... +[ip-26-0-150-122:0]:2023-06-21 17:28:20,375 [Rank 0]: training ... +[ip-26-0-155-69:7]:2023-06-21 17:28:22,858 [Rank 63]: wandb: Currently logged in as: loubnabnl. Use `wandb login --relogin` to force relogin +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,107 [Rank 63]: wandb: wandb version 0.15.4 is available! To upgrade, please run: +[ip-26-0-155-69:7]:wandb: $ pip install wandb --upgrade +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,107 [Rank 63]: wandb: Tracking run with wandb version 0.13.10 +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,107 [Rank 63]: wandb: Run data is saved locally in /fsx/loubna/code/Megatron-LM/wandb/run-20230621_172822-yyzr4vv2 +[ip-26-0-155-69:7]:wandb: Run `wandb offline` to turn off syncing. 
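The "sample ratios" listing above gives, for each of the 92 component datasets, the requested weight (input) and the fraction actually obtained (achieved) once every sample of the blended validation set is assigned to exactly one component; the two differ only in the last digits because per-dataset sample counts must be integers. Below is a simplified Python sketch of that greedy weight-matching idea. In Megatron-LM the heavy lifting is done by a compiled helper, and the example weights here are made up.

# Simplified sketch of blending num_samples over weighted datasets so that the
# achieved fractions track the requested weights as closely as integer counts
# allow. Megatron-LM does this in a compiled helper; the weights are made up.
def blend(weights, num_samples):
    counts = [0] * len(weights)
    dataset_index = []
    for i in range(num_samples):
        # Pick the dataset currently furthest behind its target share.
        errors = [w * (i + 1) - c for w, c in zip(weights, counts)]
        d = max(range(len(weights)), key=lambda k: errors[k])
        counts[d] += 1
        dataset_index.append(d)
    return dataset_index, counts

weights = [0.70, 0.25, 0.05]            # assumed example weights (sum to 1)
_, counts = blend(weights, 1000)
print([c / 1000 for c in counts])       # achieved ratios, close to the inputs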
+[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,113 [Rank 63]: wandb: Syncing run 1b-starcoder +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,113 [Rank 63]: wandb: View project at https://wandb.ai/loubnabnl/1b-model +[ip-26-0-155-69:7]: +[ip-26-0-155-69:7]:2023-06-21 17:28:30,113 [Rank 63]: wandb: View run at https://wandb.ai/loubnabnl/1b-model/runs/yyzr4vv2 +[ip-26-0-155-69:7]: +[ip-26-0-150-122:0]:2023-06-21 17:28:30,119 [Rank 0]: [before the start of training step] datetime: 2023-06-21 17:28:30 +[ip-26-0-155-69:7]:2023-06-21 17:28:42,341 [Rank 63]: iteration 10/ 150000 | consumed samples: 640 | elapsed time per iteration (ms): 1222.0 | learning rate: 1.500E-06 | global batch size: 64 | lm loss: 1.096193E+01 | loss scale: 1.0 | grad norm: 24.321 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 77.43 | tokens-per-second-per-gpu: 6703.52 | +[ip-26-0-155-69:7]:2023-06-21 17:28:42,342 [Rank 63]: time (ms) | forward-compute: 451.99 | backward-compute: 458.74 | backward-params-all-reduce: 231.46 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 231.59 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 43.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.71 | optimizer-copy-main-to-model-params: 8.35 | optimizer: 75.57 | batch-generator: 5.70 +[ip-26-0-150-122:0]:2023-06-21 17:28:42,339 [Rank 0]: [Rank 0] (after 10 iterations) memory (MB) | allocated: 19521.45947265625 | max allocated: 35040.9794921875 | reserved: 36068.0 | max reserved: 36068.0 +[ip-26-0-155-69:7]:2023-06-21 17:28:51,279 [Rank 63]: iteration 20/ 150000 | consumed samples: 1280 | elapsed time per iteration (ms): 893.8 | learning rate: 3.000E-06 | global batch size: 64 | lm loss: 9.533918E+00 | loss scale: 1.0 | grad norm: 10.686 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.86 | tokens-per-second-per-gpu: 9164.90 | +[ip-26-0-155-69:7]:2023-06-21 17:28:51,280 [Rank 63]: time (ms) | forward-compute: 223.56 | backward-compute: 398.31 | backward-params-all-reduce: 225.90 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:29:00,210 [Rank 63]: iteration 30/ 150000 | consumed samples: 1920 | elapsed time per iteration (ms): 893.0 | learning rate: 4.500E-06 | global batch size: 64 | lm loss: 8.796992E+00 | loss scale: 1.0 | grad norm: 4.760 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.13 | +[ip-26-0-155-69:7]:2023-06-21 17:29:00,210 [Rank 63]: time (ms) | forward-compute: 223.11 | backward-compute: 398.41 | backward-params-all-reduce: 225.46 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.57 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.94 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:29:09,139 [Rank 63]: iteration 40/ 150000 | consumed samples: 2560 | elapsed time per 
iteration (ms): 892.9 | learning rate: 6.000E-06 | global batch size: 64 | lm loss: 8.444675E+00 | loss scale: 1.0 | grad norm: 3.994 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.64 | +[ip-26-0-155-69:7]:2023-06-21 17:29:09,140 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.37 | backward-params-all-reduce: 225.39 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.51 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.91 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:29:18,068 [Rank 63]: iteration 50/ 150000 | consumed samples: 3200 | elapsed time per iteration (ms): 893.0 | learning rate: 7.500E-06 | global batch size: 64 | lm loss: 8.253671E+00 | loss scale: 1.0 | grad norm: 4.000 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.97 | +[ip-26-0-155-69:7]:2023-06-21 17:29:18,069 [Rank 63]: time (ms) | forward-compute: 223.13 | backward-compute: 398.25 | backward-params-all-reduce: 225.57 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.67 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:29:27,000 [Rank 63]: iteration 60/ 150000 | consumed samples: 3840 | elapsed time per iteration (ms): 893.1 | learning rate: 9.000E-06 | global batch size: 64 | lm loss: 7.951717E+00 | loss scale: 1.0 | grad norm: 4.310 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.19 | +[ip-26-0-155-69:7]:2023-06-21 17:29:27,000 [Rank 63]: time (ms) | forward-compute: 223.03 | backward-compute: 398.36 | backward-params-all-reduce: 225.73 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.83 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.93 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:29:35,936 [Rank 63]: iteration 70/ 150000 | consumed samples: 4480 | elapsed time per iteration (ms): 893.6 | learning rate: 1.050E-05 | global batch size: 64 | lm loss: 7.758693E+00 | loss scale: 1.0 | grad norm: 5.879 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.89 | tokens-per-second-per-gpu: 9167.51 | +[ip-26-0-155-69:7]:2023-06-21 17:29:35,936 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.32 | backward-params-all-reduce: 225.93 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.04 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.91 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:29:44,869 [Rank 63]: iteration 80/ 150000 | consumed 
samples: 5120 | elapsed time per iteration (ms): 893.3 | learning rate: 1.200E-05 | global batch size: 64 | lm loss: 7.419704E+00 | loss scale: 1.0 | grad norm: 6.722 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.41 | +[ip-26-0-155-69:7]:2023-06-21 17:29:44,870 [Rank 63]: time (ms) | forward-compute: 223.34 | backward-compute: 398.20 | backward-params-all-reduce: 225.77 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:29:53,797 [Rank 63]: iteration 90/ 150000 | consumed samples: 5760 | elapsed time per iteration (ms): 892.8 | learning rate: 1.350E-05 | global batch size: 64 | lm loss: 7.135265E+00 | loss scale: 1.0 | grad norm: 5.285 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.39 | +[ip-26-0-155-69:7]:2023-06-21 17:29:53,798 [Rank 63]: time (ms) | forward-compute: 222.86 | backward-compute: 398.29 | backward-params-all-reduce: 225.67 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.78 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.89 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:30:02,728 [Rank 63]: iteration 100/ 150000 | consumed samples: 6400 | elapsed time per iteration (ms): 893.1 | learning rate: 1.500E-05 | global batch size: 64 | lm loss: 7.016300E+00 | loss scale: 1.0 | grad norm: 4.335 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.48 | +[ip-26-0-155-69:7]:2023-06-21 17:30:02,729 [Rank 63]: time (ms) | forward-compute: 223.11 | backward-compute: 398.33 | backward-params-all-reduce: 225.51 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.61 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.58 | optimizer-clip-main-grad: 10.96 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 42.01 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:30:11,659 [Rank 63]: iteration 110/ 150000 | consumed samples: 7040 | elapsed time per iteration (ms): 893.0 | learning rate: 1.650E-05 | global batch size: 64 | lm loss: 6.814932E+00 | loss scale: 1.0 | grad norm: 3.932 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.07 | +[ip-26-0-155-69:7]:2023-06-21 17:30:11,659 [Rank 63]: time (ms) | forward-compute: 223.32 | backward-compute: 398.36 | backward-params-all-reduce: 225.42 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.51 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:30:20,587 [Rank 
63]: iteration 120/ 150000 | consumed samples: 7680 | elapsed time per iteration (ms): 892.8 | learning rate: 1.800E-05 | global batch size: 64 | lm loss: 6.757275E+00 | loss scale: 1.0 | grad norm: 3.359 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9175.17 | +[ip-26-0-155-69:7]:2023-06-21 17:30:20,588 [Rank 63]: time (ms) | forward-compute: 222.69 | backward-compute: 398.32 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.85 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.58 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.95 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:30:29,514 [Rank 63]: iteration 130/ 150000 | consumed samples: 8320 | elapsed time per iteration (ms): 892.6 | learning rate: 1.950E-05 | global batch size: 64 | lm loss: 6.519125E+00 | loss scale: 1.0 | grad norm: 3.028 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.01 | tokens-per-second-per-gpu: 9177.21 | +[ip-26-0-155-69:7]:2023-06-21 17:30:29,514 [Rank 63]: time (ms) | forward-compute: 222.84 | backward-compute: 398.26 | backward-params-all-reduce: 225.66 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.76 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.81 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:30:38,456 [Rank 63]: iteration 140/ 150000 | consumed samples: 8960 | elapsed time per iteration (ms): 894.2 | learning rate: 2.100E-05 | global batch size: 64 | lm loss: 6.416656E+00 | loss scale: 1.0 | grad norm: 3.510 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.82 | tokens-per-second-per-gpu: 9161.28 | +[ip-26-0-155-69:7]:2023-06-21 17:30:38,456 [Rank 63]: time (ms) | forward-compute: 224.22 | backward-compute: 398.30 | backward-params-all-reduce: 225.72 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.82 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.85 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:30:47,384 [Rank 63]: iteration 150/ 150000 | consumed samples: 9600 | elapsed time per iteration (ms): 892.8 | learning rate: 2.250E-05 | global batch size: 64 | lm loss: 6.377288E+00 | loss scale: 1.0 | grad norm: 3.265 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9175.13 | +[ip-26-0-155-69:7]:2023-06-21 17:30:47,385 [Rank 63]: time (ms) | forward-compute: 223.14 | backward-compute: 398.10 | backward-params-all-reduce: 225.73 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.84 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.82 | batch-generator: 1.73 
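The per-iteration lines above can be cross-checked with simple arithmetic: a global batch of 64 sequences of 8192 tokens, about 893 ms per iteration, and 64 GPUs (ranks 0 through 63 appear in this log) give the reported ~9170 tokens-per-second-per-gpu, and the learning rate climbs by 1.5e-06 every 10 iterations, i.e. a linear warmup of 1.5e-07 per step (the configured warmup length and peak LR are not shown in this excerpt). A sketch of both checks:

# Sanity checks against the iteration lines above (illustrative arithmetic only).
global_batch_size = 64        # "global batch size: 64"
seq_length = 8_192            # sequence length used throughout this run
iter_time_s = 0.8928          # "elapsed time per iteration (ms): 892.8" at iteration 150
num_gpus = 64                 # ranks 0..63 appear in the log

tokens_per_iter = global_batch_size * seq_length              # 524288
tokens_per_s_per_gpu = tokens_per_iter / iter_time_s / num_gpus
print(round(tokens_per_s_per_gpu))      # ~9176, vs. the reported 9175.13

# Linear warmup: the printed LR rises by 1.5e-06 every 10 iterations.
lr_per_step = 1.5e-07
print(f"{lr_per_step * 150:.3e}")       # 2.250e-05, matching iteration 150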
+[ip-26-0-155-69:7]:2023-06-21 17:30:56,316 [Rank 63]: iteration 160/ 150000 | consumed samples: 10240 | elapsed time per iteration (ms): 893.2 | learning rate: 2.400E-05 | global batch size: 64 | lm loss: 6.216093E+00 | loss scale: 1.0 | grad norm: 3.617 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.63 | +[ip-26-0-155-69:7]:2023-06-21 17:30:56,316 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.22 | backward-params-all-reduce: 225.42 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.52 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.84 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:31:05,247 [Rank 63]: iteration 170/ 150000 | consumed samples: 10880 | elapsed time per iteration (ms): 893.1 | learning rate: 2.550E-05 | global batch size: 64 | lm loss: 6.279401E+00 | loss scale: 1.0 | grad norm: 3.731 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.16 | +[ip-26-0-155-69:7]:2023-06-21 17:31:05,248 [Rank 63]: time (ms) | forward-compute: 223.37 | backward-compute: 398.20 | backward-params-all-reduce: 225.58 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.68 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.91 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:31:14,182 [Rank 63]: iteration 180/ 150000 | consumed samples: 11520 | elapsed time per iteration (ms): 893.5 | learning rate: 2.700E-05 | global batch size: 64 | lm loss: 6.152369E+00 | loss scale: 1.0 | grad norm: 3.729 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9168.89 | +[ip-26-0-155-69:7]:2023-06-21 17:31:14,183 [Rank 63]: time (ms) | forward-compute: 223.66 | backward-compute: 398.17 | backward-params-all-reduce: 225.65 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.77 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.89 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:31:23,115 [Rank 63]: iteration 190/ 150000 | consumed samples: 12160 | elapsed time per iteration (ms): 893.4 | learning rate: 2.850E-05 | global batch size: 64 | lm loss: 6.109496E+00 | loss scale: 1.0 | grad norm: 3.128 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.91 | +[ip-26-0-155-69:7]:2023-06-21 17:31:23,116 [Rank 63]: time (ms) | forward-compute: 223.58 | backward-compute: 398.16 | backward-params-all-reduce: 225.68 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.77 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | 
optimizer: 41.87 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:31:32,048 [Rank 63]: iteration 200/ 150000 | consumed samples: 12800 | elapsed time per iteration (ms): 893.3 | learning rate: 3.000E-05 | global batch size: 64 | lm loss: 6.104686E+00 | loss scale: 1.0 | grad norm: 4.132 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.54 | +[ip-26-0-155-69:7]:2023-06-21 17:31:32,049 [Rank 63]: time (ms) | forward-compute: 223.54 | backward-compute: 398.16 | backward-params-all-reduce: 225.67 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.78 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.86 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:31:40,980 [Rank 63]: iteration 210/ 150000 | consumed samples: 13440 | elapsed time per iteration (ms): 893.2 | learning rate: 3.150E-05 | global batch size: 64 | lm loss: 5.995741E+00 | loss scale: 1.0 | grad norm: 3.759 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.78 | +[ip-26-0-155-69:7]:2023-06-21 17:31:40,981 [Rank 63]: time (ms) | forward-compute: 223.24 | backward-compute: 398.19 | backward-params-all-reduce: 225.70 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.81 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.96 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:31:49,920 [Rank 63]: iteration 220/ 150000 | consumed samples: 14080 | elapsed time per iteration (ms): 894.0 | learning rate: 3.300E-05 | global batch size: 64 | lm loss: 6.099563E+00 | loss scale: 1.0 | grad norm: 3.449 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.84 | tokens-per-second-per-gpu: 9163.10 | +[ip-26-0-155-69:7]:2023-06-21 17:31:49,921 [Rank 63]: time (ms) | forward-compute: 223.51 | backward-compute: 398.19 | backward-params-all-reduce: 226.34 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.45 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.91 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:31:58,857 [Rank 63]: iteration 230/ 150000 | consumed samples: 14720 | elapsed time per iteration (ms): 893.7 | learning rate: 3.450E-05 | global batch size: 64 | lm loss: 5.972797E+00 | loss scale: 1.0 | grad norm: 2.340 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.35 | +[ip-26-0-155-69:7]:2023-06-21 17:31:58,858 [Rank 63]: time (ms) | forward-compute: 223.74 | backward-compute: 398.25 | backward-params-all-reduce: 225.76 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.88 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | 
optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.88 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:32:07,795 [Rank 63]: iteration 240/ 150000 | consumed samples: 15360 | elapsed time per iteration (ms): 893.7 | learning rate: 3.600E-05 | global batch size: 64 | lm loss: 5.918838E+00 | loss scale: 1.0 | grad norm: 3.002 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.09 | +[ip-26-0-155-69:7]:2023-06-21 17:32:07,796 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.17 | backward-params-all-reduce: 225.87 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.99 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.87 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:32:16,738 [Rank 63]: iteration 250/ 150000 | consumed samples: 16000 | elapsed time per iteration (ms): 894.3 | learning rate: 3.750E-05 | global batch size: 64 | lm loss: 5.861612E+00 | loss scale: 1.0 | grad norm: 2.419 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.81 | tokens-per-second-per-gpu: 9159.76 | +[ip-26-0-155-69:7]:2023-06-21 17:32:16,739 [Rank 63]: time (ms) | forward-compute: 224.00 | backward-compute: 398.17 | backward-params-all-reduce: 226.14 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.90 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:32:25,676 [Rank 63]: iteration 260/ 150000 | consumed samples: 16640 | elapsed time per iteration (ms): 893.7 | learning rate: 3.900E-05 | global batch size: 64 | lm loss: 5.845439E+00 | loss scale: 1.0 | grad norm: 2.481 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.01 | +[ip-26-0-155-69:7]:2023-06-21 17:32:25,676 [Rank 63]: time (ms) | forward-compute: 223.57 | backward-compute: 398.23 | backward-params-all-reduce: 226.00 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.10 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.83 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.84 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:32:34,607 [Rank 63]: iteration 270/ 150000 | consumed samples: 17280 | elapsed time per iteration (ms): 893.2 | learning rate: 4.050E-05 | global batch size: 64 | lm loss: 5.770058E+00 | loss scale: 1.0 | grad norm: 1.942 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.01 | +[ip-26-0-155-69:7]:2023-06-21 17:32:34,608 [Rank 63]: time (ms) | forward-compute: 223.55 | backward-compute: 398.10 | backward-params-all-reduce: 225.55 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.65 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.89 | 
optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:32:43,541 [Rank 63]: iteration 280/ 150000 | consumed samples: 17920 | elapsed time per iteration (ms): 893.4 | learning rate: 4.200E-05 | global batch size: 64 | lm loss: 5.792897E+00 | loss scale: 1.0 | grad norm: 2.421 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.78 | +[ip-26-0-155-69:7]:2023-06-21 17:32:43,541 [Rank 63]: time (ms) | forward-compute: 223.18 | backward-compute: 398.16 | backward-params-all-reduce: 226.06 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.17 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:32:52,473 [Rank 63]: iteration 290/ 150000 | consumed samples: 18560 | elapsed time per iteration (ms): 893.2 | learning rate: 4.350E-05 | global batch size: 64 | lm loss: 5.725514E+00 | loss scale: 1.0 | grad norm: 2.173 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.27 | +[ip-26-0-155-69:7]:2023-06-21 17:32:52,474 [Rank 63]: time (ms) | forward-compute: 223.29 | backward-compute: 398.16 | backward-params-all-reduce: 225.78 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.93 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:33:01,403 [Rank 63]: iteration 300/ 150000 | consumed samples: 19200 | elapsed time per iteration (ms): 893.0 | learning rate: 4.500E-05 | global batch size: 64 | lm loss: 5.613900E+00 | loss scale: 1.0 | grad norm: 3.062 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.31 | +[ip-26-0-155-69:7]:2023-06-21 17:33:01,404 [Rank 63]: time (ms) | forward-compute: 223.35 | backward-compute: 398.07 | backward-params-all-reduce: 225.67 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.79 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.85 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.86 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:33:10,337 [Rank 63]: iteration 310/ 150000 | consumed samples: 19840 | elapsed time per iteration (ms): 893.4 | learning rate: 4.650E-05 | global batch size: 64 | lm loss: 5.624342E+00 | loss scale: 1.0 | grad norm: 2.426 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.85 | +[ip-26-0-155-69:7]:2023-06-21 17:33:10,338 [Rank 63]: time (ms) | forward-compute: 223.49 | backward-compute: 398.17 | backward-params-all-reduce: 225.69 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.82 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 
0.54 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.90 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:33:19,368 [Rank 63]: iteration 320/ 150000 | consumed samples: 20480 | elapsed time per iteration (ms): 903.1 | learning rate: 4.800E-05 | global batch size: 64 | lm loss: 5.548281E+00 | loss scale: 1.0 | grad norm: 2.668 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.78 | tokens-per-second-per-gpu: 9071.09 | +[ip-26-0-155-69:7]:2023-06-21 17:33:19,368 [Rank 63]: time (ms) | forward-compute: 233.18 | backward-compute: 398.24 | backward-params-all-reduce: 225.56 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.67 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.88 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:33:28,306 [Rank 63]: iteration 330/ 150000 | consumed samples: 21120 | elapsed time per iteration (ms): 893.8 | learning rate: 4.950E-05 | global batch size: 64 | lm loss: 5.607609E+00 | loss scale: 1.0 | grad norm: 2.161 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.87 | tokens-per-second-per-gpu: 9165.07 | +[ip-26-0-155-69:7]:2023-06-21 17:33:28,307 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.14 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.13 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.88 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:33:37,243 [Rank 63]: iteration 340/ 150000 | consumed samples: 21760 | elapsed time per iteration (ms): 893.7 | learning rate: 5.100E-05 | global batch size: 64 | lm loss: 5.562651E+00 | loss scale: 1.0 | grad norm: 2.971 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.56 | +[ip-26-0-155-69:7]:2023-06-21 17:33:37,244 [Rank 63]: time (ms) | forward-compute: 223.80 | backward-compute: 398.15 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.84 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:33:46,183 [Rank 63]: iteration 350/ 150000 | consumed samples: 22400 | elapsed time per iteration (ms): 893.9 | learning rate: 5.250E-05 | global batch size: 64 | lm loss: 5.522157E+00 | loss scale: 1.0 | grad norm: 2.511 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.85 | tokens-per-second-per-gpu: 9163.88 | +[ip-26-0-155-69:7]:2023-06-21 17:33:46,183 [Rank 63]: time (ms) | forward-compute: 223.56 | backward-compute: 398.24 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | 
backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.90 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:33:55,116 [Rank 63]: iteration 360/ 150000 | consumed samples: 23040 | elapsed time per iteration (ms): 893.3 | learning rate: 5.400E-05 | global batch size: 64 | lm loss: 5.491142E+00 | loss scale: 1.0 | grad norm: 2.720 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.28 | +[ip-26-0-155-69:7]:2023-06-21 17:33:55,116 [Rank 63]: time (ms) | forward-compute: 223.39 | backward-compute: 398.24 | backward-params-all-reduce: 225.69 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.79 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.94 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:34:04,050 [Rank 63]: iteration 370/ 150000 | consumed samples: 23680 | elapsed time per iteration (ms): 893.4 | learning rate: 5.550E-05 | global batch size: 64 | lm loss: 5.483192E+00 | loss scale: 1.0 | grad norm: 2.772 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9169.19 | +[ip-26-0-155-69:7]:2023-06-21 17:34:04,051 [Rank 63]: time (ms) | forward-compute: 223.31 | backward-compute: 398.21 | backward-params-all-reduce: 225.87 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.98 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.93 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:34:12,982 [Rank 63]: iteration 380/ 150000 | consumed samples: 24320 | elapsed time per iteration (ms): 893.2 | learning rate: 5.700E-05 | global batch size: 64 | lm loss: 5.409019E+00 | loss scale: 1.0 | grad norm: 2.076 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9171.99 | +[ip-26-0-155-69:7]:2023-06-21 17:34:12,982 [Rank 63]: time (ms) | forward-compute: 223.24 | backward-compute: 398.16 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:34:21,911 [Rank 63]: iteration 390/ 150000 | consumed samples: 24960 | elapsed time per iteration (ms): 892.9 | learning rate: 5.850E-05 | global batch size: 64 | lm loss: 5.394781E+00 | loss scale: 1.0 | grad norm: 2.562 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.66 | +[ip-26-0-155-69:7]:2023-06-21 17:34:21,911 [Rank 63]: time (ms) | forward-compute: 223.04 | backward-compute: 398.13 | backward-params-all-reduce: 225.65 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 
0.03 | backward-reduce-model-grads: 225.76 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.95 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:34:30,843 [Rank 63]: iteration 400/ 150000 | consumed samples: 25600 | elapsed time per iteration (ms): 893.2 | learning rate: 6.000E-05 | global batch size: 64 | lm loss: 5.375826E+00 | loss scale: 1.0 | grad norm: 2.044 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.41 | +[ip-26-0-155-69:7]:2023-06-21 17:34:30,843 [Rank 63]: time (ms) | forward-compute: 223.65 | backward-compute: 398.08 | backward-params-all-reduce: 225.57 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.67 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.86 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:34:39,772 [Rank 63]: iteration 410/ 150000 | consumed samples: 26240 | elapsed time per iteration (ms): 893.0 | learning rate: 6.150E-05 | global batch size: 64 | lm loss: 5.348161E+00 | loss scale: 1.0 | grad norm: 2.454 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.80 | +[ip-26-0-155-69:7]:2023-06-21 17:34:39,773 [Rank 63]: time (ms) | forward-compute: 223.43 | backward-compute: 398.14 | backward-params-all-reduce: 225.46 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.56 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.60 | optimizer-clip-main-grad: 10.82 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.88 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:34:48,707 [Rank 63]: iteration 420/ 150000 | consumed samples: 26880 | elapsed time per iteration (ms): 893.5 | learning rate: 6.300E-05 | global batch size: 64 | lm loss: 5.274976E+00 | loss scale: 1.0 | grad norm: 2.302 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9168.69 | +[ip-26-0-155-69:7]:2023-06-21 17:34:48,708 [Rank 63]: time (ms) | forward-compute: 223.82 | backward-compute: 398.13 | backward-params-all-reduce: 225.51 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.61 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.94 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:34:57,639 [Rank 63]: iteration 430/ 150000 | consumed samples: 27520 | elapsed time per iteration (ms): 893.2 | learning rate: 6.450E-05 | global batch size: 64 | lm loss: 5.287198E+00 | loss scale: 1.0 | grad norm: 2.816 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.52 | +[ip-26-0-155-69:7]:2023-06-21 17:34:57,640 [Rank 63]: time (ms) | forward-compute: 223.67 | backward-compute: 398.00 | backward-params-all-reduce: 225.56 | 
backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.67 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.90 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:35:06,572 [Rank 63]: iteration 440/ 150000 | consumed samples: 28160 | elapsed time per iteration (ms): 893.3 | learning rate: 6.600E-05 | global batch size: 64 | lm loss: 5.272359E+00 | loss scale: 1.0 | grad norm: 2.385 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.71 | +[ip-26-0-155-69:7]:2023-06-21 17:35:06,572 [Rank 63]: time (ms) | forward-compute: 223.48 | backward-compute: 398.14 | backward-params-all-reduce: 225.62 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.73 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.94 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:35:15,506 [Rank 63]: iteration 450/ 150000 | consumed samples: 28800 | elapsed time per iteration (ms): 893.4 | learning rate: 6.750E-05 | global batch size: 64 | lm loss: 5.157737E+00 | loss scale: 1.0 | grad norm: 1.884 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.48 | +[ip-26-0-155-69:7]:2023-06-21 17:35:15,507 [Rank 63]: time (ms) | forward-compute: 223.19 | backward-compute: 398.11 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.94 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:35:24,447 [Rank 63]: iteration 460/ 150000 | consumed samples: 29440 | elapsed time per iteration (ms): 894.1 | learning rate: 6.900E-05 | global batch size: 64 | lm loss: 5.164676E+00 | loss scale: 1.0 | grad norm: 2.064 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.84 | tokens-per-second-per-gpu: 9162.49 | +[ip-26-0-155-69:7]:2023-06-21 17:35:24,447 [Rank 63]: time (ms) | forward-compute: 222.98 | backward-compute: 398.16 | backward-params-all-reduce: 226.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.87 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.97 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 42.02 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:35:33,377 [Rank 63]: iteration 470/ 150000 | consumed samples: 30080 | elapsed time per iteration (ms): 893.0 | learning rate: 7.050E-05 | global batch size: 64 | lm loss: 5.136440E+00 | loss scale: 1.0 | grad norm: 2.273 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.14 | +[ip-26-0-155-69:7]:2023-06-21 17:35:33,378 [Rank 63]: time (ms) | forward-compute: 223.44 | backward-compute: 
398.14 | backward-params-all-reduce: 225.53 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.64 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:35:42,308 [Rank 63]: iteration 480/ 150000 | consumed samples: 30720 | elapsed time per iteration (ms): 893.1 | learning rate: 7.200E-05 | global batch size: 64 | lm loss: 5.159489E+00 | loss scale: 1.0 | grad norm: 1.732 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.50 | +[ip-26-0-155-69:7]:2023-06-21 17:35:42,309 [Rank 63]: time (ms) | forward-compute: 223.25 | backward-compute: 398.14 | backward-params-all-reduce: 225.71 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.82 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.91 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:35:51,239 [Rank 63]: iteration 490/ 150000 | consumed samples: 31360 | elapsed time per iteration (ms): 893.1 | learning rate: 7.350E-05 | global batch size: 64 | lm loss: 5.114197E+00 | loss scale: 1.0 | grad norm: 1.781 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9172.75 | +[ip-26-0-155-69:7]:2023-06-21 17:35:51,240 [Rank 63]: time (ms) | forward-compute: 222.66 | backward-compute: 398.17 | backward-params-all-reduce: 226.20 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.31 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.93 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:36:00,171 [Rank 63]: iteration 500/ 150000 | consumed samples: 32000 | elapsed time per iteration (ms): 893.2 | learning rate: 7.500E-05 | global batch size: 64 | lm loss: 5.069198E+00 | loss scale: 1.0 | grad norm: 1.696 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.36 | +[ip-26-0-155-69:7]:2023-06-21 17:36:00,172 [Rank 63]: time (ms) | forward-compute: 222.62 | backward-compute: 398.19 | backward-params-all-reduce: 226.32 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.43 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.97 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.98 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:36:09,107 [Rank 63]: iteration 510/ 150000 | consumed samples: 32640 | elapsed time per iteration (ms): 893.6 | learning rate: 7.650E-05 | global batch size: 64 | lm loss: 5.068162E+00 | loss scale: 1.0 | grad norm: 2.051 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.90 | tokens-per-second-per-gpu: 9167.91 | +[ip-26-0-155-69:7]:2023-06-21 17:36:09,108 [Rank 63]: time (ms) | 
forward-compute: 222.88 | backward-compute: 398.32 | backward-params-all-reduce: 226.19 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.32 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.07 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.97 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 17:36:18,040 [Rank 63]: iteration 520/ 150000 | consumed samples: 33280 | elapsed time per iteration (ms): 893.3 | learning rate: 7.800E-05 | global batch size: 64 | lm loss: 5.031533E+00 | loss scale: 1.0 | grad norm: 1.916 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9170.03 | +[ip-26-0-155-69:7]:2023-06-21 17:36:18,041 [Rank 63]: time (ms) | forward-compute: 223.18 | backward-compute: 398.20 | backward-params-all-reduce: 225.92 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.04 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:36:26,973 [Rank 63]: iteration 530/ 150000 | consumed samples: 33920 | elapsed time per iteration (ms): 893.3 | learning rate: 7.950E-05 | global batch size: 64 | lm loss: 5.029686E+00 | loss scale: 1.0 | grad norm: 1.798 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.73 | +[ip-26-0-155-69:7]:2023-06-21 17:36:26,973 [Rank 63]: time (ms) | forward-compute: 222.69 | backward-compute: 398.25 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.38 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.95 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.96 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:36:35,906 [Rank 63]: iteration 540/ 150000 | consumed samples: 34560 | elapsed time per iteration (ms): 893.3 | learning rate: 8.100E-05 | global batch size: 64 | lm loss: 5.031442E+00 | loss scale: 1.0 | grad norm: 1.951 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.17 | +[ip-26-0-155-69:7]:2023-06-21 17:36:35,907 [Rank 63]: time (ms) | forward-compute: 222.88 | backward-compute: 398.24 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.29 | backward-gather-model-params: 0.02 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.92 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:36:44,841 [Rank 63]: iteration 550/ 150000 | consumed samples: 35200 | elapsed time per iteration (ms): 893.5 | learning rate: 8.250E-05 | global batch size: 64 | lm loss: 5.012273E+00 | loss scale: 1.0 | grad norm: 1.726 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9168.83 | 
+[ip-26-0-155-69:7]:2023-06-21 17:36:44,841 [Rank 63]: time (ms) | forward-compute: 223.23 | backward-compute: 398.20 | backward-params-all-reduce: 226.03 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.92 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:36:53,775 [Rank 63]: iteration 560/ 150000 | consumed samples: 35840 | elapsed time per iteration (ms): 893.4 | learning rate: 8.400E-05 | global batch size: 64 | lm loss: 4.852672E+00 | loss scale: 1.0 | grad norm: 1.536 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.49 | +[ip-26-0-155-69:7]:2023-06-21 17:36:53,775 [Rank 63]: time (ms) | forward-compute: 223.19 | backward-compute: 398.17 | backward-params-all-reduce: 226.08 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.18 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.88 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:37:02,768 [Rank 63]: iteration 570/ 150000 | consumed samples: 36480 | elapsed time per iteration (ms): 899.3 | learning rate: 8.550E-05 | global batch size: 64 | lm loss: 4.964608E+00 | loss scale: 1.0 | grad norm: 1.570 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.23 | tokens-per-second-per-gpu: 9109.74 | +[ip-26-0-155-69:7]:2023-06-21 17:37:02,768 [Rank 63]: time (ms) | forward-compute: 227.82 | backward-compute: 398.21 | backward-params-all-reduce: 227.14 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.24 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 11.00 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 42.02 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:37:11,738 [Rank 63]: iteration 580/ 150000 | consumed samples: 37120 | elapsed time per iteration (ms): 897.1 | learning rate: 8.700E-05 | global batch size: 64 | lm loss: 4.988046E+00 | loss scale: 1.0 | grad norm: 1.668 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.48 | tokens-per-second-per-gpu: 9132.00 | +[ip-26-0-155-69:7]:2023-06-21 17:37:11,739 [Rank 63]: time (ms) | forward-compute: 225.71 | backward-compute: 398.15 | backward-params-all-reduce: 226.45 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.55 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 11.68 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 42.69 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 17:37:20,711 [Rank 63]: iteration 590/ 150000 | consumed samples: 37760 | elapsed time per iteration (ms): 897.3 | learning rate: 8.850E-05 | global batch size: 64 | lm loss: 4.848716E+00 | loss scale: 1.0 | grad norm: 1.516 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 
105.45 | tokens-per-second-per-gpu: 9129.36 | +[ip-26-0-155-69:7]:2023-06-21 17:37:20,712 [Rank 63]: time (ms) | forward-compute: 225.90 | backward-compute: 398.07 | backward-params-all-reduce: 227.47 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.56 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.81 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:37:29,703 [Rank 63]: iteration 600/ 150000 | consumed samples: 38400 | elapsed time per iteration (ms): 899.2 | learning rate: 9.000E-05 | global batch size: 64 | lm loss: 4.889231E+00 | loss scale: 1.0 | grad norm: 1.931 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.24 | tokens-per-second-per-gpu: 9110.76 | +[ip-26-0-155-69:7]:2023-06-21 17:37:29,704 [Rank 63]: time (ms) | forward-compute: 227.13 | backward-compute: 398.11 | backward-params-all-reduce: 227.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.43 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 11.58 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 42.55 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:37:38,696 [Rank 63]: iteration 610/ 150000 | consumed samples: 39040 | elapsed time per iteration (ms): 899.3 | learning rate: 9.150E-05 | global batch size: 64 | lm loss: 4.808221E+00 | loss scale: 1.0 | grad norm: 1.401 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.23 | tokens-per-second-per-gpu: 9109.76 | +[ip-26-0-155-69:7]:2023-06-21 17:37:38,696 [Rank 63]: time (ms) | forward-compute: 228.96 | backward-compute: 398.22 | backward-params-all-reduce: 226.10 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.20 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.90 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:37:47,636 [Rank 63]: iteration 620/ 150000 | consumed samples: 39680 | elapsed time per iteration (ms): 894.1 | learning rate: 9.300E-05 | global batch size: 64 | lm loss: 4.808089E+00 | loss scale: 1.0 | grad norm: 1.950 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.84 | tokens-per-second-per-gpu: 9162.50 | +[ip-26-0-155-69:7]:2023-06-21 17:37:47,637 [Rank 63]: time (ms) | forward-compute: 223.79 | backward-compute: 398.16 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.84 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:37:56,582 [Rank 63]: iteration 630/ 150000 | consumed samples: 40320 | elapsed time per iteration (ms): 894.5 | learning rate: 9.450E-05 | global batch size: 64 | lm loss: 4.800224E+00 | loss scale: 1.0 | grad norm: 1.574 | number of skipped iterations: 
0 | number of nan iterations: 0 | TFLOPs: 105.79 | tokens-per-second-per-gpu: 9158.00 | +[ip-26-0-155-69:7]:2023-06-21 17:37:56,582 [Rank 63]: time (ms) | forward-compute: 223.59 | backward-compute: 398.13 | backward-params-all-reduce: 226.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.97 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:38:05,535 [Rank 63]: iteration 640/ 150000 | consumed samples: 40960 | elapsed time per iteration (ms): 895.4 | learning rate: 9.600E-05 | global batch size: 64 | lm loss: 4.780347E+00 | loss scale: 1.0 | grad norm: 1.543 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.68 | tokens-per-second-per-gpu: 9149.31 | +[ip-26-0-155-69:7]:2023-06-21 17:38:05,536 [Rank 63]: time (ms) | forward-compute: 224.03 | backward-compute: 398.16 | backward-params-all-reduce: 227.10 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 227.21 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.95 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:38:14,479 [Rank 63]: iteration 650/ 150000 | consumed samples: 41600 | elapsed time per iteration (ms): 894.4 | learning rate: 9.750E-05 | global batch size: 64 | lm loss: 4.692219E+00 | loss scale: 1.0 | grad norm: 1.669 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.80 | tokens-per-second-per-gpu: 9159.09 | +[ip-26-0-155-69:7]:2023-06-21 17:38:14,480 [Rank 63]: time (ms) | forward-compute: 224.39 | backward-compute: 398.19 | backward-params-all-reduce: 225.91 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.84 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.80 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:38:23,427 [Rank 63]: iteration 660/ 150000 | consumed samples: 42240 | elapsed time per iteration (ms): 894.7 | learning rate: 9.900E-05 | global batch size: 64 | lm loss: 4.747536E+00 | loss scale: 1.0 | grad norm: 1.602 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.76 | tokens-per-second-per-gpu: 9155.98 | +[ip-26-0-155-69:7]:2023-06-21 17:38:23,427 [Rank 63]: time (ms) | forward-compute: 224.55 | backward-compute: 398.17 | backward-params-all-reduce: 225.81 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.91 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 11.04 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.09 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 42.10 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:38:32,487 [Rank 63]: iteration 670/ 150000 | consumed samples: 42880 | elapsed time per iteration (ms): 906.0 | learning rate: 1.005E-04 | global batch size: 64 | lm loss: 4.675434E+00 | loss scale: 1.0 | grad 
norm: 1.829 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.44 | tokens-per-second-per-gpu: 9041.71 | +[ip-26-0-155-69:7]:2023-06-21 17:38:32,487 [Rank 63]: time (ms) | forward-compute: 235.41 | backward-compute: 398.13 | backward-params-all-reduce: 225.79 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.90 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:38:42,562 [Rank 63]: iteration 680/ 150000 | consumed samples: 43520 | elapsed time per iteration (ms): 1007.5 | learning rate: 1.020E-04 | global batch size: 64 | lm loss: 4.675757E+00 | loss scale: 1.0 | grad norm: 1.328 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 93.92 | tokens-per-second-per-gpu: 8130.96 | +[ip-26-0-155-69:7]:2023-06-21 17:38:42,562 [Rank 63]: time (ms) | forward-compute: 287.37 | backward-compute: 399.98 | backward-params-all-reduce: 261.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 261.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 1.71 | optimizer-clip-main-grad: 20.31 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.26 | optimizer-copy-main-to-model-params: 8.58 | optimizer: 52.96 | batch-generator: 5.67 +[ip-26-0-155-69:7]:2023-06-21 17:38:51,816 [Rank 63]: iteration 690/ 150000 | consumed samples: 44160 | elapsed time per iteration (ms): 925.4 | learning rate: 1.035E-04 | global batch size: 64 | lm loss: 4.607193E+00 | loss scale: 1.0 | grad norm: 1.564 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 102.26 | tokens-per-second-per-gpu: 8852.63 | +[ip-26-0-155-69:7]:2023-06-21 17:38:51,816 [Rank 63]: time (ms) | forward-compute: 231.94 | backward-compute: 400.73 | backward-params-all-reduce: 245.32 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 245.42 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.63 | optimizer-clip-main-grad: 11.44 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.64 | optimizer-copy-main-to-model-params: 8.35 | optimizer: 43.16 | batch-generator: 3.11 +[ip-26-0-155-69:7]:2023-06-21 17:39:00,757 [Rank 63]: iteration 700/ 150000 | consumed samples: 44800 | elapsed time per iteration (ms): 894.1 | learning rate: 1.050E-04 | global batch size: 64 | lm loss: 4.614832E+00 | loss scale: 1.0 | grad norm: 1.521 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.83 | tokens-per-second-per-gpu: 9162.08 | +[ip-26-0-155-69:7]:2023-06-21 17:39:00,757 [Rank 63]: time (ms) | forward-compute: 224.12 | backward-compute: 398.13 | backward-params-all-reduce: 225.98 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.08 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.89 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:39:09,698 [Rank 63]: iteration 710/ 150000 | consumed samples: 45440 | elapsed time per iteration (ms): 894.1 | learning rate: 1.065E-04 | global batch size: 64 | lm 
loss: 4.601290E+00 | loss scale: 1.0 | grad norm: 1.367 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.83 | tokens-per-second-per-gpu: 9162.04 | +[ip-26-0-155-69:7]:2023-06-21 17:39:09,699 [Rank 63]: time (ms) | forward-compute: 223.75 | backward-compute: 398.14 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.89 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:39:18,635 [Rank 63]: iteration 720/ 150000 | consumed samples: 46080 | elapsed time per iteration (ms): 893.7 | learning rate: 1.080E-04 | global batch size: 64 | lm loss: 4.536628E+00 | loss scale: 1.0 | grad norm: 1.323 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.55 | +[ip-26-0-155-69:7]:2023-06-21 17:39:18,636 [Rank 63]: time (ms) | forward-compute: 223.77 | backward-compute: 398.11 | backward-params-all-reduce: 225.92 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.82 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:39:27,573 [Rank 63]: iteration 730/ 150000 | consumed samples: 46720 | elapsed time per iteration (ms): 893.8 | learning rate: 1.095E-04 | global batch size: 64 | lm loss: 4.509668E+00 | loss scale: 1.0 | grad norm: 1.453 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.87 | tokens-per-second-per-gpu: 9164.97 | +[ip-26-0-155-69:7]:2023-06-21 17:39:27,574 [Rank 63]: time (ms) | forward-compute: 223.89 | backward-compute: 398.10 | backward-params-all-reduce: 226.01 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.11 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.83 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:39:36,511 [Rank 63]: iteration 740/ 150000 | consumed samples: 47360 | elapsed time per iteration (ms): 893.7 | learning rate: 1.110E-04 | global batch size: 64 | lm loss: 4.574774E+00 | loss scale: 1.0 | grad norm: 1.336 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9165.89 | +[ip-26-0-155-69:7]:2023-06-21 17:39:36,511 [Rank 63]: time (ms) | forward-compute: 223.87 | backward-compute: 398.07 | backward-params-all-reduce: 225.98 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.08 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.30 | optimizer: 41.81 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:39:45,451 [Rank 63]: iteration 750/ 150000 | consumed samples: 48000 | elapsed time per iteration (ms): 894.0 | learning 
rate: 1.125E-04 | global batch size: 64 | lm loss: 4.512045E+00 | loss scale: 1.0 | grad norm: 1.408 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.85 | tokens-per-second-per-gpu: 9163.20 | +[ip-26-0-155-69:7]:2023-06-21 17:39:45,451 [Rank 63]: time (ms) | forward-compute: 224.03 | backward-compute: 398.07 | backward-params-all-reduce: 225.84 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.95 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.95 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.98 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:39:54,388 [Rank 63]: iteration 760/ 150000 | consumed samples: 48640 | elapsed time per iteration (ms): 893.7 | learning rate: 1.140E-04 | global batch size: 64 | lm loss: 4.472682E+00 | loss scale: 1.0 | grad norm: 1.327 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.21 | +[ip-26-0-155-69:7]:2023-06-21 17:39:54,389 [Rank 63]: time (ms) | forward-compute: 223.61 | backward-compute: 398.09 | backward-params-all-reduce: 226.06 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.17 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.87 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:40:03,328 [Rank 63]: iteration 770/ 150000 | consumed samples: 49280 | elapsed time per iteration (ms): 894.0 | learning rate: 1.155E-04 | global batch size: 64 | lm loss: 4.452821E+00 | loss scale: 1.0 | grad norm: 1.213 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.84 | tokens-per-second-per-gpu: 9163.00 | +[ip-26-0-155-69:7]:2023-06-21 17:40:03,329 [Rank 63]: time (ms) | forward-compute: 223.95 | backward-compute: 398.19 | backward-params-all-reduce: 225.91 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.02 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.89 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:40:12,260 [Rank 63]: iteration 780/ 150000 | consumed samples: 49920 | elapsed time per iteration (ms): 893.1 | learning rate: 1.170E-04 | global batch size: 64 | lm loss: 4.386324E+00 | loss scale: 1.0 | grad norm: 1.504 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.49 | +[ip-26-0-155-69:7]:2023-06-21 17:40:12,260 [Rank 63]: time (ms) | forward-compute: 222.84 | backward-compute: 398.06 | backward-params-all-reduce: 226.17 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.29 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.59 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.96 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:40:21,190 [Rank 63]: iteration 790/ 150000 | consumed samples: 50560 | elapsed 
time per iteration (ms): 893.1 | learning rate: 1.185E-04 | global batch size: 64 | lm loss: 4.306153E+00 | loss scale: 1.0 | grad norm: 1.187 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9172.82 | +[ip-26-0-155-69:7]:2023-06-21 17:40:21,191 [Rank 63]: time (ms) | forward-compute: 223.26 | backward-compute: 398.02 | backward-params-all-reduce: 225.89 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.82 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:40:30,120 [Rank 63]: iteration 800/ 150000 | consumed samples: 51200 | elapsed time per iteration (ms): 893.0 | learning rate: 1.200E-04 | global batch size: 64 | lm loss: 4.318950E+00 | loss scale: 1.0 | grad norm: 1.484 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.80 | +[ip-26-0-155-69:7]:2023-06-21 17:40:30,121 [Rank 63]: time (ms) | forward-compute: 223.13 | backward-compute: 398.05 | backward-params-all-reduce: 225.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.99 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.82 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:40:39,050 [Rank 63]: iteration 810/ 150000 | consumed samples: 51840 | elapsed time per iteration (ms): 893.0 | learning rate: 1.215E-04 | global batch size: 64 | lm loss: 4.220854E+00 | loss scale: 1.0 | grad norm: 1.429 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.29 | +[ip-26-0-155-69:7]:2023-06-21 17:40:39,051 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.10 | backward-params-all-reduce: 225.96 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.07 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.81 | batch-generator: 1.85 +[ip-26-0-155-69:7]:2023-06-21 17:40:47,984 [Rank 63]: iteration 820/ 150000 | consumed samples: 52480 | elapsed time per iteration (ms): 893.3 | learning rate: 1.230E-04 | global batch size: 64 | lm loss: 4.197039E+00 | loss scale: 1.0 | grad norm: 1.459 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9170.08 | +[ip-26-0-155-69:7]:2023-06-21 17:40:47,984 [Rank 63]: time (ms) | forward-compute: 223.04 | backward-compute: 398.09 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.07 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.86 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:40:56,913 [Rank 63]: iteration 830/ 
150000 | consumed samples: 53120 | elapsed time per iteration (ms): 893.0 | learning rate: 1.245E-04 | global batch size: 64 | lm loss: 4.195742E+00 | loss scale: 1.0 | grad norm: 1.413 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.90 | +[ip-26-0-155-69:7]:2023-06-21 17:40:56,914 [Rank 63]: time (ms) | forward-compute: 223.13 | backward-compute: 398.11 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.04 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.77 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:41:05,842 [Rank 63]: iteration 840/ 150000 | consumed samples: 53760 | elapsed time per iteration (ms): 892.9 | learning rate: 1.260E-04 | global batch size: 64 | lm loss: 4.195538E+00 | loss scale: 1.0 | grad norm: 1.445 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.70 | +[ip-26-0-155-69:7]:2023-06-21 17:41:05,843 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.10 | backward-params-all-reduce: 226.01 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.11 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.74 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:41:14,772 [Rank 63]: iteration 850/ 150000 | consumed samples: 54400 | elapsed time per iteration (ms): 893.0 | learning rate: 1.275E-04 | global batch size: 64 | lm loss: 4.161403E+00 | loss scale: 1.0 | grad norm: 1.486 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.72 | +[ip-26-0-155-69:7]:2023-06-21 17:41:14,773 [Rank 63]: time (ms) | forward-compute: 222.86 | backward-compute: 398.12 | backward-params-all-reduce: 226.07 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.18 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.06 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:41:23,710 [Rank 63]: iteration 860/ 150000 | consumed samples: 55040 | elapsed time per iteration (ms): 893.7 | learning rate: 1.290E-04 | global batch size: 64 | lm loss: 4.088557E+00 | loss scale: 1.0 | grad norm: 1.173 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9165.91 | +[ip-26-0-155-69:7]:2023-06-21 17:41:23,710 [Rank 63]: time (ms) | forward-compute: 223.19 | backward-compute: 398.17 | backward-params-all-reduce: 226.41 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.54 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.35 | optimizer: 41.84 | batch-generator: 1.83 
+[ip-26-0-155-69:7]:2023-06-21 17:41:32,642 [Rank 63]: iteration 870/ 150000 | consumed samples: 55680 | elapsed time per iteration (ms): 893.2 | learning rate: 1.305E-04 | global batch size: 64 | lm loss: 4.082836E+00 | loss scale: 1.0 | grad norm: 1.657 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.60 | +[ip-26-0-155-69:7]:2023-06-21 17:41:32,642 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.07 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.28 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.77 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:41:41,577 [Rank 63]: iteration 880/ 150000 | consumed samples: 56320 | elapsed time per iteration (ms): 893.5 | learning rate: 1.320E-04 | global batch size: 64 | lm loss: 4.020747E+00 | loss scale: 1.0 | grad norm: 1.410 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.90 | tokens-per-second-per-gpu: 9168.23 | +[ip-26-0-155-69:7]:2023-06-21 17:41:41,577 [Rank 63]: time (ms) | forward-compute: 223.35 | backward-compute: 398.08 | backward-params-all-reduce: 226.25 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.36 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.78 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:41:50,509 [Rank 63]: iteration 890/ 150000 | consumed samples: 56960 | elapsed time per iteration (ms): 893.2 | learning rate: 1.335E-04 | global batch size: 64 | lm loss: 3.955831E+00 | loss scale: 1.0 | grad norm: 1.574 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.55 | +[ip-26-0-155-69:7]:2023-06-21 17:41:50,509 [Rank 63]: time (ms) | forward-compute: 222.86 | backward-compute: 398.11 | backward-params-all-reduce: 226.29 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.40 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.81 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:41:59,437 [Rank 63]: iteration 900/ 150000 | consumed samples: 57600 | elapsed time per iteration (ms): 892.8 | learning rate: 1.350E-04 | global batch size: 64 | lm loss: 3.915794E+00 | loss scale: 1.0 | grad norm: 1.664 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.65 | +[ip-26-0-155-69:7]:2023-06-21 17:41:59,437 [Rank 63]: time (ms) | forward-compute: 222.47 | backward-compute: 398.08 | backward-params-all-reduce: 226.34 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.45 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | 
optimizer: 41.84 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:42:08,368 [Rank 63]: iteration 910/ 150000 | consumed samples: 58240 | elapsed time per iteration (ms): 893.2 | learning rate: 1.365E-04 | global batch size: 64 | lm loss: 3.910498E+00 | loss scale: 1.0 | grad norm: 1.533 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.01 | +[ip-26-0-155-69:7]:2023-06-21 17:42:08,369 [Rank 63]: time (ms) | forward-compute: 222.92 | backward-compute: 398.08 | backward-params-all-reduce: 226.31 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.42 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.79 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.78 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:42:17,297 [Rank 63]: iteration 920/ 150000 | consumed samples: 58880 | elapsed time per iteration (ms): 892.9 | learning rate: 1.380E-04 | global batch size: 64 | lm loss: 3.750729E+00 | loss scale: 1.0 | grad norm: 1.919 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.53 | +[ip-26-0-155-69:7]:2023-06-21 17:42:17,298 [Rank 63]: time (ms) | forward-compute: 222.75 | backward-compute: 398.16 | backward-params-all-reduce: 226.17 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.28 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.78 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:42:26,228 [Rank 63]: iteration 930/ 150000 | consumed samples: 59520 | elapsed time per iteration (ms): 893.1 | learning rate: 1.395E-04 | global batch size: 64 | lm loss: 3.721997E+00 | loss scale: 1.0 | grad norm: 2.101 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9172.86 | +[ip-26-0-155-69:7]:2023-06-21 17:42:26,229 [Rank 63]: time (ms) | forward-compute: 222.94 | backward-compute: 398.10 | backward-params-all-reduce: 226.09 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.21 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.83 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:42:35,161 [Rank 63]: iteration 940/ 150000 | consumed samples: 60160 | elapsed time per iteration (ms): 893.2 | learning rate: 1.410E-04 | global batch size: 64 | lm loss: 3.698772E+00 | loss scale: 1.0 | grad norm: 2.170 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.16 | +[ip-26-0-155-69:7]:2023-06-21 17:42:35,161 [Rank 63]: time (ms) | forward-compute: 223.03 | backward-compute: 398.12 | backward-params-all-reduce: 226.31 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.41 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | 
optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.74 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:42:44,093 [Rank 63]: iteration 950/ 150000 | consumed samples: 60800 | elapsed time per iteration (ms): 893.3 | learning rate: 1.425E-04 | global batch size: 64 | lm loss: 3.612666E+00 | loss scale: 1.0 | grad norm: 2.250 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.57 | +[ip-26-0-155-69:7]:2023-06-21 17:42:44,094 [Rank 63]: time (ms) | forward-compute: 223.30 | backward-compute: 398.12 | backward-params-all-reduce: 226.13 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.72 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:42:53,022 [Rank 63]: iteration 960/ 150000 | consumed samples: 61440 | elapsed time per iteration (ms): 892.9 | learning rate: 1.440E-04 | global batch size: 64 | lm loss: 3.540173E+00 | loss scale: 1.0 | grad norm: 1.799 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9175.07 | +[ip-26-0-155-69:7]:2023-06-21 17:42:53,023 [Rank 63]: time (ms) | forward-compute: 223.02 | backward-compute: 398.11 | backward-params-all-reduce: 225.90 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.00 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 10.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.81 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:43:01,950 [Rank 63]: iteration 970/ 150000 | consumed samples: 62080 | elapsed time per iteration (ms): 892.8 | learning rate: 1.455E-04 | global batch size: 64 | lm loss: 3.456714E+00 | loss scale: 1.0 | grad norm: 2.277 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.49 | +[ip-26-0-155-69:7]:2023-06-21 17:43:01,951 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.09 | backward-params-all-reduce: 225.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.99 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.75 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:43:10,886 [Rank 63]: iteration 980/ 150000 | consumed samples: 62720 | elapsed time per iteration (ms): 893.6 | learning rate: 1.470E-04 | global batch size: 64 | lm loss: 3.340820E+00 | loss scale: 1.0 | grad norm: 2.357 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.89 | tokens-per-second-per-gpu: 9167.39 | +[ip-26-0-155-69:7]:2023-06-21 17:43:10,887 [Rank 63]: time (ms) | forward-compute: 223.15 | backward-compute: 398.13 | backward-params-all-reduce: 226.50 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.60 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | 
optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.76 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:43:19,814 [Rank 63]: iteration 990/ 150000 | consumed samples: 63360 | elapsed time per iteration (ms): 892.8 | learning rate: 1.485E-04 | global batch size: 64 | lm loss: 3.300872E+00 | loss scale: 1.0 | grad norm: 2.031 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.77 | +[ip-26-0-155-69:7]:2023-06-21 17:43:19,815 [Rank 63]: time (ms) | forward-compute: 223.08 | backward-compute: 398.14 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.85 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.78 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.77 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:43:28,744 [Rank 63]: iteration 1000/ 150000 | consumed samples: 64000 | elapsed time per iteration (ms): 893.0 | learning rate: 1.500E-04 | global batch size: 64 | lm loss: 3.208639E+00 | loss scale: 1.0 | grad norm: 2.356 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.69 | +[ip-26-0-155-69:7]:2023-06-21 17:43:28,745 [Rank 63]: time (ms) | forward-compute: 222.89 | backward-compute: 398.17 | backward-params-all-reduce: 225.98 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.10 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 10.80 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.81 | batch-generator: 1.82 +[ip-26-0-155-69:7]:2023-06-21 17:43:37,675 [Rank 63]: iteration 1010/ 150000 | consumed samples: 64640 | elapsed time per iteration (ms): 893.1 | learning rate: 1.515E-04 | global batch size: 64 | lm loss: 3.153380E+00 | loss scale: 1.0 | grad norm: 2.461 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.12 | +[ip-26-0-155-69:7]:2023-06-21 17:43:37,676 [Rank 63]: time (ms) | forward-compute: 223.40 | backward-compute: 398.16 | backward-params-all-reduce: 225.78 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.88 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.75 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:43:46,608 [Rank 63]: iteration 1020/ 150000 | consumed samples: 65280 | elapsed time per iteration (ms): 893.3 | learning rate: 1.530E-04 | global batch size: 64 | lm loss: 3.091166E+00 | loss scale: 1.0 | grad norm: 2.092 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.69 | +[ip-26-0-155-69:7]:2023-06-21 17:43:46,609 [Rank 63]: time (ms) | forward-compute: 223.38 | backward-compute: 398.19 | backward-params-all-reduce: 225.96 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.06 | backward-gather-model-params: 0.01 | 
optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.77 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.74 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:43:55,540 [Rank 63]: iteration 1030/ 150000 | consumed samples: 65920 | elapsed time per iteration (ms): 893.1 | learning rate: 1.545E-04 | global batch size: 64 | lm loss: 3.045628E+00 | loss scale: 1.0 | grad norm: 1.924 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.13 | +[ip-26-0-155-69:7]:2023-06-21 17:43:55,540 [Rank 63]: time (ms) | forward-compute: 223.22 | backward-compute: 398.19 | backward-params-all-reduce: 225.82 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.86 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.87 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:44:04,470 [Rank 63]: iteration 1040/ 150000 | consumed samples: 66560 | elapsed time per iteration (ms): 893.0 | learning rate: 1.560E-04 | global batch size: 64 | lm loss: 2.996407E+00 | loss scale: 1.0 | grad norm: 2.341 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.22 | +[ip-26-0-155-69:7]:2023-06-21 17:44:04,470 [Rank 63]: time (ms) | forward-compute: 222.96 | backward-compute: 398.11 | backward-params-all-reduce: 226.06 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.16 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.84 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:44:13,403 [Rank 63]: iteration 1050/ 150000 | consumed samples: 67200 | elapsed time per iteration (ms): 893.3 | learning rate: 1.575E-04 | global batch size: 64 | lm loss: 2.971989E+00 | loss scale: 1.0 | grad norm: 1.647 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.44 | +[ip-26-0-155-69:7]:2023-06-21 17:44:13,404 [Rank 63]: time (ms) | forward-compute: 223.33 | backward-compute: 398.12 | backward-params-all-reduce: 225.96 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.07 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.86 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:44:22,335 [Rank 63]: iteration 1060/ 150000 | consumed samples: 67840 | elapsed time per iteration (ms): 893.2 | learning rate: 1.590E-04 | global batch size: 64 | lm loss: 2.915565E+00 | loss scale: 1.0 | grad norm: 1.364 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.74 | +[ip-26-0-155-69:7]:2023-06-21 17:44:22,335 [Rank 63]: time (ms) | forward-compute: 223.23 | backward-compute: 398.12 | backward-params-all-reduce: 225.89 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 
226.00 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:44:31,267 [Rank 63]: iteration 1070/ 150000 | consumed samples: 68480 | elapsed time per iteration (ms): 893.2 | learning rate: 1.605E-04 | global batch size: 64 | lm loss: 2.847284E+00 | loss scale: 1.0 | grad norm: 1.993 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.37 | +[ip-26-0-155-69:7]:2023-06-21 17:44:31,268 [Rank 63]: time (ms) | forward-compute: 222.99 | backward-compute: 398.21 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.06 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.92 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:44:40,199 [Rank 63]: iteration 1080/ 150000 | consumed samples: 69120 | elapsed time per iteration (ms): 893.2 | learning rate: 1.620E-04 | global batch size: 64 | lm loss: 2.839899E+00 | loss scale: 1.0 | grad norm: 1.423 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.61 | +[ip-26-0-155-69:7]:2023-06-21 17:44:40,199 [Rank 63]: time (ms) | forward-compute: 223.16 | backward-compute: 398.13 | backward-params-all-reduce: 225.91 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.02 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.90 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:44:49,127 [Rank 63]: iteration 1090/ 150000 | consumed samples: 69760 | elapsed time per iteration (ms): 892.8 | learning rate: 1.635E-04 | global batch size: 64 | lm loss: 2.849504E+00 | loss scale: 1.0 | grad norm: 1.503 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.53 | +[ip-26-0-155-69:7]:2023-06-21 17:44:49,127 [Rank 63]: time (ms) | forward-compute: 222.69 | backward-compute: 398.16 | backward-params-all-reduce: 226.01 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.12 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.88 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:44:58,056 [Rank 63]: iteration 1100/ 150000 | consumed samples: 70400 | elapsed time per iteration (ms): 892.9 | learning rate: 1.650E-04 | global batch size: 64 | lm loss: 2.744584E+00 | loss scale: 1.0 | grad norm: 2.211 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.57 | +[ip-26-0-155-69:7]:2023-06-21 17:44:58,056 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.13 | backward-params-all-reduce: 225.80 | backward-layernorm-all-reduce: 0.02 | 
backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.90 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:45:06,989 [Rank 63]: iteration 1110/ 150000 | consumed samples: 71040 | elapsed time per iteration (ms): 893.3 | learning rate: 1.665E-04 | global batch size: 64 | lm loss: 2.695924E+00 | loss scale: 1.0 | grad norm: 2.254 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.87 | +[ip-26-0-155-69:7]:2023-06-21 17:45:06,989 [Rank 63]: time (ms) | forward-compute: 222.99 | backward-compute: 398.16 | backward-params-all-reduce: 226.14 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.90 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:45:15,926 [Rank 63]: iteration 1120/ 150000 | consumed samples: 71680 | elapsed time per iteration (ms): 893.7 | learning rate: 1.680E-04 | global batch size: 64 | lm loss: 2.687495E+00 | loss scale: 1.0 | grad norm: 1.497 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.88 | tokens-per-second-per-gpu: 9166.41 | +[ip-26-0-155-69:7]:2023-06-21 17:45:15,926 [Rank 63]: time (ms) | forward-compute: 223.50 | backward-compute: 398.08 | backward-params-all-reduce: 226.30 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.39 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.83 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:45:24,859 [Rank 63]: iteration 1130/ 150000 | consumed samples: 72320 | elapsed time per iteration (ms): 893.3 | learning rate: 1.695E-04 | global batch size: 64 | lm loss: 2.661385E+00 | loss scale: 1.0 | grad norm: 1.295 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9170.08 | +[ip-26-0-155-69:7]:2023-06-21 17:45:24,860 [Rank 63]: time (ms) | forward-compute: 223.32 | backward-compute: 398.16 | backward-params-all-reduce: 225.83 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.97 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.97 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:45:33,789 [Rank 63]: iteration 1140/ 150000 | consumed samples: 72960 | elapsed time per iteration (ms): 893.0 | learning rate: 1.710E-04 | global batch size: 64 | lm loss: 2.660387E+00 | loss scale: 1.0 | grad norm: 1.376 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.97 | tokens-per-second-per-gpu: 9173.97 | +[ip-26-0-155-69:7]:2023-06-21 17:45:33,789 [Rank 63]: time (ms) | forward-compute: 223.03 | backward-compute: 398.13 | backward-params-all-reduce: 
225.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.98 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.88 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:45:42,717 [Rank 63]: iteration 1150/ 150000 | consumed samples: 73600 | elapsed time per iteration (ms): 892.8 | learning rate: 1.725E-04 | global batch size: 64 | lm loss: 2.605412E+00 | loss scale: 1.0 | grad norm: 1.620 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.99 | tokens-per-second-per-gpu: 9175.41 | +[ip-26-0-155-69:7]:2023-06-21 17:45:42,718 [Rank 63]: time (ms) | forward-compute: 223.02 | backward-compute: 398.14 | backward-params-all-reduce: 225.72 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 225.84 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.87 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:45:51,652 [Rank 63]: iteration 1160/ 150000 | consumed samples: 74240 | elapsed time per iteration (ms): 893.5 | learning rate: 1.740E-04 | global batch size: 64 | lm loss: 2.514670E+00 | loss scale: 1.0 | grad norm: 1.174 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.91 | tokens-per-second-per-gpu: 9168.58 | +[ip-26-0-155-69:7]:2023-06-21 17:45:51,652 [Rank 63]: time (ms) | forward-compute: 223.31 | backward-compute: 398.13 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.24 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.83 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:46:00,581 [Rank 63]: iteration 1170/ 150000 | consumed samples: 74880 | elapsed time per iteration (ms): 892.9 | learning rate: 1.755E-04 | global batch size: 64 | lm loss: 2.608316E+00 | loss scale: 1.0 | grad norm: 1.315 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.82 | +[ip-26-0-155-69:7]:2023-06-21 17:46:00,581 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.17 | backward-params-all-reduce: 225.60 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.69 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 10.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 41.80 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:46:09,511 [Rank 63]: iteration 1180/ 150000 | consumed samples: 75520 | elapsed time per iteration (ms): 893.0 | learning rate: 1.770E-04 | global batch size: 64 | lm loss: 2.505637E+00 | loss scale: 1.0 | grad norm: 1.357 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.48 | +[ip-26-0-155-69:7]:2023-06-21 17:46:09,511 [Rank 63]: time (ms) | forward-compute: 223.09 | 
backward-compute: 398.21 | backward-params-all-reduce: 225.58 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.69 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.94 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:46:18,434 [Rank 63]: iteration 1190/ 150000 | consumed samples: 76160 | elapsed time per iteration (ms): 892.4 | learning rate: 1.785E-04 | global batch size: 64 | lm loss: 2.492915E+00 | loss scale: 1.0 | grad norm: 1.201 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.04 | tokens-per-second-per-gpu: 9180.16 | +[ip-26-0-155-69:7]:2023-06-21 17:46:18,435 [Rank 63]: time (ms) | forward-compute: 222.93 | backward-compute: 398.13 | backward-params-all-reduce: 225.89 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.00 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.30 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 41.33 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:46:27,363 [Rank 63]: iteration 1200/ 150000 | consumed samples: 76800 | elapsed time per iteration (ms): 892.9 | learning rate: 1.800E-04 | global batch size: 64 | lm loss: 2.512564E+00 | loss scale: 1.0 | grad norm: 1.095 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.77 | +[ip-26-0-155-69:7]:2023-06-21 17:46:27,364 [Rank 63]: time (ms) | forward-compute: 222.97 | backward-compute: 398.21 | backward-params-all-reduce: 225.73 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.84 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.88 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:46:36,293 [Rank 63]: iteration 1210/ 150000 | consumed samples: 77440 | elapsed time per iteration (ms): 893.0 | learning rate: 1.815E-04 | global batch size: 64 | lm loss: 2.474167E+00 | loss scale: 1.0 | grad norm: 1.300 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.96 | tokens-per-second-per-gpu: 9173.25 | +[ip-26-0-155-69:7]:2023-06-21 17:46:36,294 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.15 | backward-params-all-reduce: 225.78 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.91 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:46:45,226 [Rank 63]: iteration 1220/ 150000 | consumed samples: 78080 | elapsed time per iteration (ms): 893.3 | learning rate: 1.830E-04 | global batch size: 64 | lm loss: 2.465782E+00 | loss scale: 1.0 | grad norm: 1.207 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.93 | tokens-per-second-per-gpu: 9170.77 | +[ip-26-0-155-69:7]:2023-06-21 17:46:45,227 
[Rank 63]: time (ms) | forward-compute: 222.95 | backward-compute: 398.13 | backward-params-all-reduce: 226.02 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.36 | optimizer: 41.97 | batch-generator: 1.84 +[ip-26-0-155-69:7]:2023-06-21 17:46:54,158 [Rank 63]: iteration 1230/ 150000 | consumed samples: 78720 | elapsed time per iteration (ms): 893.2 | learning rate: 1.845E-04 | global batch size: 64 | lm loss: 2.344203E+00 | loss scale: 1.0 | grad norm: 1.232 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.94 | tokens-per-second-per-gpu: 9171.55 | +[ip-26-0-155-69:7]:2023-06-21 17:46:54,159 [Rank 63]: time (ms) | forward-compute: 223.12 | backward-compute: 398.05 | backward-params-all-reduce: 226.02 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 41.87 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:47:03,102 [Rank 63]: iteration 1240/ 150000 | consumed samples: 79360 | elapsed time per iteration (ms): 894.3 | learning rate: 1.860E-04 | global batch size: 64 | lm loss: 2.391261E+00 | loss scale: 1.0 | grad norm: 1.046 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.81 | tokens-per-second-per-gpu: 9159.81 | +[ip-26-0-155-69:7]:2023-06-21 17:47:03,102 [Rank 63]: time (ms) | forward-compute: 224.88 | backward-compute: 398.05 | backward-params-all-reduce: 226.06 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.17 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 10.30 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.29 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:47:12,218 [Rank 63]: iteration 1250/ 150000 | consumed samples: 80000 | elapsed time per iteration (ms): 911.7 | learning rate: 1.875E-04 | global batch size: 64 | lm loss: 2.366144E+00 | loss scale: 1.0 | grad norm: 1.226 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 103.80 | tokens-per-second-per-gpu: 8985.86 | +[ip-26-0-155-69:7]:2023-06-21 17:47:12,222 [Rank 63]: time (ms) | forward-compute: 227.78 | backward-compute: 410.39 | backward-params-all-reduce: 226.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.99 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 1.11 | optimizer-clip-main-grad: 10.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.34 | optimizer: 42.50 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:47:21,436 [Rank 63]: iteration 1260/ 150000 | consumed samples: 80640 | elapsed time per iteration (ms): 921.8 | learning rate: 1.890E-04 | global batch size: 64 | lm loss: 2.398231E+00 | loss scale: 1.0 | grad norm: 1.184 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 102.66 | tokens-per-second-per-gpu: 8887.10 
| +[ip-26-0-155-69:7]:2023-06-21 17:47:21,436 [Rank 63]: time (ms) | forward-compute: 239.96 | backward-compute: 398.08 | backward-params-all-reduce: 238.05 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 238.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 10.31 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.33 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:47:30,378 [Rank 63]: iteration 1270/ 150000 | consumed samples: 81280 | elapsed time per iteration (ms): 894.2 | learning rate: 1.905E-04 | global batch size: 64 | lm loss: 2.373417E+00 | loss scale: 1.0 | grad norm: 1.207 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.82 | tokens-per-second-per-gpu: 9160.81 | +[ip-26-0-155-69:7]:2023-06-21 17:47:30,379 [Rank 63]: time (ms) | forward-compute: 223.89 | backward-compute: 398.48 | backward-params-all-reduce: 227.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.24 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 9.71 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 40.68 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:47:39,307 [Rank 63]: iteration 1280/ 150000 | consumed samples: 81920 | elapsed time per iteration (ms): 892.9 | learning rate: 1.920E-04 | global batch size: 64 | lm loss: 2.270634E+00 | loss scale: 1.0 | grad norm: 0.874 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.98 | tokens-per-second-per-gpu: 9174.81 | +[ip-26-0-155-69:7]:2023-06-21 17:47:39,308 [Rank 63]: time (ms) | forward-compute: 223.30 | backward-compute: 398.17 | backward-params-all-reduce: 226.13 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 10.29 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 41.26 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:47:48,229 [Rank 63]: iteration 1290/ 150000 | consumed samples: 82560 | elapsed time per iteration (ms): 892.2 | learning rate: 1.935E-04 | global batch size: 64 | lm loss: 2.339629E+00 | loss scale: 1.0 | grad norm: 1.124 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.06 | tokens-per-second-per-gpu: 9181.62 | +[ip-26-0-155-69:7]:2023-06-21 17:47:48,230 [Rank 63]: time (ms) | forward-compute: 224.19 | backward-compute: 398.22 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 8.52 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 39.49 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:47:57,137 [Rank 63]: iteration 1300/ 150000 | consumed samples: 83200 | elapsed time per iteration (ms): 890.7 | learning rate: 1.950E-04 | global batch size: 64 | lm loss: 2.293056E+00 | loss scale: 1.0 | grad norm: 1.426 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 
106.24 | tokens-per-second-per-gpu: 9196.99 | +[ip-26-0-155-69:7]:2023-06-21 17:47:57,137 [Rank 63]: time (ms) | forward-compute: 223.00 | backward-compute: 398.18 | backward-params-all-reduce: 226.00 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.10 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 8.50 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 39.44 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:48:06,068 [Rank 63]: iteration 1310/ 150000 | consumed samples: 83840 | elapsed time per iteration (ms): 893.1 | learning rate: 1.965E-04 | global batch size: 64 | lm loss: 2.309850E+00 | loss scale: 1.0 | grad norm: 0.914 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.45 | +[ip-26-0-155-69:7]:2023-06-21 17:48:06,068 [Rank 63]: time (ms) | forward-compute: 224.07 | backward-compute: 398.18 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 9.68 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 40.63 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:48:14,955 [Rank 63]: iteration 1320/ 150000 | consumed samples: 84480 | elapsed time per iteration (ms): 888.7 | learning rate: 1.980E-04 | global batch size: 64 | lm loss: 2.227564E+00 | loss scale: 1.0 | grad norm: 0.763 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.48 | tokens-per-second-per-gpu: 9218.03 | +[ip-26-0-155-69:7]:2023-06-21 17:48:14,955 [Rank 63]: time (ms) | forward-compute: 223.51 | backward-compute: 398.19 | backward-params-all-reduce: 225.81 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.92 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.12 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 37.12 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:48:23,868 [Rank 63]: iteration 1330/ 150000 | consumed samples: 85120 | elapsed time per iteration (ms): 891.3 | learning rate: 1.995E-04 | global batch size: 64 | lm loss: 2.216090E+00 | loss scale: 1.0 | grad norm: 0.923 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.16 | tokens-per-second-per-gpu: 9190.61 | +[ip-26-0-155-69:7]:2023-06-21 17:48:23,869 [Rank 63]: time (ms) | forward-compute: 224.40 | backward-compute: 398.15 | backward-params-all-reduce: 226.54 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.64 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 7.29 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 38.25 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:48:32,800 [Rank 63]: iteration 1340/ 150000 | consumed samples: 85760 | elapsed time per iteration (ms): 893.1 | learning rate: 2.010E-04 | global batch size: 64 | lm loss: 2.260810E+00 | loss scale: 1.0 | grad norm: 1.052 | number of skipped iterations: 
0 | number of nan iterations: 0 | TFLOPs: 105.95 | tokens-per-second-per-gpu: 9172.09 | +[ip-26-0-155-69:7]:2023-06-21 17:48:32,801 [Rank 63]: time (ms) | forward-compute: 224.46 | backward-compute: 398.23 | backward-params-all-reduce: 226.26 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.36 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.58 | optimizer-clip-main-grad: 8.73 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 39.76 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:48:41,863 [Rank 63]: iteration 1350/ 150000 | consumed samples: 86400 | elapsed time per iteration (ms): 906.3 | learning rate: 2.025E-04 | global batch size: 64 | lm loss: 2.207488E+00 | loss scale: 1.0 | grad norm: 0.992 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.41 | tokens-per-second-per-gpu: 9038.78 | +[ip-26-0-155-69:7]:2023-06-21 17:48:41,863 [Rank 63]: time (ms) | forward-compute: 232.47 | backward-compute: 398.53 | backward-params-all-reduce: 227.51 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.60 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 9.47 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.07 | optimizer-copy-main-to-model-params: 9.17 | optimizer: 41.33 | batch-generator: 3.51 +[ip-26-0-155-69:7]:2023-06-21 17:48:50,846 [Rank 63]: iteration 1360/ 150000 | consumed samples: 87040 | elapsed time per iteration (ms): 898.3 | learning rate: 2.040E-04 | global batch size: 64 | lm loss: 2.270252E+00 | loss scale: 1.0 | grad norm: 0.899 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.34 | tokens-per-second-per-gpu: 9119.72 | +[ip-26-0-155-69:7]:2023-06-21 17:48:50,846 [Rank 63]: time (ms) | forward-compute: 229.50 | backward-compute: 398.07 | backward-params-all-reduce: 225.71 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.80 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 9.72 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 40.68 | batch-generator: 2.08 +[ip-26-0-155-69:7]:2023-06-21 17:48:59,739 [Rank 63]: iteration 1370/ 150000 | consumed samples: 87680 | elapsed time per iteration (ms): 889.4 | learning rate: 2.055E-04 | global batch size: 64 | lm loss: 2.186746E+00 | loss scale: 1.0 | grad norm: 0.852 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.40 | tokens-per-second-per-gpu: 9211.04 | +[ip-26-0-155-69:7]:2023-06-21 17:48:59,740 [Rank 63]: time (ms) | forward-compute: 223.31 | backward-compute: 398.11 | backward-params-all-reduce: 225.63 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.73 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 7.31 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 38.26 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:49:08,608 [Rank 63]: iteration 1380/ 150000 | consumed samples: 88320 | elapsed time per iteration (ms): 886.9 | learning rate: 2.070E-04 | global batch size: 64 | lm loss: 2.170895E+00 | loss scale: 1.0 | grad 
norm: 0.820 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.76 | +[ip-26-0-155-69:7]:2023-06-21 17:49:08,609 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.13 | backward-params-all-reduce: 225.81 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.91 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:49:17,478 [Rank 63]: iteration 1390/ 150000 | consumed samples: 88960 | elapsed time per iteration (ms): 887.0 | learning rate: 2.085E-04 | global batch size: 64 | lm loss: 2.184438E+00 | loss scale: 1.0 | grad norm: 0.667 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.05 | +[ip-26-0-155-69:7]:2023-06-21 17:49:17,479 [Rank 63]: time (ms) | forward-compute: 223.03 | backward-compute: 398.14 | backward-params-all-reduce: 225.85 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.94 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.87 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:49:26,349 [Rank 63]: iteration 1400/ 150000 | consumed samples: 89600 | elapsed time per iteration (ms): 887.1 | learning rate: 2.100E-04 | global batch size: 64 | lm loss: 2.191436E+00 | loss scale: 1.0 | grad norm: 0.900 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.72 | +[ip-26-0-155-69:7]:2023-06-21 17:49:26,349 [Rank 63]: time (ms) | forward-compute: 223.15 | backward-compute: 398.13 | backward-params-all-reduce: 225.85 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.95 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.87 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:49:35,217 [Rank 63]: iteration 1410/ 150000 | consumed samples: 90240 | elapsed time per iteration (ms): 886.9 | learning rate: 2.115E-04 | global batch size: 64 | lm loss: 2.105275E+00 | loss scale: 1.0 | grad norm: 0.648 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.70 | tokens-per-second-per-gpu: 9236.90 | +[ip-26-0-155-69:7]:2023-06-21 17:49:35,218 [Rank 63]: time (ms) | forward-compute: 223.11 | backward-compute: 398.07 | backward-params-all-reduce: 225.83 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.92 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.83 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 17:49:44,300 [Rank 63]: iteration 1420/ 150000 | consumed samples: 90880 | elapsed time per iteration (ms): 908.2 | learning rate: 2.130E-04 | global batch size: 64 | lm 
loss: 2.148899E+00 | loss scale: 1.0 | grad norm: 0.771 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.19 | tokens-per-second-per-gpu: 9019.68 | +[ip-26-0-155-69:7]:2023-06-21 17:49:44,300 [Rank 63]: time (ms) | forward-compute: 244.20 | backward-compute: 398.14 | backward-params-all-reduce: 226.03 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.82 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 17:49:53,437 [Rank 63]: iteration 1430/ 150000 | consumed samples: 91520 | elapsed time per iteration (ms): 913.8 | learning rate: 2.145E-04 | global batch size: 64 | lm loss: 2.106895E+00 | loss scale: 1.0 | grad norm: 0.953 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 103.56 | tokens-per-second-per-gpu: 8965.18 | +[ip-26-0-155-69:7]:2023-06-21 17:49:53,438 [Rank 63]: time (ms) | forward-compute: 249.00 | backward-compute: 398.13 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.53 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.51 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:50:02,328 [Rank 63]: iteration 1440/ 150000 | consumed samples: 92160 | elapsed time per iteration (ms): 889.1 | learning rate: 2.160E-04 | global batch size: 64 | lm loss: 2.082574E+00 | loss scale: 1.0 | grad norm: 0.964 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.43 | tokens-per-second-per-gpu: 9214.22 | +[ip-26-0-155-69:7]:2023-06-21 17:50:02,329 [Rank 63]: time (ms) | forward-compute: 224.51 | backward-compute: 398.18 | backward-params-all-reduce: 225.82 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 5.51 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.49 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:50:11,226 [Rank 63]: iteration 1450/ 150000 | consumed samples: 92800 | elapsed time per iteration (ms): 889.8 | learning rate: 2.175E-04 | global batch size: 64 | lm loss: 2.120988E+00 | loss scale: 1.0 | grad norm: 1.176 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.35 | tokens-per-second-per-gpu: 9206.48 | +[ip-26-0-155-69:7]:2023-06-21 17:50:11,227 [Rank 63]: time (ms) | forward-compute: 222.94 | backward-compute: 398.08 | backward-params-all-reduce: 225.92 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.02 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 7.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 38.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:50:20,109 [Rank 63]: iteration 1460/ 150000 | consumed samples: 93440 | elapsed time per iteration (ms): 888.3 | learning 
rate: 2.190E-04 | global batch size: 64 | lm loss: 2.086980E+00 | loss scale: 1.0 | grad norm: 0.844 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.81 | +[ip-26-0-155-69:7]:2023-06-21 17:50:20,110 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.14 | backward-params-all-reduce: 225.81 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.92 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 6.09 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 37.06 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:50:28,993 [Rank 63]: iteration 1470/ 150000 | consumed samples: 94080 | elapsed time per iteration (ms): 888.4 | learning rate: 2.205E-04 | global batch size: 64 | lm loss: 2.114236E+00 | loss scale: 1.0 | grad norm: 0.840 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.27 | +[ip-26-0-155-69:7]:2023-06-21 17:50:28,994 [Rank 63]: time (ms) | forward-compute: 223.55 | backward-compute: 398.22 | backward-params-all-reduce: 225.90 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 5.57 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.05 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 36.59 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:50:37,865 [Rank 63]: iteration 1480/ 150000 | consumed samples: 94720 | elapsed time per iteration (ms): 887.2 | learning rate: 2.220E-04 | global batch size: 64 | lm loss: 2.105153E+00 | loss scale: 1.0 | grad norm: 0.724 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.89 | +[ip-26-0-155-69:7]:2023-06-21 17:50:37,866 [Rank 63]: time (ms) | forward-compute: 223.00 | backward-compute: 398.19 | backward-params-all-reduce: 226.00 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 226.13 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.87 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:50:46,748 [Rank 63]: iteration 1490/ 150000 | consumed samples: 95360 | elapsed time per iteration (ms): 888.3 | learning rate: 2.235E-04 | global batch size: 64 | lm loss: 2.085087E+00 | loss scale: 1.0 | grad norm: 0.795 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.53 | tokens-per-second-per-gpu: 9222.58 | +[ip-26-0-155-69:7]:2023-06-21 17:50:46,748 [Rank 63]: time (ms) | forward-compute: 222.63 | backward-compute: 398.13 | backward-params-all-reduce: 226.30 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.40 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.10 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.07 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:50:55,624 [Rank 63]: iteration 1500/ 150000 | consumed samples: 96000 | elapsed 
time per iteration (ms): 887.6 | learning rate: 2.250E-04 | global batch size: 64 | lm loss: 2.111531E+00 | loss scale: 1.0 | grad norm: 0.722 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.97 | +[ip-26-0-155-69:7]:2023-06-21 17:50:55,624 [Rank 63]: time (ms) | forward-compute: 222.95 | backward-compute: 398.16 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.06 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.52 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 36.50 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 17:51:04,505 [Rank 63]: iteration 1510/ 150000 | consumed samples: 96640 | elapsed time per iteration (ms): 888.1 | learning rate: 2.265E-04 | global batch size: 64 | lm loss: 2.091608E+00 | loss scale: 1.0 | grad norm: 0.586 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.55 | tokens-per-second-per-gpu: 9224.57 | +[ip-26-0-155-69:7]:2023-06-21 17:51:04,505 [Rank 63]: time (ms) | forward-compute: 222.86 | backward-compute: 398.17 | backward-params-all-reduce: 225.94 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.03 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.10 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 37.05 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:51:13,378 [Rank 63]: iteration 1520/ 150000 | consumed samples: 97280 | elapsed time per iteration (ms): 887.3 | learning rate: 2.280E-04 | global batch size: 64 | lm loss: 2.089523E+00 | loss scale: 1.0 | grad norm: 0.847 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.64 | tokens-per-second-per-gpu: 9232.23 | +[ip-26-0-155-69:7]:2023-06-21 17:51:13,378 [Rank 63]: time (ms) | forward-compute: 222.85 | backward-compute: 398.20 | backward-params-all-reduce: 226.44 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.53 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.84 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:51:22,247 [Rank 63]: iteration 1530/ 150000 | consumed samples: 97920 | elapsed time per iteration (ms): 886.9 | learning rate: 2.295E-04 | global batch size: 64 | lm loss: 2.017301E+00 | loss scale: 1.0 | grad norm: 0.643 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.22 | +[ip-26-0-155-69:7]:2023-06-21 17:51:22,248 [Rank 63]: time (ms) | forward-compute: 222.99 | backward-compute: 398.20 | backward-params-all-reduce: 225.83 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.94 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.88 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:51:31,117 [Rank 63]: iteration 1540/ 
150000 | consumed samples: 98560 | elapsed time per iteration (ms): 887.0 | learning rate: 2.310E-04 | global batch size: 64 | lm loss: 2.060225E+00 | loss scale: 1.0 | grad norm: 0.548 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.68 | tokens-per-second-per-gpu: 9235.67 | +[ip-26-0-155-69:7]:2023-06-21 17:51:31,118 [Rank 63]: time (ms) | forward-compute: 222.73 | backward-compute: 398.16 | backward-params-all-reduce: 226.12 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.23 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.91 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:51:39,995 [Rank 63]: iteration 1550/ 150000 | consumed samples: 99200 | elapsed time per iteration (ms): 887.8 | learning rate: 2.325E-04 | global batch size: 64 | lm loss: 2.057044E+00 | loss scale: 1.0 | grad norm: 0.851 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9227.20 | +[ip-26-0-155-69:7]:2023-06-21 17:51:39,996 [Rank 63]: time (ms) | forward-compute: 223.17 | backward-compute: 398.18 | backward-params-all-reduce: 225.97 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.07 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.45 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 17:51:48,889 [Rank 63]: iteration 1560/ 150000 | consumed samples: 99840 | elapsed time per iteration (ms): 889.4 | learning rate: 2.340E-04 | global batch size: 64 | lm loss: 2.083456E+00 | loss scale: 1.0 | grad norm: 1.189 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.40 | tokens-per-second-per-gpu: 9210.97 | +[ip-26-0-155-69:7]:2023-06-21 17:51:48,890 [Rank 63]: time (ms) | forward-compute: 223.48 | backward-compute: 398.14 | backward-params-all-reduce: 226.02 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.12 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.69 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.67 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:51:57,785 [Rank 63]: iteration 1570/ 150000 | consumed samples: 100480 | elapsed time per iteration (ms): 889.6 | learning rate: 2.355E-04 | global batch size: 64 | lm loss: 2.047602E+00 | loss scale: 1.0 | grad norm: 0.527 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.38 | tokens-per-second-per-gpu: 9209.10 | +[ip-26-0-155-69:7]:2023-06-21 17:51:57,785 [Rank 63]: time (ms) | forward-compute: 223.80 | backward-compute: 398.15 | backward-params-all-reduce: 227.62 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.73 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.90 | batch-generator: 1.72 
+[ip-26-0-155-69:7]:2023-06-21 17:52:06,667 [Rank 63]: iteration 1580/ 150000 | consumed samples: 101120 | elapsed time per iteration (ms): 888.2 | learning rate: 2.370E-04 | global batch size: 64 | lm loss: 2.038838E+00 | loss scale: 1.0 | grad norm: 0.620 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.53 | tokens-per-second-per-gpu: 9222.74 | +[ip-26-0-155-69:7]:2023-06-21 17:52:06,668 [Rank 63]: time (ms) | forward-compute: 223.71 | backward-compute: 398.12 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.05 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 5.48 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.44 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:52:15,538 [Rank 63]: iteration 1590/ 150000 | consumed samples: 101760 | elapsed time per iteration (ms): 887.1 | learning rate: 2.385E-04 | global batch size: 64 | lm loss: 2.056241E+00 | loss scale: 1.0 | grad norm: 0.598 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.62 | +[ip-26-0-155-69:7]:2023-06-21 17:52:15,539 [Rank 63]: time (ms) | forward-compute: 223.34 | backward-compute: 398.15 | backward-params-all-reduce: 225.69 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.79 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.84 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:52:24,416 [Rank 63]: iteration 1600/ 150000 | consumed samples: 102400 | elapsed time per iteration (ms): 887.8 | learning rate: 2.400E-04 | global batch size: 64 | lm loss: 2.030428E+00 | loss scale: 1.0 | grad norm: 0.765 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.59 | tokens-per-second-per-gpu: 9227.45 | +[ip-26-0-155-69:7]:2023-06-21 17:52:24,416 [Rank 63]: time (ms) | forward-compute: 223.26 | backward-compute: 398.18 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.50 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:52:33,292 [Rank 63]: iteration 1610/ 150000 | consumed samples: 103040 | elapsed time per iteration (ms): 887.6 | learning rate: 2.415E-04 | global batch size: 64 | lm loss: 2.053657E+00 | loss scale: 1.0 | grad norm: 0.599 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.61 | tokens-per-second-per-gpu: 9229.35 | +[ip-26-0-155-69:7]:2023-06-21 17:52:33,292 [Rank 63]: time (ms) | forward-compute: 223.23 | backward-compute: 398.13 | backward-params-all-reduce: 226.22 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 
8.34 | optimizer: 35.93 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:52:42,171 [Rank 63]: iteration 1620/ 150000 | consumed samples: 103680 | elapsed time per iteration (ms): 887.9 | learning rate: 2.430E-04 | global batch size: 64 | lm loss: 1.990064E+00 | loss scale: 1.0 | grad norm: 0.870 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.57 | tokens-per-second-per-gpu: 9225.78 | +[ip-26-0-155-69:7]:2023-06-21 17:52:42,172 [Rank 63]: time (ms) | forward-compute: 223.60 | backward-compute: 398.14 | backward-params-all-reduce: 226.30 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.41 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.84 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:52:51,054 [Rank 63]: iteration 1630/ 150000 | consumed samples: 104320 | elapsed time per iteration (ms): 888.3 | learning rate: 2.445E-04 | global batch size: 64 | lm loss: 2.021906E+00 | loss scale: 1.0 | grad norm: 1.073 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9222.04 | +[ip-26-0-155-69:7]:2023-06-21 17:52:51,055 [Rank 63]: time (ms) | forward-compute: 223.80 | backward-compute: 398.08 | backward-params-all-reduce: 225.86 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.97 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 5.51 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 36.47 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:52:59,938 [Rank 63]: iteration 1640/ 150000 | consumed samples: 104960 | elapsed time per iteration (ms): 888.4 | learning rate: 2.460E-04 | global batch size: 64 | lm loss: 2.026669E+00 | loss scale: 1.0 | grad norm: 0.577 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.25 | +[ip-26-0-155-69:7]:2023-06-21 17:52:59,939 [Rank 63]: time (ms) | forward-compute: 223.40 | backward-compute: 398.12 | backward-params-all-reduce: 225.74 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.85 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 6.08 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.08 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:53:08,815 [Rank 63]: iteration 1650/ 150000 | consumed samples: 105600 | elapsed time per iteration (ms): 887.7 | learning rate: 2.475E-04 | global batch size: 64 | lm loss: 2.012196E+00 | loss scale: 1.0 | grad norm: 0.685 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.24 | +[ip-26-0-155-69:7]:2023-06-21 17:53:08,816 [Rank 63]: time (ms) | forward-compute: 223.94 | backward-compute: 398.15 | backward-params-all-reduce: 225.63 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.74 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 
22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.90 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:53:17,694 [Rank 63]: iteration 1660/ 150000 | consumed samples: 106240 | elapsed time per iteration (ms): 887.8 | learning rate: 2.490E-04 | global batch size: 64 | lm loss: 1.991595E+00 | loss scale: 1.0 | grad norm: 0.674 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9227.17 | +[ip-26-0-155-69:7]:2023-06-21 17:53:17,694 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.17 | backward-params-all-reduce: 225.78 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.89 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 5.53 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.53 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:53:26,570 [Rank 63]: iteration 1670/ 150000 | consumed samples: 106880 | elapsed time per iteration (ms): 887.6 | learning rate: 2.505E-04 | global batch size: 64 | lm loss: 2.005218E+00 | loss scale: 1.0 | grad norm: 0.549 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.61 | tokens-per-second-per-gpu: 9229.19 | +[ip-26-0-155-69:7]:2023-06-21 17:53:26,570 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.21 | backward-params-all-reduce: 225.77 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.88 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 5.50 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.48 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:53:35,447 [Rank 63]: iteration 1680/ 150000 | consumed samples: 107520 | elapsed time per iteration (ms): 887.7 | learning rate: 2.520E-04 | global batch size: 64 | lm loss: 1.973621E+00 | loss scale: 1.0 | grad norm: 0.650 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.36 | +[ip-26-0-155-69:7]:2023-06-21 17:53:35,447 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.18 | backward-params-all-reduce: 225.75 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.85 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.95 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.92 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:53:44,329 [Rank 63]: iteration 1690/ 150000 | consumed samples: 108160 | elapsed time per iteration (ms): 888.2 | learning rate: 2.535E-04 | global batch size: 64 | lm loss: 2.005421E+00 | loss scale: 1.0 | grad norm: 0.619 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.54 | tokens-per-second-per-gpu: 9223.39 | +[ip-26-0-155-69:7]:2023-06-21 17:53:44,329 [Rank 63]: time (ms) | forward-compute: 223.25 | backward-compute: 398.22 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.27 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.51 | 
optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.47 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:53:53,210 [Rank 63]: iteration 1700/ 150000 | consumed samples: 108800 | elapsed time per iteration (ms): 888.1 | learning rate: 2.550E-04 | global batch size: 64 | lm loss: 2.014932E+00 | loss scale: 1.0 | grad norm: 1.424 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.55 | tokens-per-second-per-gpu: 9224.22 | +[ip-26-0-155-69:7]:2023-06-21 17:53:53,210 [Rank 63]: time (ms) | forward-compute: 223.47 | backward-compute: 398.15 | backward-params-all-reduce: 225.99 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.09 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 36.47 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:54:02,092 [Rank 63]: iteration 1710/ 150000 | consumed samples: 109440 | elapsed time per iteration (ms): 888.2 | learning rate: 2.565E-04 | global batch size: 64 | lm loss: 1.982130E+00 | loss scale: 1.0 | grad norm: 1.028 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.53 | tokens-per-second-per-gpu: 9222.91 | +[ip-26-0-155-69:7]:2023-06-21 17:54:02,092 [Rank 63]: time (ms) | forward-compute: 223.23 | backward-compute: 398.14 | backward-params-all-reduce: 225.77 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.87 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 6.08 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 37.01 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:54:10,981 [Rank 63]: iteration 1720/ 150000 | consumed samples: 110080 | elapsed time per iteration (ms): 888.9 | learning rate: 2.580E-04 | global batch size: 64 | lm loss: 1.984422E+00 | loss scale: 1.0 | grad norm: 1.076 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.45 | tokens-per-second-per-gpu: 9215.77 | +[ip-26-0-155-69:7]:2023-06-21 17:54:10,981 [Rank 63]: time (ms) | forward-compute: 223.52 | backward-compute: 398.12 | backward-params-all-reduce: 226.12 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.11 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.07 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 17:54:19,854 [Rank 63]: iteration 1730/ 150000 | consumed samples: 110720 | elapsed time per iteration (ms): 887.3 | learning rate: 2.595E-04 | global batch size: 64 | lm loss: 1.928419E+00 | loss scale: 1.0 | grad norm: 0.959 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9232.87 | +[ip-26-0-155-69:7]:2023-06-21 17:54:19,854 [Rank 63]: time (ms) | forward-compute: 223.60 | backward-compute: 398.14 | backward-params-all-reduce: 225.60 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.69 | backward-gather-model-params: 0.01 | 
optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 17:54:28,726 [Rank 63]: iteration 1740/ 150000 | consumed samples: 111360 | elapsed time per iteration (ms): 887.2 | learning rate: 2.610E-04 | global batch size: 64 | lm loss: 1.964355E+00 | loss scale: 1.0 | grad norm: 0.532 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9233.06 | +[ip-26-0-155-69:7]:2023-06-21 17:54:28,727 [Rank 63]: time (ms) | forward-compute: 223.54 | backward-compute: 398.09 | backward-params-all-reduce: 225.77 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:54:37,602 [Rank 63]: iteration 1750/ 150000 | consumed samples: 112000 | elapsed time per iteration (ms): 887.5 | learning rate: 2.625E-04 | global batch size: 64 | lm loss: 1.910443E+00 | loss scale: 1.0 | grad norm: 0.638 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9229.94 | +[ip-26-0-155-69:7]:2023-06-21 17:54:37,602 [Rank 63]: time (ms) | forward-compute: 223.35 | backward-compute: 398.10 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.90 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:54:46,474 [Rank 63]: iteration 1760/ 150000 | consumed samples: 112640 | elapsed time per iteration (ms): 887.2 | learning rate: 2.640E-04 | global batch size: 64 | lm loss: 1.922675E+00 | loss scale: 1.0 | grad norm: 0.677 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9233.07 | +[ip-26-0-155-69:7]:2023-06-21 17:54:46,475 [Rank 63]: time (ms) | forward-compute: 223.35 | backward-compute: 398.15 | backward-params-all-reduce: 225.83 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:54:55,357 [Rank 63]: iteration 1770/ 150000 | consumed samples: 113280 | elapsed time per iteration (ms): 888.3 | learning rate: 2.655E-04 | global batch size: 64 | lm loss: 1.933500E+00 | loss scale: 1.0 | grad norm: 0.982 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9222.02 | +[ip-26-0-155-69:7]:2023-06-21 17:54:55,358 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.07 | backward-params-all-reduce: 226.09 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 
226.18 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 6.09 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.03 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:55:04,236 [Rank 63]: iteration 1780/ 150000 | consumed samples: 113920 | elapsed time per iteration (ms): 887.9 | learning rate: 2.670E-04 | global batch size: 64 | lm loss: 1.984064E+00 | loss scale: 1.0 | grad norm: 0.498 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9226.77 | +[ip-26-0-155-69:7]:2023-06-21 17:55:04,236 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.08 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 5.47 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.42 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:55:13,132 [Rank 63]: iteration 1790/ 150000 | consumed samples: 114560 | elapsed time per iteration (ms): 889.6 | learning rate: 2.685E-04 | global batch size: 64 | lm loss: 1.964734E+00 | loss scale: 1.0 | grad norm: 1.575 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.36 | tokens-per-second-per-gpu: 9208.16 | +[ip-26-0-155-69:7]:2023-06-21 17:55:13,132 [Rank 63]: time (ms) | forward-compute: 222.83 | backward-compute: 398.18 | backward-params-all-reduce: 226.33 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.43 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 7.32 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 38.29 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:55:22,003 [Rank 63]: iteration 1800/ 150000 | consumed samples: 115200 | elapsed time per iteration (ms): 887.1 | learning rate: 2.700E-04 | global batch size: 64 | lm loss: 1.968569E+00 | loss scale: 1.0 | grad norm: 0.595 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.51 | +[ip-26-0-155-69:7]:2023-06-21 17:55:22,004 [Rank 63]: time (ms) | forward-compute: 222.95 | backward-compute: 398.03 | backward-params-all-reduce: 226.28 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.84 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:55:30,876 [Rank 63]: iteration 1810/ 150000 | consumed samples: 115840 | elapsed time per iteration (ms): 887.2 | learning rate: 2.715E-04 | global batch size: 64 | lm loss: 1.939122E+00 | loss scale: 1.0 | grad norm: 0.444 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9233.04 | +[ip-26-0-155-69:7]:2023-06-21 17:55:30,876 [Rank 63]: time (ms) | forward-compute: 222.91 | backward-compute: 398.21 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | 
backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.26 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.91 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:55:39,747 [Rank 63]: iteration 1820/ 150000 | consumed samples: 116480 | elapsed time per iteration (ms): 887.2 | learning rate: 2.730E-04 | global batch size: 64 | lm loss: 1.910997E+00 | loss scale: 1.0 | grad norm: 0.426 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.77 | +[ip-26-0-155-69:7]:2023-06-21 17:55:39,748 [Rank 63]: time (ms) | forward-compute: 223.12 | backward-compute: 398.14 | backward-params-all-reduce: 226.03 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.14 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:55:48,618 [Rank 63]: iteration 1830/ 150000 | consumed samples: 117120 | elapsed time per iteration (ms): 887.0 | learning rate: 2.745E-04 | global batch size: 64 | lm loss: 1.864534E+00 | loss scale: 1.0 | grad norm: 0.498 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.68 | tokens-per-second-per-gpu: 9235.12 | +[ip-26-0-155-69:7]:2023-06-21 17:55:48,618 [Rank 63]: time (ms) | forward-compute: 222.90 | backward-compute: 398.03 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.28 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:55:57,504 [Rank 63]: iteration 1840/ 150000 | consumed samples: 117760 | elapsed time per iteration (ms): 888.6 | learning rate: 2.760E-04 | global batch size: 64 | lm loss: 1.888983E+00 | loss scale: 1.0 | grad norm: 0.654 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.48 | tokens-per-second-per-gpu: 9218.55 | +[ip-26-0-155-69:7]:2023-06-21 17:55:57,505 [Rank 63]: time (ms) | forward-compute: 223.24 | backward-compute: 398.08 | backward-params-all-reduce: 226.20 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.29 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 6.09 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 37.07 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 17:56:06,390 [Rank 63]: iteration 1850/ 150000 | consumed samples: 118400 | elapsed time per iteration (ms): 888.6 | learning rate: 2.775E-04 | global batch size: 64 | lm loss: 1.936798E+00 | loss scale: 1.0 | grad norm: 0.526 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.49 | tokens-per-second-per-gpu: 9219.29 | +[ip-26-0-155-69:7]:2023-06-21 17:56:06,391 [Rank 63]: time (ms) | forward-compute: 223.70 | backward-compute: 398.08 | backward-params-all-reduce: 
226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.51 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 36.48 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:56:15,291 [Rank 63]: iteration 1860/ 150000 | consumed samples: 119040 | elapsed time per iteration (ms): 890.1 | learning rate: 2.790E-04 | global batch size: 64 | lm loss: 1.896784E+00 | loss scale: 1.0 | grad norm: 0.453 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.31 | tokens-per-second-per-gpu: 9203.43 | +[ip-26-0-155-69:7]:2023-06-21 17:56:15,292 [Rank 63]: time (ms) | forward-compute: 223.72 | backward-compute: 398.05 | backward-params-all-reduce: 228.31 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 228.42 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.92 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:56:24,175 [Rank 63]: iteration 1870/ 150000 | consumed samples: 119680 | elapsed time per iteration (ms): 888.4 | learning rate: 2.805E-04 | global batch size: 64 | lm loss: 1.881472E+00 | loss scale: 1.0 | grad norm: 0.505 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.52 | +[ip-26-0-155-69:7]:2023-06-21 17:56:24,175 [Rank 63]: time (ms) | forward-compute: 223.62 | backward-compute: 398.02 | backward-params-all-reduce: 226.21 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.31 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 36.46 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:56:33,048 [Rank 63]: iteration 1880/ 150000 | consumed samples: 120320 | elapsed time per iteration (ms): 887.3 | learning rate: 2.820E-04 | global batch size: 64 | lm loss: 1.873608E+00 | loss scale: 1.0 | grad norm: 0.489 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.64 | tokens-per-second-per-gpu: 9232.14 | +[ip-26-0-155-69:7]:2023-06-21 17:56:33,049 [Rank 63]: time (ms) | forward-compute: 223.33 | backward-compute: 398.05 | backward-params-all-reduce: 226.10 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.19 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:56:41,925 [Rank 63]: iteration 1890/ 150000 | consumed samples: 120960 | elapsed time per iteration (ms): 887.7 | learning rate: 2.835E-04 | global batch size: 64 | lm loss: 1.878910E+00 | loss scale: 1.0 | grad norm: 0.452 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.79 | +[ip-26-0-155-69:7]:2023-06-21 17:56:41,925 [Rank 63]: time (ms) | forward-compute: 223.66 | 
backward-compute: 398.04 | backward-params-all-reduce: 226.13 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.81 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:56:50,806 [Rank 63]: iteration 1900/ 150000 | consumed samples: 121600 | elapsed time per iteration (ms): 888.1 | learning rate: 2.850E-04 | global batch size: 64 | lm loss: 1.867750E+00 | loss scale: 1.0 | grad norm: 0.446 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.55 | tokens-per-second-per-gpu: 9224.04 | +[ip-26-0-155-69:7]:2023-06-21 17:56:50,806 [Rank 63]: time (ms) | forward-compute: 223.87 | backward-compute: 398.06 | backward-params-all-reduce: 226.18 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.28 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.94 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:56:59,708 [Rank 63]: iteration 1910/ 150000 | consumed samples: 122240 | elapsed time per iteration (ms): 890.3 | learning rate: 2.865E-04 | global batch size: 64 | lm loss: 1.903045E+00 | loss scale: 1.0 | grad norm: 0.461 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.29 | tokens-per-second-per-gpu: 9201.73 | +[ip-26-0-155-69:7]:2023-06-21 17:56:59,709 [Rank 63]: time (ms) | forward-compute: 223.93 | backward-compute: 398.16 | backward-params-all-reduce: 227.09 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.18 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.57 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.90 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 17:57:08,726 [Rank 63]: iteration 1920/ 150000 | consumed samples: 122880 | elapsed time per iteration (ms): 901.7 | learning rate: 2.880E-04 | global batch size: 64 | lm loss: 1.804187E+00 | loss scale: 1.0 | grad norm: 0.463 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.94 | tokens-per-second-per-gpu: 9084.71 | +[ip-26-0-155-69:7]:2023-06-21 17:57:08,726 [Rank 63]: time (ms) | forward-compute: 237.47 | backward-compute: 398.10 | backward-params-all-reduce: 226.30 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.39 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:57:18,334 [Rank 63]: iteration 1930/ 150000 | consumed samples: 123520 | elapsed time per iteration (ms): 960.8 | learning rate: 2.895E-04 | global batch size: 64 | lm loss: 1.886427E+00 | loss scale: 1.0 | grad norm: 1.246 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 98.49 | tokens-per-second-per-gpu: 8526.48 | +[ip-26-0-155-69:7]:2023-06-21 17:57:18,334 
[Rank 63]: time (ms) | forward-compute: 257.77 | backward-compute: 398.14 | backward-params-all-reduce: 262.94 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 263.04 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 6.81 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 37.78 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:57:27,268 [Rank 63]: iteration 1940/ 150000 | consumed samples: 124160 | elapsed time per iteration (ms): 893.4 | learning rate: 2.910E-04 | global batch size: 64 | lm loss: 1.821943E+00 | loss scale: 1.0 | grad norm: 0.496 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.92 | tokens-per-second-per-gpu: 9169.53 | +[ip-26-0-155-69:7]:2023-06-21 17:57:27,268 [Rank 63]: time (ms) | forward-compute: 226.21 | backward-compute: 397.94 | backward-params-all-reduce: 226.73 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.83 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.80 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:57:36,161 [Rank 63]: iteration 1950/ 150000 | consumed samples: 124800 | elapsed time per iteration (ms): 889.4 | learning rate: 2.925E-04 | global batch size: 64 | lm loss: 1.846065E+00 | loss scale: 1.0 | grad norm: 0.370 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.40 | tokens-per-second-per-gpu: 9211.12 | +[ip-26-0-155-69:7]:2023-06-21 17:57:36,162 [Rank 63]: time (ms) | forward-compute: 224.99 | backward-compute: 397.97 | backward-params-all-reduce: 226.49 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.59 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:57:45,045 [Rank 63]: iteration 1960/ 150000 | consumed samples: 125440 | elapsed time per iteration (ms): 888.4 | learning rate: 2.940E-04 | global batch size: 64 | lm loss: 1.848107E+00 | loss scale: 1.0 | grad norm: 0.439 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.52 | tokens-per-second-per-gpu: 9221.40 | +[ip-26-0-155-69:7]:2023-06-21 17:57:45,045 [Rank 63]: time (ms) | forward-compute: 224.49 | backward-compute: 398.01 | backward-params-all-reduce: 225.82 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.93 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.89 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:57:53,934 [Rank 63]: iteration 1970/ 150000 | consumed samples: 126080 | elapsed time per iteration (ms): 888.9 | learning rate: 2.955E-04 | global batch size: 64 | lm loss: 1.835699E+00 | loss scale: 1.0 | grad norm: 0.533 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.46 | tokens-per-second-per-gpu: 9216.04 
| +[ip-26-0-155-69:7]:2023-06-21 17:57:53,934 [Rank 63]: time (ms) | forward-compute: 223.71 | backward-compute: 398.03 | backward-params-all-reduce: 225.90 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 6.13 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 37.13 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:58:02,814 [Rank 63]: iteration 1980/ 150000 | consumed samples: 126720 | elapsed time per iteration (ms): 888.0 | learning rate: 2.970E-04 | global batch size: 64 | lm loss: 1.814756E+00 | loss scale: 1.0 | grad norm: 0.656 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.56 | tokens-per-second-per-gpu: 9224.94 | +[ip-26-0-155-69:7]:2023-06-21 17:58:02,814 [Rank 63]: time (ms) | forward-compute: 223.60 | backward-compute: 398.00 | backward-params-all-reduce: 226.54 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.65 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.84 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:58:11,721 [Rank 63]: iteration 1990/ 150000 | consumed samples: 127360 | elapsed time per iteration (ms): 890.7 | learning rate: 2.985E-04 | global batch size: 64 | lm loss: 1.804731E+00 | loss scale: 1.0 | grad norm: 0.541 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.24 | tokens-per-second-per-gpu: 9197.44 | +[ip-26-0-155-69:7]:2023-06-21 17:58:11,721 [Rank 63]: time (ms) | forward-compute: 226.70 | backward-compute: 398.09 | backward-params-all-reduce: 225.99 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.09 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:58:20,597 [Rank 63]: iteration 2000/ 150000 | consumed samples: 128000 | elapsed time per iteration (ms): 887.6 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.843141E+00 | loss scale: 1.0 | grad norm: 0.463 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.61 | tokens-per-second-per-gpu: 9229.20 | +[ip-26-0-155-69:7]:2023-06-21 17:58:20,597 [Rank 63]: time (ms) | forward-compute: 223.66 | backward-compute: 398.16 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.05 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:58:29,475 [Rank 63]: iteration 2010/ 150000 | consumed samples: 128640 | elapsed time per iteration (ms): 887.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.846218E+00 | loss scale: 1.0 | grad norm: 0.441 | number of skipped iterations: 0 | number of nan iterations: 0 | 
TFLOPs: 106.58 | tokens-per-second-per-gpu: 9226.83 | +[ip-26-0-155-69:7]:2023-06-21 17:58:29,476 [Rank 63]: time (ms) | forward-compute: 223.69 | backward-compute: 398.27 | backward-params-all-reduce: 225.86 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.96 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.56 | optimizer-clip-main-grad: 4.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.93 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:58:38,353 [Rank 63]: iteration 2020/ 150000 | consumed samples: 129280 | elapsed time per iteration (ms): 887.7 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.828299E+00 | loss scale: 1.0 | grad norm: 0.364 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.60 | tokens-per-second-per-gpu: 9228.28 | +[ip-26-0-155-69:7]:2023-06-21 17:58:38,353 [Rank 63]: time (ms) | forward-compute: 223.21 | backward-compute: 398.27 | backward-params-all-reduce: 225.76 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.83 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 17:58:47,381 [Rank 63]: iteration 2030/ 150000 | consumed samples: 129920 | elapsed time per iteration (ms): 902.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.837031E+00 | loss scale: 1.0 | grad norm: 0.462 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 104.81 | tokens-per-second-per-gpu: 9073.96 | +[ip-26-0-155-69:7]:2023-06-21 17:58:47,381 [Rank 63]: time (ms) | forward-compute: 238.88 | backward-compute: 398.15 | backward-params-all-reduce: 225.84 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.95 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:58:56,356 [Rank 63]: iteration 2040/ 150000 | consumed samples: 130560 | elapsed time per iteration (ms): 897.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.806408E+00 | loss scale: 1.0 | grad norm: 0.349 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 105.43 | tokens-per-second-per-gpu: 9127.12 | +[ip-26-0-155-69:7]:2023-06-21 17:58:56,356 [Rank 63]: time (ms) | forward-compute: 224.36 | backward-compute: 398.22 | backward-params-all-reduce: 234.67 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 234.77 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.95 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.93 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:59:05,227 [Rank 63]: iteration 2050/ 150000 | consumed samples: 131200 | elapsed time per iteration (ms): 887.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.813356E+00 | loss scale: 1.0 | grad norm: 0.400 | number of skipped 
iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.61 | +[ip-26-0-155-69:7]:2023-06-21 17:59:05,228 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.25 | backward-params-all-reduce: 225.91 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.01 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.81 +[ip-26-0-155-69:7]:2023-06-21 17:59:14,101 [Rank 63]: iteration 2060/ 150000 | consumed samples: 131840 | elapsed time per iteration (ms): 887.4 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.818771E+00 | loss scale: 1.0 | grad norm: 0.470 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.63 | tokens-per-second-per-gpu: 9231.31 | +[ip-26-0-155-69:7]:2023-06-21 17:59:14,102 [Rank 63]: time (ms) | forward-compute: 223.20 | backward-compute: 398.28 | backward-params-all-reduce: 225.99 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.08 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 17:59:22,974 [Rank 63]: iteration 2070/ 150000 | consumed samples: 132480 | elapsed time per iteration (ms): 887.3 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.796845E+00 | loss scale: 1.0 | grad norm: 0.401 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.64 | tokens-per-second-per-gpu: 9232.27 | +[ip-26-0-155-69:7]:2023-06-21 17:59:22,975 [Rank 63]: time (ms) | forward-compute: 222.97 | backward-compute: 398.28 | backward-params-all-reduce: 226.12 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.22 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 17:59:31,879 [Rank 63]: iteration 2080/ 150000 | consumed samples: 133120 | elapsed time per iteration (ms): 890.4 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.824200E+00 | loss scale: 1.0 | grad norm: 0.370 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.27 | tokens-per-second-per-gpu: 9200.08 | +[ip-26-0-155-69:7]:2023-06-21 17:59:31,879 [Rank 63]: time (ms) | forward-compute: 225.75 | backward-compute: 398.24 | backward-params-all-reduce: 226.47 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.56 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.89 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 17:59:40,757 [Rank 63]: iteration 2090/ 150000 | consumed samples: 133760 | elapsed time per iteration (ms): 887.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.760048E+00 | loss 
scale: 1.0 | grad norm: 0.412 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9226.97 | +[ip-26-0-155-69:7]:2023-06-21 17:59:40,757 [Rank 63]: time (ms) | forward-compute: 222.84 | backward-compute: 398.21 | backward-params-all-reduce: 226.93 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 227.02 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.84 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 17:59:49,631 [Rank 63]: iteration 2100/ 150000 | consumed samples: 134400 | elapsed time per iteration (ms): 887.4 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.805639E+00 | loss scale: 1.0 | grad norm: 0.564 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.63 | tokens-per-second-per-gpu: 9231.04 | +[ip-26-0-155-69:7]:2023-06-21 17:59:49,632 [Rank 63]: time (ms) | forward-compute: 222.98 | backward-compute: 398.20 | backward-params-all-reduce: 226.35 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.45 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.89 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 17:59:58,520 [Rank 63]: iteration 2110/ 150000 | consumed samples: 135040 | elapsed time per iteration (ms): 888.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.775501E+00 | loss scale: 1.0 | grad norm: 0.398 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.46 | tokens-per-second-per-gpu: 9216.42 | +[ip-26-0-155-69:7]:2023-06-21 17:59:58,520 [Rank 63]: time (ms) | forward-compute: 224.48 | backward-compute: 398.15 | backward-params-all-reduce: 226.28 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.38 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.88 | batch-generator: 1.72 +[ip-26-0-155-69:7]:2023-06-21 18:00:07,399 [Rank 63]: iteration 2120/ 150000 | consumed samples: 135680 | elapsed time per iteration (ms): 887.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.761086E+00 | loss scale: 1.0 | grad norm: 0.475 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.58 | tokens-per-second-per-gpu: 9226.59 | +[ip-26-0-155-69:7]:2023-06-21 18:00:07,399 [Rank 63]: time (ms) | forward-compute: 223.72 | backward-compute: 398.08 | backward-params-all-reduce: 226.17 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.26 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.91 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 18:00:16,270 [Rank 63]: iteration 2130/ 150000 | consumed samples: 136320 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global 
batch size: 64 | lm loss: 1.825589E+00 | loss scale: 1.0 | grad norm: 0.381 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.87 | +[ip-26-0-155-69:7]:2023-06-21 18:00:16,271 [Rank 63]: time (ms) | forward-compute: 222.78 | backward-compute: 398.11 | backward-params-all-reduce: 226.35 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.44 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.89 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 18:00:25,141 [Rank 63]: iteration 2140/ 150000 | consumed samples: 136960 | elapsed time per iteration (ms): 887.0 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.760596E+00 | loss scale: 1.0 | grad norm: 0.346 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.68 | tokens-per-second-per-gpu: 9235.20 | +[ip-26-0-155-69:7]:2023-06-21 18:00:25,141 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.11 | backward-params-all-reduce: 226.01 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.11 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.86 | batch-generator: 1.73 +[ip-26-0-155-69:7]:2023-06-21 18:00:34,022 [Rank 63]: iteration 2150/ 150000 | consumed samples: 137600 | elapsed time per iteration (ms): 888.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.764927E+00 | loss scale: 1.0 | grad norm: 0.420 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.55 | tokens-per-second-per-gpu: 9224.08 | +[ip-26-0-155-69:7]:2023-06-21 18:00:34,023 [Rank 63]: time (ms) | forward-compute: 223.06 | backward-compute: 398.09 | backward-params-all-reduce: 226.92 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.04 | backward-reduce-model-grads: 227.03 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.94 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.92 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 18:00:42,895 [Rank 63]: iteration 2160/ 150000 | consumed samples: 138240 | elapsed time per iteration (ms): 887.3 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.762487E+00 | loss scale: 1.0 | grad norm: 0.949 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.65 | tokens-per-second-per-gpu: 9232.88 | +[ip-26-0-155-69:7]:2023-06-21 18:00:42,895 [Rank 63]: time (ms) | forward-compute: 222.95 | backward-compute: 398.11 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 18:00:51,766 [Rank 63]: iteration 2170/ 150000 | consumed samples: 138880 | elapsed time per iteration 
(ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.754759E+00 | loss scale: 1.0 | grad norm: 0.527 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9234.05 | +[ip-26-0-155-69:7]:2023-06-21 18:00:51,766 [Rank 63]: time (ms) | forward-compute: 223.00 | backward-compute: 398.11 | backward-params-all-reduce: 226.17 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.26 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.85 | batch-generator: 1.74 +[ip-26-0-155-69:7]:2023-06-21 18:01:00,638 [Rank 63]: iteration 2180/ 150000 | consumed samples: 139520 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.740713E+00 | loss scale: 1.0 | grad norm: 0.444 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.40 | +[ip-26-0-155-69:7]:2023-06-21 18:01:00,639 [Rank 63]: time (ms) | forward-compute: 223.01 | backward-compute: 398.08 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.34 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.84 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 18:01:09,539 [Rank 63]: iteration 2190/ 150000 | consumed samples: 140160 | elapsed time per iteration (ms): 890.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.739953E+00 | loss scale: 1.0 | grad norm: 0.428 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.32 | tokens-per-second-per-gpu: 9203.97 | +[ip-26-0-155-69:7]:2023-06-21 18:01:09,539 [Rank 63]: time (ms) | forward-compute: 225.38 | backward-compute: 398.10 | backward-params-all-reduce: 226.63 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.73 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.86 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 18:01:18,414 [Rank 63]: iteration 2200/ 150000 | consumed samples: 140800 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.741342E+00 | loss scale: 1.0 | grad norm: 0.442 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9230.52 | +[ip-26-0-155-69:7]:2023-06-21 18:01:18,414 [Rank 63]: time (ms) | forward-compute: 222.81 | backward-compute: 398.06 | backward-params-all-reduce: 226.72 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.82 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.85 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 18:01:27,317 [Rank 63]: iteration 2210/ 150000 | consumed 
samples: 141440 | elapsed time per iteration (ms): 890.3 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.756096E+00 | loss scale: 1.0 | grad norm: 0.364 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.28 | tokens-per-second-per-gpu: 9201.21 | +[ip-26-0-155-69:7]:2023-06-21 18:01:27,317 [Rank 63]: time (ms) | forward-compute: 225.62 | backward-compute: 398.12 | backward-params-all-reduce: 226.64 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.75 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 18:01:36,189 [Rank 63]: iteration 2220/ 150000 | consumed samples: 142080 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.738577E+00 | loss scale: 1.0 | grad norm: 0.474 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.73 | +[ip-26-0-155-69:7]:2023-06-21 18:01:36,189 [Rank 63]: time (ms) | forward-compute: 222.81 | backward-compute: 398.07 | backward-params-all-reduce: 226.32 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.43 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.89 | batch-generator: 1.77 +[ip-26-0-155-69:7]:2023-06-21 18:01:45,059 [Rank 63]: iteration 2230/ 150000 | consumed samples: 142720 | elapsed time per iteration (ms): 887.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.702038E+00 | loss scale: 1.0 | grad norm: 0.623 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.98 | +[ip-26-0-155-69:7]:2023-06-21 18:01:45,060 [Rank 63]: time (ms) | forward-compute: 222.88 | backward-compute: 398.13 | backward-params-all-reduce: 226.15 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.25 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 18:01:53,930 [Rank 63]: iteration 2240/ 150000 | consumed samples: 143360 | elapsed time per iteration (ms): 887.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.779816E+00 | loss scale: 1.0 | grad norm: 0.470 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.95 | +[ip-26-0-155-69:7]:2023-06-21 18:01:53,930 [Rank 63]: time (ms) | forward-compute: 223.21 | backward-compute: 398.17 | backward-params-all-reduce: 225.80 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.90 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 21.99 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.81 | batch-generator: 1.78 +[ip-26-0-155-69:7]:2023-06-21 18:02:02,802 
[Rank 63]: iteration 2250/ 150000 | consumed samples: 144000 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.703117E+00 | loss scale: 1.0 | grad norm: 0.395 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.46 | +[ip-26-0-155-69:7]:2023-06-21 18:02:02,803 [Rank 63]: time (ms) | forward-compute: 223.12 | backward-compute: 398.17 | backward-params-all-reduce: 226.02 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.12 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.82 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 18:02:11,677 [Rank 63]: iteration 2260/ 150000 | consumed samples: 144640 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.744578E+00 | loss scale: 1.0 | grad norm: 0.372 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9230.24 | +[ip-26-0-155-69:7]:2023-06-21 18:02:11,678 [Rank 63]: time (ms) | forward-compute: 222.97 | backward-compute: 398.17 | backward-params-all-reduce: 226.33 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.44 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.90 | batch-generator: 1.83 +[ip-26-0-155-69:7]:2023-06-21 18:02:20,552 [Rank 63]: iteration 2270/ 150000 | consumed samples: 145280 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.698719E+00 | loss scale: 1.0 | grad norm: 1.019 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.63 | tokens-per-second-per-gpu: 9230.86 | +[ip-26-0-155-69:7]:2023-06-21 18:02:20,552 [Rank 63]: time (ms) | forward-compute: 222.83 | backward-compute: 398.14 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.05 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 5.49 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 36.44 | batch-generator: 1.75 +[ip-26-0-155-69:7]:2023-06-21 18:02:29,424 [Rank 63]: iteration 2280/ 150000 | consumed samples: 145920 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.711292E+00 | loss scale: 1.0 | grad norm: 0.430 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.44 | +[ip-26-0-155-69:7]:2023-06-21 18:02:29,424 [Rank 63]: time (ms) | forward-compute: 223.25 | backward-compute: 398.11 | backward-params-all-reduce: 225.98 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.08 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.81 | batch-generator: 
1.77 +[ip-26-0-155-69:7]:2023-06-21 18:02:38,293 [Rank 63]: iteration 2290/ 150000 | consumed samples: 146560 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.690299E+00 | loss scale: 1.0 | grad norm: 0.336 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.16 | +[ip-26-0-155-69:7]:2023-06-21 18:02:38,294 [Rank 63]: time (ms) | forward-compute: 222.88 | backward-compute: 398.13 | backward-params-all-reduce: 226.03 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.13 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.87 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.84 | batch-generator: 1.80 +[ip-26-0-155-69:7]:2023-06-21 18:02:47,168 [Rank 63]: iteration 2300/ 150000 | consumed samples: 147200 | elapsed time per iteration (ms): 887.4 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.739259E+00 | loss scale: 1.0 | grad norm: 0.317 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.63 | tokens-per-second-per-gpu: 9231.03 | +[ip-26-0-155-69:7]:2023-06-21 18:02:47,168 [Rank 63]: time (ms) | forward-compute: 223.02 | backward-compute: 398.20 | backward-params-all-reduce: 226.27 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.37 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.88 | batch-generator: 1.79 +[ip-26-0-155-69:7]:2023-06-21 18:02:56,039 [Rank 63]: iteration 2310/ 150000 | consumed samples: 147840 | elapsed time per iteration (ms): 887.1 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.689472E+00 | loss scale: 1.0 | grad norm: 0.328 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.67 | tokens-per-second-per-gpu: 9234.68 | +[ip-26-0-155-69:7]:2023-06-21 18:02:56,039 [Rank 63]: time (ms) | forward-compute: 222.87 | backward-compute: 398.11 | backward-params-all-reduce: 226.25 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.35 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.83 | batch-generator: 1.76 +[ip-26-0-155-69:7]:2023-06-21 18:03:04,914 [Rank 63]: iteration 2320/ 150000 | consumed samples: 148480 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.747082E+00 | loss scale: 1.0 | grad norm: 0.409 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9229.93 | +[ip-26-0-155-69:7]:2023-06-21 18:03:04,915 [Rank 63]: time (ms) | forward-compute: 222.94 | backward-compute: 398.12 | backward-params-all-reduce: 226.05 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 5.47 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | 
optimizer-copy-main-to-model-params: 8.31 | optimizer: 36.42 | batch-generator: 1.74
+[ip-26-0-155-69:7]:2023-06-21 18:03:13,788 [Rank 63]: iteration 2330/ 150000 | consumed samples: 149120 | elapsed time per iteration (ms): 887.3 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.746517E+00 | loss scale: 1.0 | grad norm: 0.334 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.64 | tokens-per-second-per-gpu: 9232.08 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:13,788 [Rank 63]: time (ms) | forward-compute: 222.63 | backward-compute: 398.17 | backward-params-all-reduce: 226.47 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.58 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.97 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.33 | optimizer: 35.98 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:03:22,657 [Rank 63]: iteration 2340/ 150000 | consumed samples: 149760 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.695744E+00 | loss scale: 1.0 | grad norm: 0.464 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.53 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:22,658 [Rank 63]: time (ms) | forward-compute: 222.78 | backward-compute: 398.11 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.03 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.87 | batch-generator: 1.82
+[ip-26-0-155-69:7]:2023-06-21 18:03:31,525 [Rank 63]: iteration 2350/ 150000 | consumed samples: 150400 | elapsed time per iteration (ms): 886.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.696679E+00 | loss scale: 1.0 | grad norm: 0.349 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.70 | tokens-per-second-per-gpu: 9237.27 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:31,526 [Rank 63]: time (ms) | forward-compute: 222.73 | backward-compute: 398.13 | backward-params-all-reduce: 226.04 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.90 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.83 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:03:40,393 [Rank 63]: iteration 2360/ 150000 | consumed samples: 151040 | elapsed time per iteration (ms): 886.8 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.703598E+00 | loss scale: 1.0 | grad norm: 0.336 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.71 | tokens-per-second-per-gpu: 9237.73 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:40,394 [Rank 63]: time (ms) | forward-compute: 222.71 | backward-compute: 398.16 | backward-params-all-reduce: 226.05 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.15 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.83 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:03:49,262 [Rank 63]: iteration 2370/ 150000 | consumed samples: 151680 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.699918E+00 | loss scale: 1.0 | grad norm: 0.313 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.70 | tokens-per-second-per-gpu: 9236.87 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:49,262 [Rank 63]: time (ms) | forward-compute: 223.09 | backward-compute: 398.17 | backward-params-all-reduce: 225.66 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.75 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.04 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.89 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:03:58,134 [Rank 63]: iteration 2380/ 150000 | consumed samples: 152320 | elapsed time per iteration (ms): 887.2 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.646003E+00 | loss scale: 1.0 | grad norm: 0.406 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.66 | tokens-per-second-per-gpu: 9233.53 |
+[ip-26-0-155-69:7]:2023-06-21 18:03:58,135 [Rank 63]: time (ms) | forward-compute: 223.34 | backward-compute: 398.13 | backward-params-all-reduce: 225.88 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.97 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.81 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:04:07,003 [Rank 63]: iteration 2390/ 150000 | consumed samples: 152960 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.681774E+00 | loss scale: 1.0 | grad norm: 0.443 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.35 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:07,004 [Rank 63]: time (ms) | forward-compute: 222.92 | backward-compute: 398.16 | backward-params-all-reduce: 225.95 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.05 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.55 | optimizer-clip-main-grad: 4.89 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.86 | batch-generator: 1.79
+[ip-26-0-155-69:7]:2023-06-21 18:04:15,879 [Rank 63]: iteration 2400/ 150000 | consumed samples: 153600 | elapsed time per iteration (ms): 887.6 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.655367E+00 | loss scale: 1.0 | grad norm: 0.279 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.61 | tokens-per-second-per-gpu: 9229.47 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:15,880 [Rank 63]: time (ms) | forward-compute: 223.47 | backward-compute: 398.12 | backward-params-all-reduce: 226.07 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.16 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.53 | optimizer-clip-main-grad: 4.92 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.00 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.87 | batch-generator: 1.78
+[ip-26-0-155-69:7]:2023-06-21 18:04:24,754 [Rank 63]: iteration 2410/ 150000 | consumed samples: 154240 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.728302E+00 | loss scale: 1.0 | grad norm: 0.383 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9230.69 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:24,755 [Rank 63]: time (ms) | forward-compute: 223.28 | backward-compute: 398.13 | backward-params-all-reduce: 226.23 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.33 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.51 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.01 | optimizer-copy-main-to-model-params: 8.32 | optimizer: 35.82 | batch-generator: 1.77
+[ip-26-0-155-69:7]:2023-06-21 18:04:33,624 [Rank 63]: iteration 2420/ 150000 | consumed samples: 154880 | elapsed time per iteration (ms): 886.9 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.646320E+00 | loss scale: 1.0 | grad norm: 0.326 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.69 | tokens-per-second-per-gpu: 9236.17 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:33,624 [Rank 63]: time (ms) | forward-compute: 223.16 | backward-compute: 398.14 | backward-params-all-reduce: 225.76 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 225.86 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.54 | optimizer-clip-main-grad: 4.93 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 21.99 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.86 | batch-generator: 1.74
+[ip-26-0-155-69:7]:2023-06-21 18:04:42,499 [Rank 63]: iteration 2430/ 150000 | consumed samples: 155520 | elapsed time per iteration (ms): 887.5 | learning rate: 3.000E-04 | global batch size: 64 | lm loss: 1.704535E+00 | loss scale: 1.0 | grad norm: 0.447 | number of skipped iterations: 0 | number of nan iterations: 0 | TFLOPs: 106.62 | tokens-per-second-per-gpu: 9230.11 |
+[ip-26-0-155-69:7]:2023-06-21 18:04:42,499 [Rank 63]: time (ms) | forward-compute: 223.36 | backward-compute: 398.12 | backward-params-all-reduce: 226.19 | backward-layernorm-all-reduce: 0.02 | backward-embedding-all-reduce: 0.03 | backward-reduce-model-grads: 226.29 | backward-gather-model-params: 0.01 | optimizer-copy-to-main-grad: 0.52 | optimizer-clip-main-grad: 4.88 | optimizer-count-zeros: 0.01 | optimizer-inner-step: 22.02 | optimizer-copy-main-to-model-params: 8.31 | optimizer: 35.83 | batch-generator: 1.74
+srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
+slurmstepd: error: *** JOB 161653 ON ip-26-0-150-19 CANCELLED AT 2023-06-21T18:04:43 ***
+WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers
+WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652424 closing signal SIGTERM
+WARNING:torch.distributed.elastic.agent.server.api:Received 15 death signal, shutting down workers
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652425 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652426 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652427 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281174 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281175 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281176 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652428 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711971 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652429 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652430 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281177 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281178 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281179 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281180 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 281181 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711972 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711973 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711974 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711975 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 3652431 closing signal SIGTERM
+WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 711976 closing signal SIGTERM