chrisvoncsefalvay committed · verified
Commit 5e1c670 · 1 Parent(s): 3c46ad8

Training in progress, step 5000

Files changed (47)
  1. .amlignore +6 -0
  2. .amlignore.amltmp +6 -0
  3. .gitattributes +1 -0
  4. .gitignore +886 -0
  5. config.json +1 -1
  6. data/.amlignore +6 -0
  7. data/.amlignore.amltmp +6 -0
  8. data/.gitkeep +0 -0
  9. data/custom_vocab.txt +0 -0
  10. model.safetensors +2 -2
  11. notebooks/.amlignore +6 -0
  12. notebooks/.amlignore.amltmp +6 -0
  13. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-1-27-56Z.ipynb +744 -0
  14. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-1-52-4Z.ipynb +788 -0
  15. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-13-2-30Z.ipynb +1147 -0
  16. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-15-7-36Z.ipynb +1452 -0
  17. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-16-26-9Z.ipynb +1246 -0
  18. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-20-56-58Z.ipynb +993 -0
  19. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-23-54-39Z.ipynb +692 -0
  20. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-3-12-1Z.ipynb +1053 -0
  21. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-4-13-53Z.ipynb +0 -0
  22. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-14-26-30Z.ipynb +739 -0
  23. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-16-5-15Z.ipynb +729 -0
  24. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-17-44-52Z.ipynb +739 -0
  25. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-3-40-27Z.ipynb +1001 -0
  26. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-4-40-54Z.ipynb +1073 -0
  27. notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-30-21-44-8Z.ipynb +671 -0
  28. notebooks/.ipynb_aml_checkpoints/microsample_model_comparison-checkpoint2024-0-31-14-6-22Z.ipynb +0 -0
  29. notebooks/DAEDRA-Copy1.ipynb +1634 -0
  30. notebooks/DAEDRA.ipynb +671 -0
  31. notebooks/DAEDRA.yml +0 -0
  32. notebooks/Dataset preparation.ipynb +524 -0
  33. notebooks/Untitled.ipynb +33 -0
  34. notebooks/comparisons.csv +3 -0
  35. notebooks/daedra.ipynb.amltmp +671 -0
  36. notebooks/daedra.py +134 -0
  37. notebooks/daedra.py.amltmp +134 -0
  38. notebooks/daedra_final_training.py.amltmp +136 -0
  39. notebooks/emissions.csv +3 -0
  40. notebooks/emissions.csv.amltmp +3 -0
  41. notebooks/microsample_model_comparison.ipynb +0 -0
  42. notebooks/tokenizer.json +0 -0
  43. notebooks/wandb/.amlignore +6 -0
  44. notebooks/wandb/.amlignore.amltmp +6 -0
  45. paper/.gitkeep +0 -0
  46. tokenizer.json +6 -1
  47. training_args.bin +1 -1
.amlignore ADDED
@@ -0,0 +1,6 @@
+ ## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+ ## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+ .ipynb_aml_checkpoints/
+ *.amltmp
+ *.amltemp
.amlignore.amltmp ADDED
@@ -0,0 +1,6 @@
+ ## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+ ## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+ .ipynb_aml_checkpoints/
+ *.amltmp
+ *.amltemp
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ notebooks/comparisons.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,886 @@
+ ### JetBrains template
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+ # User-specific stuff
+ .idea/**/workspace.xml
+ .idea/**/tasks.xml
+ .idea/**/usage.statistics.xml
+ .idea/**/dictionaries
+ .idea/**/shelf
+
+ # Data folder
+ data/*.csv
+
+ # AWS User-specific
+ .idea/**/aws.xml
+
+ # Generated files
+ .idea/**/contentModel.xml
+
+ # Sensitive or high-churn files
+ .idea/**/dataSources/
+ .idea/**/dataSources.ids
+ .idea/**/dataSources.local.xml
+ .idea/**/sqlDataSources.xml
+ .idea/**/dynamic.xml
+ .idea/**/uiDesigner.xml
+ .idea/**/dbnavigator.xml
+
+ # Gradle
+ .idea/**/gradle.xml
+ .idea/**/libraries
+
+ # Gradle and Maven with auto-import
+ # When using Gradle or Maven with auto-import, you should exclude module files,
+ # since they will be recreated, and may cause churn. Uncomment if using
+ # auto-import.
+ # .idea/artifacts
+ # .idea/compiler.xml
+ # .idea/jarRepositories.xml
+ # .idea/modules.xml
+ # .idea/*.iml
+ # .idea/modules
+ # *.iml
+ # *.ipr
+
+ # CMake
+ cmake-build-*/
+
+ # Mongo Explorer plugin
+ .idea/**/mongoSettings.xml
+
+ # File-based project format
+ *.iws
+
+ # IntelliJ
+ out/
+
+ # mpeltonen/sbt-idea plugin
+ .idea_modules/
+
+ # JIRA plugin
+ atlassian-ide-plugin.xml
+
+ # Cursive Clojure plugin
+ .idea/replstate.xml
+
+ # SonarLint plugin
+ .idea/sonarlint/
+
+ # Crashlytics plugin (for Android Studio and IntelliJ)
+ com_crashlytics_export_strings.xml
+ crashlytics.properties
+ crashlytics-build.properties
+ fabric.properties
+
+ # Editor-based Rest Client
+ .idea/httpRequests
+
+ # Android studio 3.1+ serialized cache file
+ .idea/caches/build_file_checksums.ser
+
+ ### OSX template
+ # General
+ .DS_Store
+ .AppleDouble
+ .LSOverride
+
+ # Icon must end with two \r
+ Icon
+
+ # Thumbnails
+ ._*
+
+ # Files that might appear in the root of a volume
+ .DocumentRevisions-V100
+ .fseventsd
+ .Spotlight-V100
+ .TemporaryItems
+ .Trashes
+ .VolumeIcon.icns
+ .com.apple.timemachine.donotpresent
+
+ # Directories potentially created on remote AFP share
+ .AppleDB
+ .AppleDesktop
+ Network Trash Folder
+ Temporary Items
+ .apdisk
+
+ ### TeX template
+ ## Core latex/pdflatex auxiliary files:
+ *.aux
+ *.lof
+ *.log
+ *.lot
+ *.fls
+ *.out
+ *.toc
+ *.fmt
+ *.fot
+ *.cb
+ *.cb2
+ .*.lb
+
+ ## Intermediate documents:
+ *.dvi
+ *.xdv
+ *-converted-to.*
+ # these rules might exclude image files for figures etc.
+ # *.ps
+ # *.eps
+ # *.pdf
+
+ ## Generated if empty string is given at "Please type another file name for output:"
+ .pdf
+
+ ## Bibliography auxiliary files (bibtex/biblatex/biber):
+ *.bbl
+ *.bcf
+ *.blg
+ *-blx.aux
+ *-blx.bib
+ *.run.xml
+
+ ## Build tool auxiliary files:
+ *.fdb_latexmk
+ *.synctex
+ *.synctex(busy)
+ *.synctex.gz
+ *.synctex.gz(busy)
+ *.pdfsync
+
+ ## Build tool directories for auxiliary files
+ # latexrun
+ latex.out/
+
+ ## Auxiliary and intermediate files from other packages:
+ # algorithms
+ *.alg
+ *.loa
+
+ # achemso
+ acs-*.bib
+
+ # amsthm
+ *.thm
+
+ # beamer
+ *.nav
+ *.pre
+ *.snm
+ *.vrb
+
+ # changes
+ *.soc
+
+ # comment
+ *.cut
+
+ # cprotect
+ *.cpt
+
+ # elsarticle (documentclass of Elsevier journals)
+ *.spl
+
+ # endnotes
+ *.ent
+
+ *.lox
+
+ # feynmf/feynmp
+ *.mf
+ *.mp
+ *.t[1-9]
+ *.t[1-9][0-9]
+ *.tfm
+
+ #(r)(e)ledmac/(r)(e)ledpar
+ *.end
+ *.?end
+ *.[1-9]
+ *.[1-9][0-9]
+ *.[1-9][0-9][0-9]
+ *.[1-9]R
+ *.[1-9][0-9]R
+ *.[1-9][0-9][0-9]R
+ *.eledsec[1-9]
+ *.eledsec[1-9]R
+ *.eledsec[1-9][0-9]
+ *.eledsec[1-9][0-9]R
+ *.eledsec[1-9][0-9][0-9]
+ *.eledsec[1-9][0-9][0-9]R
+
+ # glossaries
+ *.acn
+ *.acr
+ *.glg
+ *.glo
+ *.gls
+ *.glsdefs
+ *.lzo
+ *.lzs
+ *.slg
+ *.slo
+ *.sls
+
+ # uncomment this for glossaries-extra (will ignore makeindex's style files!)
+ # *.ist
+
+ # gnuplot
+ *.gnuplot
+ *.table
+
+ # gnuplottex
+ *-gnuplottex-*
+
+ # gregoriotex
+ *.gaux
+ *.glog
+ *.gtex
+
+ # htlatex
+ *.4ct
+ *.4tc
+ *.idv
+ *.lg
+ *.trc
+ *.xref
+
+ # hyperref
+ *.brf
+
+ # knitr
+ *-concordance.tex
+ # *.tikz
+ *-tikzDictionary
+
+ # listings
+ *.lol
+
+ # luatexja-ruby
+ *.ltjruby
+
+ # makeidx
+ *.idx
+ *.ilg
+ *.ind
+
+ # minitoc
+ *.maf
+ *.mlf
+ *.mlt
+ *.mtc[0-9]*
+ *.slf[0-9]*
+ *.slt[0-9]*
+ *.stc[0-9]*
+
+ # minted
+ _minted*
+ *.pyg
+
+ # morewrites
+ *.mw
+
+ # newpax
+ *.newpax
+
+ # nomencl
+ *.nlg
+ *.nlo
+ *.nls
+
+ # pax
+ *.pax
+
+ # pdfpcnotes
+ *.pdfpc
+
+ # sagetex
+ *.sagetex.sage
+ *.sagetex.py
+ *.sagetex.scmd
+
+ # scrwfile
+ *.wrt
+
+ # svg
+ svg-inkscape/
+
+ # sympy
+ *.sout
+ *.sympy
+ sympy-plots-for-*.tex/
+
+ # pdfcomment
+ *.upa
+ *.upb
+
+ # pythontex
+ *.pytxcode
+ pythontex-files-*/
+
+ # tcolorbox
+ *.listing
+
+ # thmtools
+ *.loe
+
+ # TikZ & PGF
+ *.dpth
+ *.md5
+ *.auxlock
+
+ # titletoc
+ *.ptc
+
+ # todonotes
+ *.tdo
+
+ # vhistory
+ *.hst
+ *.ver
+
+ *.lod
+
+ # xcolor
+ *.xcp
+
+ # xmpincl
+ *.xmpi
+
+ # xindy
+ *.xdy
+
+ # xypic precompiled matrices and outlines
+ *.xyc
+ *.xyd
+
+ # endfloat
+ *.ttt
+ *.fff
+
+ # Latexian
+ TSWLatexianTemp*
+
+ ## Editors:
+ # WinEdt
+ *.bak
+ *.sav
+
+ # Texpad
+ .texpadtmp
+
+ # LyX
+ *.lyx~
+
+ # Kile
+ *.backup
+
+ # gummi
+ .*.swp
+
+ # KBibTeX
+ *~[0-9]*
+
+ # TeXnicCenter
+ *.tps
+
+ # auto folder when using emacs and auctex
+ ./auto/*
+ *.el
+
+ # expex forward references with \gathertags
+ *-tags.tex
+
+ # standalone packages
+ *.sta
+
+ # Makeindex log files
+ *.lpz
+
+ # xwatermark package
+ *.xwm
+
+ # REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
+ # option is specified. Footnotes are the stored in a file with suffix Notes.bib.
+ # Uncomment the next line to have this generated file ignored.
+ #*Notes.bib
+
+ ### JupyterNotebooks template
+ # gitignore template for Jupyter Notebooks
+ # website: http://jupyter.org/
+
+ .ipynb_checkpoints
+ */.ipynb_checkpoints/*
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # Remove previous ipynb_checkpoints
+ # git rm -r .ipynb_checkpoints/
+
+ ### LaTeX template
+ ## Core latex/pdflatex auxiliary files:
+ *.aux
+ *.lof
+ *.log
+ *.lot
+ *.fls
+ *.out
+ *.toc
+ *.fmt
+ *.fot
+ *.cb
+ *.cb2
+ .*.lb
+
+ ## Intermediate documents:
+ *.dvi
+ *.xdv
+ *-converted-to.*
+ # these rules might exclude image files for figures etc.
+ # *.ps
+ # *.eps
+ # *.pdf
+
+ ## Generated if empty string is given at "Please type another file name for output:"
+ .pdf
+
+ ## Bibliography auxiliary files (bibtex/biblatex/biber):
+ *.bbl
+ *.bcf
+ *.blg
+ *-blx.aux
+ *-blx.bib
+ *.run.xml
+
+ ## Build tool auxiliary files:
+ *.fdb_latexmk
+ *.synctex
+ *.synctex(busy)
+ *.synctex.gz
+ *.synctex.gz(busy)
+ *.pdfsync
+
+ ## Build tool directories for auxiliary files
+ # latexrun
+ latex.out/
+
+ ## Auxiliary and intermediate files from other packages:
+ # algorithms
+ *.alg
+ *.loa
+
+ # achemso
+ acs-*.bib
+
+ # amsthm
+ *.thm
+
+ # beamer
+ *.nav
+ *.pre
+ *.snm
+ *.vrb
+
+ # changes
+ *.soc
+
+ # comment
+ *.cut
+
+ # cprotect
+ *.cpt
+
+ # elsarticle (documentclass of Elsevier journals)
+ *.spl
+
+ # endnotes
+ *.ent
+
+ *.lox
+
+ # feynmf/feynmp
+ *.mf
+ *.mp
+ *.t[1-9]
+ *.t[1-9][0-9]
+ *.tfm
+
+ #(r)(e)ledmac/(r)(e)ledpar
+ *.end
+ *.?end
+ *.[1-9]
+ *.[1-9][0-9]
+ *.[1-9][0-9][0-9]
+ *.[1-9]R
+ *.[1-9][0-9]R
+ *.[1-9][0-9][0-9]R
+ *.eledsec[1-9]
+ *.eledsec[1-9]R
+ *.eledsec[1-9][0-9]
+ *.eledsec[1-9][0-9]R
+ *.eledsec[1-9][0-9][0-9]
+ *.eledsec[1-9][0-9][0-9]R
+
+ # glossaries
+ *.acn
+ *.acr
+ *.glg
+ *.glo
+ *.gls
+ *.glsdefs
+ *.lzo
+ *.lzs
+ *.slg
+ *.slo
+ *.sls
+
+ # uncomment this for glossaries-extra (will ignore makeindex's style files!)
+ # *.ist
+
+ # gnuplot
+ *.gnuplot
+ *.table
+
+ # gnuplottex
+ *-gnuplottex-*
+
+ # gregoriotex
+ *.gaux
+ *.glog
+ *.gtex
+
+ # htlatex
+ *.4ct
+ *.4tc
+ *.idv
+ *.lg
+ *.trc
+ *.xref
+
+ # hyperref
+ *.brf
+
+ # knitr
+ *-concordance.tex
+ # *.tikz
+ *-tikzDictionary
+
+ # listings
+ *.lol
+
+ # luatexja-ruby
+ *.ltjruby
+
+ # makeidx
+ *.idx
+ *.ilg
+ *.ind
+
+ # minitoc
+ *.maf
+ *.mlf
+ *.mlt
+ *.mtc[0-9]*
+ *.slf[0-9]*
+ *.slt[0-9]*
+ *.stc[0-9]*
+
+ # minted
+ _minted*
+ *.pyg
+
+ # morewrites
+ *.mw
+
+ # newpax
+ *.newpax
+
+ # nomencl
+ *.nlg
+ *.nlo
+ *.nls
+
+ # pax
+ *.pax
+
+ # pdfpcnotes
+ *.pdfpc
+
+ # sagetex
+ *.sagetex.sage
+ *.sagetex.py
+ *.sagetex.scmd
+
+ # scrwfile
+ *.wrt
+
+ # svg
+ svg-inkscape/
+
+ # sympy
+ *.sout
+ *.sympy
+ sympy-plots-for-*.tex/
+
+ # pdfcomment
+ *.upa
+ *.upb
+
+ # pythontex
+ *.pytxcode
+ pythontex-files-*/
+
+ # tcolorbox
+ *.listing
+
+ # thmtools
+ *.loe
+
+ # TikZ & PGF
+ *.dpth
+ *.md5
+ *.auxlock
+
+ # titletoc
+ *.ptc
+
+ # todonotes
+ *.tdo
+
+ # vhistory
+ *.hst
+ *.ver
+
+ *.lod
+
+ # xcolor
+ *.xcp
+
+ # xmpincl
+ *.xmpi
+
+ # xindy
+ *.xdy
+
+ # xypic precompiled matrices and outlines
+ *.xyc
+ *.xyd
+
+ # endfloat
+ *.ttt
+ *.fff
+
+ # Latexian
+ TSWLatexianTemp*
+
+ ## Editors:
+ # WinEdt
+ *.bak
+ *.sav
+
+ # Texpad
+ .texpadtmp
+
+ # LyX
+ *.lyx~
+
+ # Kile
+ *.backup
+
+ # gummi
+ .*.swp
+
+ # KBibTeX
+ *~[0-9]*
+
+ # TeXnicCenter
+ *.tps
+
+ # auto folder when using emacs and auctex
+ ./auto/*
+ *.el
+
+ # expex forward references with \gathertags
+ *-tags.tex
+
+ # standalone packages
+ *.sta
+
+ # Makeindex log files
+ *.lpz
+
+ # xwatermark package
+ *.xwm
+
+ # REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
+ # option is specified. Footnotes are the stored in a file with suffix Notes.bib.
+ # Uncomment the next line to have this generated file ignored.
+ #*Notes.bib
+
+ ### Python template
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
config.json CHANGED
@@ -42,5 +42,5 @@
  "transformers_version": "4.37.2",
  "type_vocab_size": 2,
  "use_cache": true,
- "vocab_size": 28996
+ "vocab_size": 52000
  }
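The `vocab_size` bump from 28996 (the stock `bert-base-cased` vocabulary size) to 52000 indicates the checkpoint now pairs the model with a retrained tokenizer and a resized embedding matrix (cf. `data/custom_vocab.txt` and the `tokenizer.json` change in this commit). A minimal sketch of how such a resize is typically done with the `transformers` API; the corpus, checkpoint name, and label count below are illustrative assumptions, not taken from this commit:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hypothetical stand-in for the report corpus actually used for tokenizer training.
corpus = ["Patient experienced fever and fatigue after vaccination."]

base_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# Retrain the (fast) tokenizer on the domain corpus, targeting 52,000 tokens.
new_tokenizer = base_tokenizer.train_new_from_iterator(corpus, vocab_size=52000)

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=7)
# Resize the embedding matrix to the new vocabulary; this is what would move
# config.json's "vocab_size" from 28996 to 52000.
model.resize_token_embeddings(len(new_tokenizer))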
data/.amlignore ADDED
@@ -0,0 +1,6 @@
+ ## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+ ## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+ .ipynb_aml_checkpoints/
+ *.amltmp
+ *.amltemp
data/.amlignore.amltmp ADDED
@@ -0,0 +1,6 @@
+ ## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+ ## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+ .ipynb_aml_checkpoints/
+ *.amltmp
+ *.amltemp
data/.gitkeep ADDED
File without changes
data/custom_vocab.txt ADDED
File without changes
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:50f69282f3743ce8bae62eaaa651c74301fe373cb675fddcb86d9ef391b247b6
- size 433289224
+ oid sha256:fd4f0f89ac5e5fa87847f7574a0c4343175b23afd37140f15173824a44dfee61
+ size 503957528
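The weight-file growth is consistent with the vocabulary change above: the LFS pointer grows by 503957528 − 433289224 = 70,668,304 bytes, almost exactly the size of 23,004 new fp32 embedding rows of width 768. A quick worked check (my own arithmetic, not part of the commit):

# Worked check: added bytes vs. an fp32 embedding matrix gaining new rows.
new_rows = 52000 - 28996                  # 23,004 extra vocabulary entries
hidden_size = 768                         # BERT-base hidden dimension
added_bytes = new_rows * hidden_size * 4  # float32 = 4 bytes -> 70,668,288
observed = 503957528 - 433289224          # 70,668,304 bytes per the diff
print(observed - added_bytes)             # 16 bytes left over, plausibly safetensors header growth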
notebooks/.amlignore ADDED
@@ -0,0 +1,6 @@
+ ## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+ ## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+ .ipynb_aml_checkpoints/
+ *.amltmp
+ *.amltemp
notebooks/.amlignore.amltmp ADDED
@@ -0,0 +1,6 @@
+ ## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
+ ## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
+
+ .ipynb_aml_checkpoints/
+ *.amltmp
+ *.amltemp
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-1-27-56Z.ipynb ADDED
@@ -0,0 +1,744 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
+ "\n",
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline\n",
+ "import torch\n",
+ "import os\n",
+ "from typing import List\n",
+ "from datasets import load_dataset\n",
+ "import shap\n",
+ "from sklearn.metrics import f1_score, accuracy_score, classification_report\n",
+ "\n",
+ "%load_ext watermark"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
+ "type": "CODE",
+ "hide_input_from_viewers": false,
+ "hide_output_from_viewers": false,
+ "report_properties": {
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "outputs": [],
+ "source": [
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
+ "\n",
+ "SEED: int = 42\n",
+ "\n",
+ "BATCH_SIZE: int = 8\n",
+ "EPOCHS: int = 1\n",
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
+ "\n",
+ "CLASS_NAMES: List[str] = [\"DIED\",\n",
+ " \"ER_VISIT\",\n",
+ " \"HOSPITAL\",\n",
+ " \"OFC_VISIT\",\n",
+ " \"X_STAY\",\n",
+ " \"DISABLE\",\n",
+ " \"D_PRESENTED\"]\n",
+ "\n",
+ "# WandB configuration\n",
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA model training\" # name your W&B project\n",
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "outputs": [],
+ "source": [
+ "%watermark --iversion"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!nvidia-smi"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Loading the data set"
+ ],
+ "attachments": {},
+ "metadata": {
+ "datalore": {
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
+ "type": "MD",
+ "hide_input_from_viewers": false,
+ "hide_output_from_viewers": false,
+ "report_properties": {
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "outputs": [],
+ "source": [
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Tokenisation and encoding"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "I7n646PIscsUZRoHu6m7zm",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def tokenize_and_encode(examples):\n",
+ " return tokenizer(examples[\"text\"], truncation=True)"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "QBLOSI0yVIslV7v7qX9ZC3",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "cols = dataset[\"train\"].column_names\n",
+ "cols.remove(\"labels\")\n",
+ "ds_enc = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "slHeNysZOX9uWS9PB7jFDb",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Training"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class MultiLabelTrainer(Trainer):\n",
+ " def compute_loss(self, model, inputs, return_outputs=False):\n",
+ " labels = inputs.pop(\"labels\")\n",
+ " outputs = model(**inputs)\n",
+ " logits = outputs.logits\n",
+ " loss_fct = torch.nn.BCEWithLogitsLoss()\n",
+ " loss = loss_fct(logits.view(-1, self.model.config.num_labels),\n",
+ " labels.float().view(-1, self.model.config.num_labels))\n",
+ " return (loss, outputs) if return_outputs else loss"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "itXWkbDw9sqbkMuDP84QoT",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(\"cuda\")"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "ZQU7aW6TV45VmhHOQRzcnF",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def accuracy_threshold(y_pred, y_true, threshold=.5, sigmoid=True):\n",
+ " y_pred = torch.from_numpy(y_pred)\n",
+ " y_true = torch.from_numpy(y_true)\n",
+ "\n",
+ " if sigmoid:\n",
+ " y_pred = y_pred.sigmoid()\n",
+ "\n",
+ " return ((y_pred > threshold) == y_true.bool()).float().mean().item()"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "swhgyyyxoGL8HjnXJtMuSW",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def compute_metrics(eval_pred):\n",
+ " predictions, labels = eval_pred\n",
+ " return {'accuracy_thresh': accuracy_threshold(predictions, labels)}"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "1Uq3HtkaBxtHNAnSwit5cI",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "args = TrainingArguments(\n",
+ " output_dir=\"vaers\",\n",
+ " evaluation_strategy=\"epoch\",\n",
+ " learning_rate=2e-5,\n",
+ " per_device_train_batch_size=BATCH_SIZE,\n",
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
+ " num_train_epochs=EPOCHS,\n",
+ " weight_decay=.01,\n",
+ " report_to=[\"wandb\"]\n",
+ ")"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "1iPZOTKPwSkTgX5dORqT89",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "multi_label_trainer = MultiLabelTrainer(\n",
+ " model, \n",
+ " args, \n",
+ " train_dataset=ds_enc[\"train\"], \n",
+ " eval_dataset=ds_enc[\"test\"], \n",
+ " compute_metrics=compute_metrics, \n",
+ " tokenizer=tokenizer\n",
+ ")"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "bnRkNvRYltLun6gCEgL7v0",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "multi_label_trainer.evaluate()"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "LO54PlDkWQdFrzV25FvduB",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "multi_label_trainer.train()"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "hf0Ei1QXEYDmBv1VNLZ4Zw",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Evaluation"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "We instantiate a classifier `pipeline` and push it to CUDA."
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "classifier = pipeline(\"text-classification\", \n",
+ " model, \n",
+ " tokenizer=tokenizer, \n",
+ " device=\"cuda:0\")"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "kHoUdBeqcyVXDSGv54C4aE",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "We use the same tokenizer used for training to tokenize/encode the validation set."
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "test_encodings = tokenizer.batch_encode_plus(dataset[\"validate\"][\"text\"], \n",
+ " max_length=255, \n",
+ " pad_to_max_length=True, \n",
+ " return_token_type_ids=True, \n",
+ " truncation=True)"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "Dr5WCWA6jL51NR1fSrQu6Z",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Once we've made the data loadable by putting it into a `DataLoader`, we "
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "test_data = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), \n",
+ " torch.tensor(test_encodings['attention_mask']), \n",
+ " torch.tensor(ds_enc[\"validate\"][\"labels\"]), \n",
+ " torch.tensor(test_encodings['token_type_ids']))\n",
+ "test_dataloader = torch.utils.data.DataLoader(test_data, \n",
+ " sampler=torch.utils.data.SequentialSampler(test_data), \n",
+ " batch_size=BATCH_SIZE)"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "MWfGq2tTkJNzFiDoUPq2X7",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model.eval()\n",
+ "\n",
+ "logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []\n",
+ "\n",
+ "for i, batch in enumerate(test_dataloader):\n",
+ " batch = tuple(t.to(device) for t in batch)\n",
+ " # Unpack the inputs from our dataloader\n",
+ " b_input_ids, b_input_mask, b_labels, b_token_types = batch\n",
+ " \n",
+ " with torch.no_grad():\n",
+ " outs = model(b_input_ids, attention_mask=b_input_mask)\n",
+ " b_logit_pred = outs[0]\n",
+ " pred_label = torch.sigmoid(b_logit_pred)\n",
+ "\n",
+ " b_logit_pred = b_logit_pred.detach().cpu().numpy()\n",
+ " pred_label = pred_label.to('cpu').numpy()\n",
+ " b_labels = b_labels.to('cpu').numpy()\n",
+ "\n",
+ " tokenized_texts.append(b_input_ids)\n",
+ " logit_preds.append(b_logit_pred)\n",
+ " true_labels.append(b_labels)\n",
+ " pred_labels.append(pred_label)\n",
+ "\n",
+ "# Flatten outputs\n",
+ "tokenized_texts = [item for sublist in tokenized_texts for item in sublist]\n",
+ "pred_labels = [item for sublist in pred_labels for item in sublist]\n",
+ "true_labels = [item for sublist in true_labels for item in sublist]\n",
+ "\n",
+ "# Converting flattened binary values to boolean values\n",
+ "true_bools = [tl == 1 for tl in true_labels]\n",
+ "pred_bools = [pl > 0.50 for pl in pred_labels] "
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "1SJCSrQTRCexFCNCIyRrzL",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "We create a classification report:"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools, average='micro'))\n",
+ "print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools), '\\n')\n",
+ "clf_report = classification_report(true_bools, pred_bools, target_names=CLASS_NAMES)\n",
+ "print(clf_report)"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "eBprrgF086mznPbPVBpOLS",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Finally, we render a 'head to head' comparison table that maps each text prediction to actual and predicted labels."
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Creating a map of class names from class numbers\n",
+ "idx2label = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "yELHY0IEwMlMw3x6e7hoD1",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "true_label_idxs, pred_label_idxs = [], []\n",
+ "\n",
+ "for vals in true_bools:\n",
+ " true_label_idxs.append(np.where(vals)[0].flatten().tolist())\n",
+ "for vals in pred_bools:\n",
+ " pred_label_idxs.append(np.where(vals)[0].flatten().tolist())"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "jH0S35dDteUch01sa6me6e",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "true_label_texts, pred_label_texts = [], []\n",
+ "\n",
+ "for vals in true_label_idxs:\n",
+ " if vals:\n",
+ " true_label_texts.append([idx2label[val] for val in vals])\n",
+ " else:\n",
+ " true_label_texts.append(vals)\n",
+ "\n",
+ "for vals in pred_label_idxs:\n",
+ " if vals:\n",
+ " pred_label_texts.append([idx2label[val] for val in vals])\n",
+ " else:\n",
+ " pred_label_texts.append(vals)"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "h4vHL8XdGpayZ6xLGJUF6F",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "symptom_texts = [tokenizer.decode(text,\n",
+ " skip_special_tokens=True,\n",
+ " clean_up_tokenization_spaces=False) for text in tokenized_texts]"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "SxUmVHfQISEeptg1SawOmB",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "comparisons_df = pd.DataFrame({'symptom_text': symptom_texts, \n",
+ " 'true_labels': true_label_texts, \n",
+ " 'pred_labels':pred_label_texts})\n",
+ "comparisons_df.to_csv('comparisons.csv')\n",
+ "comparisons_df"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "BxFNigNGRLTOqraI55BPSH",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Shapley analysis"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "explainer = shap.Explainer(classifier, output_names=CLASS_NAMES)"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "OpdZcoenX2HwzLdai7K5UA",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "shap_values = explainer(dataset[\"validate\"][\"text\"][1:2])"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "FvbCMfIDlcf16YSvb8wNQv",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "shap.plots.text(shap_values)"
+ ],
+ "execution_count": null,
+ "outputs": [],
+ "metadata": {
+ "datalore": {
+ "node_id": "TSxvakWLPCpjVMWi9ZdEbd",
+ "type": "CODE",
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true
+ }
+ }
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "name": "python3",
+ "language": "python",
+ "display_name": "Python 3 (ipykernel)"
+ },
+ "datalore": {
+ "computation_mode": "JUPYTER",
+ "package_manager": "pip",
+ "base_environment": "default",
+ "packages": [
+ {
+ "name": "datasets",
+ "version": "2.16.1",
+ "source": "PIP"
+ },
+ {
+ "name": "torch",
+ "version": "2.1.2",
+ "source": "PIP"
+ },
+ {
+ "name": "accelerate",
+ "version": "0.26.1",
+ "source": "PIP"
+ }
+ ],
+ "report_row_ids": [
+ "un8W7ez7ZwoGb5Co6nydEV",
+ "40nN9Hvgi1clHNV5RAemI5",
+ "TgRD90H5NSPpKS41OeXI1w",
+ "ZOm5BfUs3h1EGLaUkBGeEB",
+ "kOP0CZWNSk6vqE3wkPp7Vc",
+ "W4PWcOu2O2pRaZyoE2W80h",
+ "RolbOnQLIftk0vy9mIcz5M",
+ "8OPhUgbaNJmOdiq5D3a6vK",
+ "5Qrt3jSvSrpK6Ne1hS6shL",
+ "hTq7nFUrovN5Ao4u6dIYWZ",
+ "I8WNZLpJ1DVP2wiCW7YBIB",
+ "SawhU3I9BewSE1XBPstpNJ",
+ "80EtLEl2FIE4FqbWnUD3nT"
+ ],
+ "version": 3
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
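For readers skimming the diff: the notebook above trains a multi-label classifier via a custom `MultiLabelTrainer` whose loss is `BCEWithLogitsLoss` over seven outcome labels. Note that the cell calling `from_pretrained(..., num_labels=num_labels)` never defines `num_labels`; it is presumably `len(CLASS_NAMES)`. A self-contained sketch of that loss and the 0.5-threshold decision rule the evaluation cells use, on illustrative tensors:

import torch

CLASS_NAMES = ["DIED", "ER_VISIT", "HOSPITAL", "OFC_VISIT", "X_STAY", "DISABLE", "D_PRESENTED"]
num_labels = len(CLASS_NAMES)  # the value the notebook presumably intends

# Illustrative batch: raw logits from the classification head and multi-hot labels.
logits = torch.randn(4, num_labels)
labels = torch.randint(0, 2, (4, num_labels)).float()

# The same loss MultiLabelTrainer.compute_loss applies:
loss = torch.nn.BCEWithLogitsLoss()(logits.view(-1, num_labels),
                                    labels.view(-1, num_labels))

# Inference-side thresholding, mirroring pred_bools = [pl > 0.50 ...]:
pred_bools = torch.sigmoid(logits) > 0.5
print(loss.item(), pred_bools.shape)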
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-1-52-4Z.ipynb ADDED
@@ -0,0 +1,788 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
7
+ "\n",
8
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
9
+ ],
10
+ "metadata": {
11
+ "collapsed": false
12
+ }
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "source": [
17
+ "import pandas as pd\n",
18
+ "import numpy as np\n",
19
+ "import torch\n",
20
+ "import os\n",
21
+ "from typing import List\n",
22
+ "from datasets import load_dataset\n",
23
+ "import shap\n",
24
+ "from sklearn.metrics import f1_score, accuracy_score, classification_report\n",
25
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline\n",
26
+ "\n",
27
+ "%load_ext watermark"
28
+ ],
29
+ "outputs": [
30
+ {
31
+ "output_type": "error",
32
+ "ename": "ModuleNotFoundError",
33
+ "evalue": "No module named 'torch'",
34
+ "traceback": [
35
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
36
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
37
+ "Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m List\n",
38
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'torch'"
39
+ ]
40
+ }
41
+ ],
42
+ "execution_count": 2,
43
+ "metadata": {
44
+ "datalore": {
45
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
46
+ "type": "CODE",
47
+ "hide_input_from_viewers": false,
48
+ "hide_output_from_viewers": false,
49
+ "report_properties": {
50
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
51
+ }
52
+ },
53
+ "gather": {
54
+ "logged": 1706406690290
55
+ }
56
+ }
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "source": [
61
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
62
+ "\n",
63
+ "SEED: int = 42\n",
64
+ "\n",
65
+ "BATCH_SIZE: int = 8\n",
66
+ "EPOCHS: int = 1\n",
67
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
68
+ "\n",
69
+ "CLASS_NAMES: List[str] = [\"DIED\",\n",
70
+ " \"ER_VISIT\",\n",
71
+ " \"HOSPITAL\",\n",
72
+ " \"OFC_VISIT\",\n",
73
+ " \"X_STAY\",\n",
74
+ " \"DISABLE\",\n",
75
+ " \"D_PRESENTED\"]\n",
76
+ "\n",
77
+ "# WandB configuration\n",
78
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA model training\" # name your W&B project\n",
79
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints"
80
+ ],
81
+ "outputs": [],
82
+ "execution_count": null,
83
+ "metadata": {
84
+ "collapsed": false
85
+ }
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "source": [
90
+ "%watermark --iversion"
91
+ ],
92
+ "outputs": [],
93
+ "execution_count": null,
94
+ "metadata": {
95
+ "collapsed": false
96
+ }
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "source": [
101
+ "!nvidia-smi"
102
+ ],
103
+ "outputs": [
104
+ {
105
+ "output_type": "stream",
106
+ "name": "stdout",
107
+ "text": "Sun Jan 28 01:31:42 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. |\r\n|=========================================+======================+======================|\r\n| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n| N/A 28C P0 37W / 250W | 0MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n| N/A 29C P0 36W / 250W | 0MiB / 16384MiB | 1% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n"
108
+ }
109
+ ],
110
+ "execution_count": 4,
111
+ "metadata": {
112
+ "datalore": {
113
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
114
+ "type": "CODE",
115
+ "hide_input_from_viewers": true,
116
+ "hide_output_from_viewers": true
117
+ }
118
+ }
119
+ },
120
+ {
121
+ "attachments": {},
122
+ "cell_type": "markdown",
123
+ "source": [
124
+ "## Loading the data set"
125
+ ],
126
+ "metadata": {
127
+ "datalore": {
128
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
129
+ "type": "MD",
130
+ "hide_input_from_viewers": false,
131
+ "hide_output_from_viewers": false,
132
+ "report_properties": {
133
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
134
+ }
135
+ }
136
+ }
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "source": [
141
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
142
+ ],
143
+ "outputs": [],
144
+ "execution_count": null,
145
+ "metadata": {
146
+ "collapsed": false
147
+ }
148
+ },
149
+ {
150
+ "cell_type": "markdown",
151
+ "source": [
152
+ "### Tokenisation and encoding"
153
+ ],
154
+ "metadata": {
155
+ "collapsed": false
156
+ }
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "source": [
161
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
162
+ ],
163
+ "outputs": [],
164
+ "execution_count": null,
165
+ "metadata": {
166
+ "datalore": {
167
+ "node_id": "I7n646PIscsUZRoHu6m7zm",
168
+ "type": "CODE",
169
+ "hide_input_from_viewers": true,
170
+ "hide_output_from_viewers": true
171
+ }
172
+ }
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "source": [
177
+ "def tokenize_and_encode(examples):\n",
178
+ " return tokenizer(examples[\"text\"], truncation=True)"
179
+ ],
180
+ "outputs": [],
181
+ "execution_count": null,
182
+ "metadata": {
183
+ "datalore": {
184
+ "node_id": "QBLOSI0yVIslV7v7qX9ZC3",
185
+ "type": "CODE",
186
+ "hide_input_from_viewers": true,
187
+ "hide_output_from_viewers": true
188
+ }
189
+ }
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "source": [
194
+ "cols = dataset[\"train\"].column_names\n",
195
+ "cols.remove(\"labels\")\n",
196
+ "ds_enc = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)"
197
+ ],
198
+ "outputs": [],
199
+ "execution_count": null,
200
+ "metadata": {
201
+ "datalore": {
202
+ "node_id": "slHeNysZOX9uWS9PB7jFDb",
203
+ "type": "CODE",
204
+ "hide_input_from_viewers": true,
205
+ "hide_output_from_viewers": true
206
+ }
207
+ }
208
+ },
209
+ {
210
+ "cell_type": "markdown",
211
+ "source": [
212
+ "### Training"
213
+ ],
214
+ "metadata": {
215
+ "collapsed": false
216
+ }
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "source": [
221
+ "class MultiLabelTrainer(Trainer):\n",
222
+ " def compute_loss(self, model, inputs, return_outputs=False):\n",
223
+ " labels = inputs.pop(\"labels\")\n",
224
+ " outputs = model(**inputs)\n",
225
+ " logits = outputs.logits\n",
226
+ " loss_fct = torch.nn.BCEWithLogitsLoss()\n",
227
+ " loss = loss_fct(logits.view(-1, self.model.config.num_labels),\n",
228
+ " labels.float().view(-1, self.model.config.num_labels))\n",
229
+ " return (loss, outputs) if return_outputs else loss"
230
+ ],
231
+ "outputs": [],
232
+ "execution_count": null,
233
+ "metadata": {
234
+ "datalore": {
235
+ "node_id": "itXWkbDw9sqbkMuDP84QoT",
236
+ "type": "CODE",
237
+ "hide_input_from_viewers": true,
238
+ "hide_output_from_viewers": true
239
+ }
240
+ }
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "source": [
245
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(\"cuda\")"
246
+ ],
247
+ "outputs": [],
248
+ "execution_count": null,
249
+ "metadata": {
250
+ "datalore": {
251
+ "node_id": "ZQU7aW6TV45VmhHOQRzcnF",
252
+ "type": "CODE",
253
+ "hide_input_from_viewers": true,
254
+ "hide_output_from_viewers": true
255
+ }
256
+ }
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "source": [
261
+ "def accuracy_threshold(y_pred, y_true, threshold=.5, sigmoid=True):\n",
262
+ " y_pred = torch.from_numpy(y_pred)\n",
263
+ " y_true = torch.from_numpy(y_true)\n",
264
+ "\n",
265
+ " if sigmoid:\n",
266
+ " y_pred = y_pred.sigmoid()\n",
267
+ "\n",
268
+ " return ((y_pred > threshold) == y_true.bool()).float().mean().item()"
269
+ ],
270
+ "outputs": [],
271
+ "execution_count": null,
272
+ "metadata": {
273
+ "datalore": {
274
+ "node_id": "swhgyyyxoGL8HjnXJtMuSW",
275
+ "type": "CODE",
276
+ "hide_input_from_viewers": true,
277
+ "hide_output_from_viewers": true
278
+ }
279
+ }
280
+ },
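+ {
+ "cell_type": "markdown",
+ "source": [
+ "`accuracy_threshold` sigmoids the logits, binarises them at the threshold, and returns the proportion of individual label slots (not whole examples) that match. A hand-picked example: six of the eight slots below agree after thresholding, so the result is 0.75."
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Illustrative only: two invented examples with four label slots each\n",
+ "accuracy_threshold(np.array([[3.0, -2.0, 0.1, -0.2], [1.5, 2.0, -1.0, -0.5]]),\n",
+ "                   np.array([[1.0, 0.0, 1.0, 1.0], [1.0, 0.0, 0.0, 0.0]]))"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ }
+ },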
281
+ {
282
+ "cell_type": "code",
283
+ "source": [
284
+ "def compute_metrics(eval_pred):\n",
285
+ " predictions, labels = eval_pred\n",
286
+ " return {'accuracy_thresh': accuracy_threshold(predictions, labels)}"
287
+ ],
288
+ "outputs": [],
289
+ "execution_count": null,
290
+ "metadata": {
291
+ "datalore": {
292
+ "node_id": "1Uq3HtkaBxtHNAnSwit5cI",
293
+ "type": "CODE",
294
+ "hide_input_from_viewers": true,
295
+ "hide_output_from_viewers": true
296
+ }
297
+ }
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "source": [
302
+ "args = TrainingArguments(\n",
303
+ " output_dir=\"vaers\",\n",
304
+ " evaluation_strategy=\"epoch\",\n",
305
+ " learning_rate=2e-5,\n",
306
+ " per_device_train_batch_size=BATCH_SIZE,\n",
307
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
308
+ " num_train_epochs=EPOCHS,\n",
309
+ " weight_decay=.01,\n",
310
+ " report_to=[\"wandb\"]\n",
311
+ ")"
312
+ ],
313
+ "outputs": [],
314
+ "execution_count": null,
315
+ "metadata": {
316
+ "datalore": {
317
+ "node_id": "1iPZOTKPwSkTgX5dORqT89",
318
+ "type": "CODE",
319
+ "hide_input_from_viewers": true,
320
+ "hide_output_from_viewers": true
321
+ }
322
+ }
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "source": [
327
+ "multi_label_trainer = MultiLabelTrainer(\n",
328
+ " model, \n",
329
+ " args, \n",
330
+ " train_dataset=ds_enc[\"train\"], \n",
331
+ " eval_dataset=ds_enc[\"test\"], \n",
332
+ " compute_metrics=compute_metrics, \n",
333
+ " tokenizer=tokenizer\n",
334
+ ")"
335
+ ],
336
+ "outputs": [],
337
+ "execution_count": null,
338
+ "metadata": {
339
+ "datalore": {
340
+ "node_id": "bnRkNvRYltLun6gCEgL7v0",
341
+ "type": "CODE",
342
+ "hide_input_from_viewers": true,
343
+ "hide_output_from_viewers": true
344
+ }
345
+ }
346
+ },
347
+ {
348
+ "cell_type": "code",
349
+ "source": [
350
+ "multi_label_trainer.evaluate()"
351
+ ],
352
+ "outputs": [],
353
+ "execution_count": null,
354
+ "metadata": {
355
+ "datalore": {
356
+ "node_id": "LO54PlDkWQdFrzV25FvduB",
357
+ "type": "CODE",
358
+ "hide_input_from_viewers": true,
359
+ "hide_output_from_viewers": true
360
+ }
361
+ }
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "source": [
366
+ "multi_label_trainer.train()"
367
+ ],
368
+ "outputs": [],
369
+ "execution_count": null,
370
+ "metadata": {
371
+ "datalore": {
372
+ "node_id": "hf0Ei1QXEYDmBv1VNLZ4Zw",
373
+ "type": "CODE",
374
+ "hide_input_from_viewers": true,
375
+ "hide_output_from_viewers": true
376
+ }
377
+ }
378
+ },
379
+ {
380
+ "cell_type": "markdown",
381
+ "source": [
382
+ "### Evaluation"
383
+ ],
384
+ "metadata": {
385
+ "collapsed": false
386
+ }
387
+ },
388
+ {
389
+ "cell_type": "markdown",
390
+ "source": [
391
+ "We instantiate a classifier `pipeline` and push it to CUDA."
392
+ ],
393
+ "metadata": {
394
+ "collapsed": false
395
+ }
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "source": [
400
+ "classifier = pipeline(\"text-classification\", \n",
401
+ " model, \n",
402
+ " tokenizer=tokenizer, \n",
403
+ " device=\"cuda:0\")"
404
+ ],
405
+ "outputs": [],
406
+ "execution_count": null,
407
+ "metadata": {
408
+ "datalore": {
409
+ "node_id": "kHoUdBeqcyVXDSGv54C4aE",
410
+ "type": "CODE",
411
+ "hide_input_from_viewers": true,
412
+ "hide_output_from_viewers": true
413
+ }
414
+ }
415
+ },
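+ {
+ "cell_type": "markdown",
+ "source": [
+ "The pipeline can then score a single report directly (the text below is invented, purely to show the call):"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Illustrative only: classify one invented report\n",
+ "classifier(\"Patient presented to the emergency department with urticaria two days after vaccination.\")"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ }
+ },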
416
+ {
417
+ "cell_type": "markdown",
418
+ "source": [
419
+ "We use the same tokenizer used for training to tokenize/encode the validation set."
420
+ ],
421
+ "metadata": {
422
+ "collapsed": false
423
+ }
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "source": [
428
+ "test_encodings = tokenizer.batch_encode_plus(dataset[\"validate\"][\"text\"], \n",
429
+ " max_length=255, \n",
430
+ " pad_to_max_length=True, \n",
431
+ " return_token_type_ids=True, \n",
432
+ " truncation=True)"
433
+ ],
434
+ "outputs": [],
435
+ "execution_count": null,
436
+ "metadata": {
437
+ "datalore": {
438
+ "node_id": "Dr5WCWA6jL51NR1fSrQu6Z",
439
+ "type": "CODE",
440
+ "hide_input_from_viewers": true,
441
+ "hide_output_from_viewers": true
442
+ }
443
+ }
444
+ },
445
+ {
446
+ "cell_type": "markdown",
447
+ "source": [
448
+ "Once we've made the data loadable by putting it into a `DataLoader`, we "
449
+ ],
450
+ "metadata": {
451
+ "collapsed": false
452
+ }
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "source": [
457
+ "test_data = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), \n",
458
+ " torch.tensor(test_encodings['attention_mask']), \n",
459
+ " torch.tensor(ds_enc[\"validate\"][\"labels\"]), \n",
460
+ " torch.tensor(test_encodings['token_type_ids']))\n",
461
+ "test_dataloader = torch.utils.data.DataLoader(test_data, \n",
462
+ " sampler=torch.utils.data.SequentialSampler(test_data), \n",
463
+ " batch_size=BATCH_SIZE)"
464
+ ],
465
+ "outputs": [],
466
+ "execution_count": null,
467
+ "metadata": {
468
+ "datalore": {
469
+ "node_id": "MWfGq2tTkJNzFiDoUPq2X7",
470
+ "type": "CODE",
471
+ "hide_input_from_viewers": true,
472
+ "hide_output_from_viewers": true
473
+ }
474
+ }
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "source": [
479
+ "model.eval()\n",
480
+ "\n",
481
+ "logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []\n",
482
+ "\n",
483
+ "for i, batch in enumerate(test_dataloader):\n",
484
+ " batch = tuple(t.to(device) for t in batch)\n",
485
+ " # Unpack the inputs from our dataloader\n",
486
+ " b_input_ids, b_input_mask, b_labels, b_token_types = batch\n",
487
+ " \n",
488
+ " with torch.no_grad():\n",
489
+ " outs = model(b_input_ids, attention_mask=b_input_mask)\n",
490
+ " b_logit_pred = outs[0]\n",
491
+ " pred_label = torch.sigmoid(b_logit_pred)\n",
492
+ "\n",
493
+ " b_logit_pred = b_logit_pred.detach().cpu().numpy()\n",
494
+ " pred_label = pred_label.to('cpu').numpy()\n",
495
+ " b_labels = b_labels.to('cpu').numpy()\n",
496
+ "\n",
497
+ " tokenized_texts.append(b_input_ids)\n",
498
+ " logit_preds.append(b_logit_pred)\n",
499
+ " true_labels.append(b_labels)\n",
500
+ " pred_labels.append(pred_label)\n",
501
+ "\n",
502
+ "# Flatten outputs\n",
503
+ "tokenized_texts = [item for sublist in tokenized_texts for item in sublist]\n",
504
+ "pred_labels = [item for sublist in pred_labels for item in sublist]\n",
505
+ "true_labels = [item for sublist in true_labels for item in sublist]\n",
506
+ "\n",
507
+ "# Converting flattened binary values to boolean values\n",
508
+ "true_bools = [tl == 1 for tl in true_labels]\n",
509
+ "pred_bools = [pl > 0.50 for pl in pred_labels] "
510
+ ],
511
+ "outputs": [],
512
+ "execution_count": null,
513
+ "metadata": {
514
+ "datalore": {
515
+ "node_id": "1SJCSrQTRCexFCNCIyRrzL",
516
+ "type": "CODE",
517
+ "hide_input_from_viewers": true,
518
+ "hide_output_from_viewers": true
519
+ }
520
+ }
521
+ },
522
+ {
523
+ "cell_type": "markdown",
524
+ "source": [
525
+ "We create a classification report:"
526
+ ],
527
+ "metadata": {
528
+ "collapsed": false
529
+ }
530
+ },
531
+ {
532
+ "cell_type": "code",
533
+ "source": [
534
+ "print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools, average='micro'))\n",
535
+ "print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools), '\\n')\n",
536
+ "clf_report = classification_report(true_bools, pred_bools, target_names=CLASS_NAMES)\n",
537
+ "print(clf_report)"
538
+ ],
539
+ "outputs": [],
540
+ "execution_count": null,
541
+ "metadata": {
542
+ "datalore": {
543
+ "node_id": "eBprrgF086mznPbPVBpOLS",
544
+ "type": "CODE",
545
+ "hide_input_from_viewers": true,
546
+ "hide_output_from_viewers": true
547
+ }
548
+ }
549
+ },
550
+ {
551
+ "cell_type": "markdown",
552
+ "source": [
553
+ "Finally, we render a 'head to head' comparison table that maps each text prediction to actual and predicted labels."
554
+ ],
555
+ "metadata": {
556
+ "collapsed": false
557
+ }
558
+ },
559
+ {
560
+ "cell_type": "code",
561
+ "source": [
562
+ "# Creating a map of class names from class numbers\n",
563
+ "idx2label = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))"
564
+ ],
565
+ "outputs": [],
566
+ "execution_count": null,
567
+ "metadata": {
568
+ "datalore": {
569
+ "node_id": "yELHY0IEwMlMw3x6e7hoD1",
570
+ "type": "CODE",
571
+ "hide_input_from_viewers": true,
572
+ "hide_output_from_viewers": true
573
+ }
574
+ }
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "source": [
579
+ "true_label_idxs, pred_label_idxs = [], []\n",
580
+ "\n",
581
+ "for vals in true_bools:\n",
582
+ " true_label_idxs.append(np.where(vals)[0].flatten().tolist())\n",
583
+ "for vals in pred_bools:\n",
584
+ " pred_label_idxs.append(np.where(vals)[0].flatten().tolist())"
585
+ ],
586
+ "outputs": [],
587
+ "execution_count": null,
588
+ "metadata": {
589
+ "datalore": {
590
+ "node_id": "jH0S35dDteUch01sa6me6e",
591
+ "type": "CODE",
592
+ "hide_input_from_viewers": true,
593
+ "hide_output_from_viewers": true
594
+ }
595
+ }
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "source": [
600
+ "true_label_texts, pred_label_texts = [], []\n",
601
+ "\n",
602
+ "for vals in true_label_idxs:\n",
603
+ " if vals:\n",
604
+ " true_label_texts.append([idx2label[val] for val in vals])\n",
605
+ " else:\n",
606
+ " true_label_texts.append(vals)\n",
607
+ "\n",
608
+ "for vals in pred_label_idxs:\n",
609
+ " if vals:\n",
610
+ " pred_label_texts.append([idx2label[val] for val in vals])\n",
611
+ " else:\n",
612
+ " pred_label_texts.append(vals)"
613
+ ],
614
+ "outputs": [],
615
+ "execution_count": null,
616
+ "metadata": {
617
+ "datalore": {
618
+ "node_id": "h4vHL8XdGpayZ6xLGJUF6F",
619
+ "type": "CODE",
620
+ "hide_input_from_viewers": true,
621
+ "hide_output_from_viewers": true
622
+ }
623
+ }
624
+ },
625
+ {
626
+ "cell_type": "code",
627
+ "source": [
628
+ "symptom_texts = [tokenizer.decode(text,\n",
629
+ " skip_special_tokens=True,\n",
630
+ " clean_up_tokenization_spaces=False) for text in tokenized_texts]"
631
+ ],
632
+ "outputs": [],
633
+ "execution_count": null,
634
+ "metadata": {
635
+ "datalore": {
636
+ "node_id": "SxUmVHfQISEeptg1SawOmB",
637
+ "type": "CODE",
638
+ "hide_input_from_viewers": true,
639
+ "hide_output_from_viewers": true
640
+ }
641
+ }
642
+ },
643
+ {
644
+ "cell_type": "code",
645
+ "source": [
646
+ "comparisons_df = pd.DataFrame({'symptom_text': symptom_texts, \n",
647
+ " 'true_labels': true_label_texts, \n",
648
+ " 'pred_labels':pred_label_texts})\n",
649
+ "comparisons_df.to_csv('comparisons.csv')\n",
650
+ "comparisons_df"
651
+ ],
652
+ "outputs": [],
653
+ "execution_count": null,
654
+ "metadata": {
655
+ "datalore": {
656
+ "node_id": "BxFNigNGRLTOqraI55BPSH",
657
+ "type": "CODE",
658
+ "hide_input_from_viewers": true,
659
+ "hide_output_from_viewers": true
660
+ }
661
+ }
662
+ },
663
+ {
664
+ "cell_type": "markdown",
665
+ "source": [
666
+ "### Shapley analysis"
667
+ ],
668
+ "metadata": {
669
+ "collapsed": false
670
+ }
671
+ },
672
+ {
673
+ "cell_type": "code",
674
+ "source": [
675
+ "explainer = shap.Explainer(classifier, output_names=CLASS_NAMES)"
676
+ ],
677
+ "outputs": [],
678
+ "execution_count": null,
679
+ "metadata": {
680
+ "datalore": {
681
+ "node_id": "OpdZcoenX2HwzLdai7K5UA",
682
+ "type": "CODE",
683
+ "hide_input_from_viewers": true,
684
+ "hide_output_from_viewers": true
685
+ }
686
+ }
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "source": [
691
+ "shap_values = explainer(dataset[\"validate\"][\"text\"][1:2])"
692
+ ],
693
+ "outputs": [],
694
+ "execution_count": null,
695
+ "metadata": {
696
+ "datalore": {
697
+ "node_id": "FvbCMfIDlcf16YSvb8wNQv",
698
+ "type": "CODE",
699
+ "hide_input_from_viewers": true,
700
+ "hide_output_from_viewers": true
701
+ }
702
+ }
703
+ },
704
+ {
705
+ "cell_type": "code",
706
+ "source": [
707
+ "shap.plots.text(shap_values)"
708
+ ],
709
+ "outputs": [],
710
+ "execution_count": null,
711
+ "metadata": {
712
+ "datalore": {
713
+ "node_id": "TSxvakWLPCpjVMWi9ZdEbd",
714
+ "type": "CODE",
715
+ "hide_input_from_viewers": true,
716
+ "hide_output_from_viewers": true
717
+ }
718
+ }
719
+ }
720
+ ],
721
+ "metadata": {
722
+ "kernelspec": {
723
+ "name": "python3",
724
+ "language": "python",
725
+ "display_name": "Python 3 (ipykernel)"
726
+ },
727
+ "datalore": {
728
+ "computation_mode": "JUPYTER",
729
+ "package_manager": "pip",
730
+ "base_environment": "default",
731
+ "packages": [
732
+ {
733
+ "name": "datasets",
734
+ "version": "2.16.1",
735
+ "source": "PIP"
736
+ },
737
+ {
738
+ "name": "torch",
739
+ "version": "2.1.2",
740
+ "source": "PIP"
741
+ },
742
+ {
743
+ "name": "accelerate",
744
+ "version": "0.26.1",
745
+ "source": "PIP"
746
+ }
747
+ ],
748
+ "report_row_ids": [
749
+ "un8W7ez7ZwoGb5Co6nydEV",
750
+ "40nN9Hvgi1clHNV5RAemI5",
751
+ "TgRD90H5NSPpKS41OeXI1w",
752
+ "ZOm5BfUs3h1EGLaUkBGeEB",
753
+ "kOP0CZWNSk6vqE3wkPp7Vc",
754
+ "W4PWcOu2O2pRaZyoE2W80h",
755
+ "RolbOnQLIftk0vy9mIcz5M",
756
+ "8OPhUgbaNJmOdiq5D3a6vK",
757
+ "5Qrt3jSvSrpK6Ne1hS6shL",
758
+ "hTq7nFUrovN5Ao4u6dIYWZ",
759
+ "I8WNZLpJ1DVP2wiCW7YBIB",
760
+ "SawhU3I9BewSE1XBPstpNJ",
761
+ "80EtLEl2FIE4FqbWnUD3nT"
762
+ ],
763
+ "version": 3
764
+ },
765
+ "microsoft": {
766
+ "ms_spell_check": {
767
+ "ms_spell_check_language": "en"
768
+ }
769
+ },
770
+ "language_info": {
771
+ "name": "python",
772
+ "version": "3.8.5",
773
+ "mimetype": "text/x-python",
774
+ "codemirror_mode": {
775
+ "name": "ipython",
776
+ "version": 3
777
+ },
778
+ "pygments_lexer": "ipython3",
779
+ "nbconvert_exporter": "python",
780
+ "file_extension": ".py"
781
+ },
782
+ "nteract": {
783
+ "version": "nteract-front-end@1.0.0"
784
+ }
785
+ },
786
+ "nbformat": 4,
787
+ "nbformat_minor": 4
788
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-13-2-30Z.ipynb ADDED
@@ -0,0 +1,1147 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
7
+ "\n",
8
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
9
+ ],
10
+ "metadata": {
11
+ "collapsed": false
12
+ }
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "source": [
17
+ "%pip install accelerate -U"
18
+ ],
19
+ "outputs": [
20
+ {
21
+ "output_type": "stream",
22
+ "name": "stdout",
23
+ "text": "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\nRequirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\nRequirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\nRequirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\nRequirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\nRequirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\nRequirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\nRequirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\nNote: you may need to restart the kernel to use updated packages.\n"
24
+ }
25
+ ],
26
+ "execution_count": 1,
27
+ "metadata": {
28
+ "jupyter": {
29
+ "source_hidden": false,
30
+ "outputs_hidden": false
31
+ },
32
+ "nteract": {
33
+ "transient": {
34
+ "deleting": false
35
+ }
36
+ }
37
+ }
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "source": [
42
+ "%pip install transformers datasets shap watermark wandb"
43
+ ],
44
+ "outputs": [
45
+ {
46
+ "output_type": "stream",
47
+ "name": "stdout",
48
+ "text": "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\nRequirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\nRequirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\nRequirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\nRequirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\nRequirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\nRequirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\nRequirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\nRequirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\nRequirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\nRequirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\nRequirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\nRequirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\nRequirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\nRequirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\nRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\nRequirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\nRequirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\nRequirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\nRequirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\nRequirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\nRequirement already satisfied: slicer==0.0.7 in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\nRequirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\nRequirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\nRequirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\nRequirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\nRequirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\nRequirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\nRequirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\nRequirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\nRequirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\nRequirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\nRequirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\nRequirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\nRequirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\nRequirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\nRequirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\nRequirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\nRequirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\nRequirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\nRequirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\nRequirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\nRequirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\nRequirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\nRequirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\nRequirement already 
satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\nRequirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\nRequirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\nRequirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\nRequirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\nRequirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\nRequirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\nRequirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\nRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\nRequirement already satisfied: python-dateutil>=2.8.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\nRequirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\nRequirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\nRequirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\nRequirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\nRequirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\nRequirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\nRequirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) 
(2.2.1)\nRequirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\nRequirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\nNote: you may need to restart the kernel to use updated packages.\n"
49
+ }
50
+ ],
51
+ "execution_count": 2,
52
+ "metadata": {
53
+ "jupyter": {
54
+ "source_hidden": false,
55
+ "outputs_hidden": false
56
+ },
57
+ "nteract": {
58
+ "transient": {
59
+ "deleting": false
60
+ }
61
+ }
62
+ }
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "source": [
67
+ "import pandas as pd\n",
68
+ "import numpy as np\n",
69
+ "import torch\n",
70
+ "import os\n",
71
+ "from typing import List\n",
72
+ "from sklearn.metrics import f1_score, accuracy_score, classification_report\n",
73
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline\n",
74
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
75
+ "from pyarrow import Table\n",
76
+ "import shap\n",
77
+ "\n",
78
+ "%load_ext watermark"
79
+ ],
80
+ "outputs": [
81
+ {
82
+ "output_type": "stream",
83
+ "name": "stderr",
84
+ "text": "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\n2024-01-28 04:14:37.393442: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\nTo enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n2024-01-28 04:14:38.436146: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n2024-01-28 04:14:38.436275: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n2024-01-28 04:14:38.436289: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
85
+ }
86
+ ],
87
+ "execution_count": 3,
88
+ "metadata": {
89
+ "datalore": {
90
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
91
+ "type": "CODE",
92
+ "hide_input_from_viewers": false,
93
+ "hide_output_from_viewers": false,
94
+ "report_properties": {
95
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
96
+ }
97
+ },
98
+ "gather": {
99
+ "logged": 1706415280692
100
+ }
101
+ }
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "source": [
106
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
107
+ "\n",
108
+ "SEED: int = 42\n",
109
+ "\n",
110
+ "BATCH_SIZE: int = 8\n",
111
+ "EPOCHS: int = 1\n",
112
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
113
+ "\n",
114
+ "CLASS_NAMES: List[str] = [\"DIED\",\n",
115
+ " \"ER_VISIT\",\n",
116
+ " \"HOSPITAL\",\n",
117
+ " \"OFC_VISIT\",\n",
118
+ " #\"X_STAY\", # pruned\n",
119
+ " #\"DISABLE\", # pruned\n",
120
+ " #\"D_PRESENTED\" # pruned\n",
121
+ " ]\n",
122
+ "\n",
123
+ "\n",
124
+ "\n",
125
+ "\n",
126
+ "# WandB configuration\n",
127
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA model training\" # name your W&B project\n",
128
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints"
129
+ ],
130
+ "outputs": [],
131
+ "execution_count": 4,
132
+ "metadata": {
133
+ "collapsed": false,
134
+ "gather": {
135
+ "logged": 1706415281102
136
+ }
137
+ }
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "source": [
142
+ "%watermark --iversion"
143
+ ],
144
+ "outputs": [
145
+ {
146
+ "output_type": "stream",
147
+ "name": "stdout",
148
+ "text": "re : 2.2.1\nlogging: 0.5.1.2\nnumpy : 1.23.5\nshap : 0.44.1\npandas : 2.0.2\ntorch : 1.12.0\n\n"
149
+ }
150
+ ],
151
+ "execution_count": 5,
152
+ "metadata": {
153
+ "collapsed": false
154
+ }
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "source": [
159
+ "!nvidia-smi"
160
+ ],
161
+ "outputs": [
162
+ {
163
+ "output_type": "stream",
164
+ "name": "stdout",
165
+ "text": "Sun Jan 28 04:14:40 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. |\r\n|=========================================+======================+======================|\r\n| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n| N/A 29C P0 37W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n| N/A 28C P0 36W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n"
166
+ }
167
+ ],
168
+ "execution_count": 6,
169
+ "metadata": {
170
+ "datalore": {
171
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
172
+ "type": "CODE",
173
+ "hide_input_from_viewers": true,
174
+ "hide_output_from_viewers": true
175
+ }
176
+ }
177
+ },
178
+ {
179
+ "attachments": {},
180
+ "cell_type": "markdown",
181
+ "source": [
182
+ "## Loading the data set"
183
+ ],
184
+ "metadata": {
185
+ "datalore": {
186
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
187
+ "type": "MD",
188
+ "hide_input_from_viewers": false,
189
+ "hide_output_from_viewers": false,
190
+ "report_properties": {
191
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
192
+ }
193
+ }
194
+ }
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "source": [
199
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
200
+ ],
201
+ "outputs": [],
202
+ "execution_count": 7,
203
+ "metadata": {
204
+ "collapsed": false,
205
+ "gather": {
206
+ "logged": 1706415283301
207
+ }
208
+ }
209
+ },
210
+ {
211
+ "cell_type": "markdown",
212
+ "source": [
213
+ "We prune things down to the first four keys: `DIED`, `ER_VISIT`, `HOSPITAL`, `OFC_VISIT`."
214
+ ],
215
+ "metadata": {
216
+ "nteract": {
217
+ "transient": {
218
+ "deleting": false
219
+ }
220
+ }
221
+ }
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "source": [
226
+ "ds = DatasetDict()\n",
227
+ "\n",
228
+ "for i in [\"test\", \"train\", \"val\"]:\n",
229
+ " tab = Table.from_arrays([dataset[i][\"id\"], dataset[i][\"text\"], [i[:4] for i in dataset[i][\"labels\"]]], names=[\"id\", \"text\", \"labels\"])\n",
230
+ " ds[i] = Dataset(tab)\n",
231
+ "\n",
232
+ "dataset = ds"
233
+ ],
234
+ "outputs": [],
235
+ "execution_count": 8,
236
+ "metadata": {
237
+ "jupyter": {
238
+ "source_hidden": false,
239
+ "outputs_hidden": false
240
+ },
241
+ "nteract": {
242
+ "transient": {
243
+ "deleting": false
244
+ }
245
+ },
246
+ "gather": {
247
+ "logged": 1706415283944
248
+ }
249
+ }
250
+ },
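+ {
+ "cell_type": "markdown",
+ "source": [
+ "The `[:4]` slice keeps only the first four positions of each multi-hot label vector, matching the four classes retained in `CLASS_NAMES`. With made-up values:"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Illustrative only: a hypothetical seven-outcome multi-hot vector pruned to four\n",
+ "[1, 0, 0, 1, 0, 1, 0][:4]"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ }
+ },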
251
+ {
252
+ "cell_type": "markdown",
253
+ "source": [
254
+ "### Tokenisation and encoding"
255
+ ],
256
+ "metadata": {
257
+ "collapsed": false
258
+ }
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "source": [
263
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
264
+ ],
265
+ "outputs": [],
266
+ "execution_count": 9,
267
+ "metadata": {
268
+ "datalore": {
269
+ "node_id": "I7n646PIscsUZRoHu6m7zm",
270
+ "type": "CODE",
271
+ "hide_input_from_viewers": true,
272
+ "hide_output_from_viewers": true
273
+ },
274
+ "gather": {
275
+ "logged": 1706415284206
276
+ }
277
+ }
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "source": [
282
+ "def tokenize_and_encode(examples):\n",
283
+ " return tokenizer(examples[\"text\"], truncation=True)"
284
+ ],
285
+ "outputs": [],
286
+ "execution_count": 10,
287
+ "metadata": {
288
+ "datalore": {
289
+ "node_id": "QBLOSI0yVIslV7v7qX9ZC3",
290
+ "type": "CODE",
291
+ "hide_input_from_viewers": true,
292
+ "hide_output_from_viewers": true
293
+ },
294
+ "gather": {
295
+ "logged": 1706415284614
296
+ }
297
+ }
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "source": [
302
+ "cols = dataset[\"train\"].column_names\n",
303
+ "cols.remove(\"labels\")\n",
304
+ "ds_enc = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)"
305
+ ],
306
+ "outputs": [
307
+ {
308
+ "output_type": "stream",
309
+ "name": "stderr",
310
+ "text": "Map: 100%|██████████| 15786/15786 [00:01<00:00, 10213.76 examples/s]\nMap: 100%|██████████| 73667/73667 [00:07<00:00, 10215.55 examples/s]\nMap: 100%|██████████| 15785/15785 [00:01<00:00, 10172.52 examples/s]\n"
311
+ }
312
+ ],
313
+ "execution_count": 11,
314
+ "metadata": {
315
+ "datalore": {
316
+ "node_id": "slHeNysZOX9uWS9PB7jFDb",
317
+ "type": "CODE",
318
+ "hide_input_from_viewers": true,
319
+ "hide_output_from_viewers": true
320
+ },
321
+ "gather": {
322
+ "logged": 1706415294450
323
+ }
324
+ }
325
+ },
326
+ {
327
+ "cell_type": "markdown",
328
+ "source": [
329
+ "### Training"
330
+ ],
331
+ "metadata": {
332
+ "collapsed": false
333
+ }
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "source": [
338
+ "class MultiLabelTrainer(Trainer):\n",
339
+ " def compute_loss(self, model, inputs, return_outputs=False):\n",
340
+ " labels = inputs.pop(\"labels\")\n",
341
+ " outputs = model(**inputs)\n",
342
+ " logits = outputs.logits\n",
343
+ " loss_fct = torch.nn.BCEWithLogitsLoss()\n",
344
+ " loss = loss_fct(logits.view(-1, self.model.config.num_labels),\n",
345
+ " labels.float().view(-1, self.model.config.num_labels))\n",
346
+ " return (loss, outputs) if return_outputs else loss"
347
+ ],
348
+ "outputs": [],
349
+ "execution_count": 12,
350
+ "metadata": {
351
+ "datalore": {
352
+ "node_id": "itXWkbDw9sqbkMuDP84QoT",
353
+ "type": "CODE",
354
+ "hide_input_from_viewers": true,
355
+ "hide_output_from_viewers": true
356
+ },
357
+ "gather": {
358
+ "logged": 1706415294807
359
+ }
360
+ }
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "source": [
365
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(CLASS_NAMES)).to(\"cuda\")"
366
+ ],
367
+ "outputs": [
368
+ {
369
+ "output_type": "stream",
370
+ "name": "stderr",
371
+ "text": "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
372
+ }
373
+ ],
374
+ "execution_count": 13,
375
+ "metadata": {
376
+ "datalore": {
377
+ "node_id": "ZQU7aW6TV45VmhHOQRzcnF",
378
+ "type": "CODE",
379
+ "hide_input_from_viewers": true,
380
+ "hide_output_from_viewers": true
381
+ },
382
+ "gather": {
383
+ "logged": 1706415296683
384
+ }
385
+ }
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "source": [
390
+ "def accuracy_threshold(y_pred, y_true, threshold=.5, sigmoid=True):\n",
391
+ " y_pred = torch.from_numpy(y_pred)\n",
392
+ " y_true = torch.from_numpy(y_true)\n",
393
+ "\n",
394
+ " if sigmoid:\n",
395
+ " y_pred = y_pred.sigmoid()\n",
396
+ "\n",
397
+ " return ((y_pred > threshold) == y_true.bool()).float().mean().item()"
398
+ ],
399
+ "outputs": [],
400
+ "execution_count": 14,
401
+ "metadata": {
402
+ "datalore": {
403
+ "node_id": "swhgyyyxoGL8HjnXJtMuSW",
404
+ "type": "CODE",
405
+ "hide_input_from_viewers": true,
406
+ "hide_output_from_viewers": true
407
+ },
408
+ "gather": {
409
+ "logged": 1706415296937
410
+ }
411
+ }
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "source": [
416
+ "def compute_metrics(eval_pred):\n",
417
+ " predictions, labels = eval_pred\n",
418
+ " return {'accuracy_thresh': accuracy_threshold(predictions, labels)}"
419
+ ],
420
+ "outputs": [],
421
+ "execution_count": 15,
422
+ "metadata": {
423
+ "datalore": {
424
+ "node_id": "1Uq3HtkaBxtHNAnSwit5cI",
425
+ "type": "CODE",
426
+ "hide_input_from_viewers": true,
427
+ "hide_output_from_viewers": true
428
+ },
429
+ "gather": {
430
+ "logged": 1706415297280
431
+ }
432
+ }
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "source": [
437
+ "args = TrainingArguments(\n",
438
+ " output_dir=\"vaers\",\n",
439
+ " evaluation_strategy=\"epoch\",\n",
440
+ " learning_rate=2e-5,\n",
441
+ " per_device_train_batch_size=BATCH_SIZE,\n",
442
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
443
+ " num_train_epochs=EPOCHS,\n",
444
+ " weight_decay=.01,\n",
445
+ " report_to=[\"wandb\"]\n",
446
+ ")"
447
+ ],
448
+ "outputs": [],
449
+ "execution_count": 16,
450
+ "metadata": {
451
+ "datalore": {
452
+ "node_id": "1iPZOTKPwSkTgX5dORqT89",
453
+ "type": "CODE",
454
+ "hide_input_from_viewers": true,
455
+ "hide_output_from_viewers": true
456
+ },
457
+ "gather": {
458
+ "logged": 1706415297551
459
+ }
460
+ }
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "source": [
465
+ "multi_label_trainer = MultiLabelTrainer(\n",
466
+ " model, \n",
467
+ " args, \n",
468
+ " train_dataset=ds_enc[\"train\"], \n",
469
+ " eval_dataset=ds_enc[\"test\"], \n",
470
+ " compute_metrics=compute_metrics, \n",
471
+ " tokenizer=tokenizer\n",
472
+ ")"
473
+ ],
474
+ "outputs": [
475
+ {
476
+ "output_type": "stream",
477
+ "name": "stderr",
478
+ "text": "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
479
+ }
480
+ ],
481
+ "execution_count": 17,
482
+ "metadata": {
483
+ "datalore": {
484
+ "node_id": "bnRkNvRYltLun6gCEgL7v0",
485
+ "type": "CODE",
486
+ "hide_input_from_viewers": true,
487
+ "hide_output_from_viewers": true
488
+ },
489
+ "gather": {
490
+ "logged": 1706415297795
491
+ }
492
+ }
493
+ },
494
+ {
495
+ "cell_type": "code",
496
+ "source": [
497
+ "multi_label_trainer.evaluate()"
498
+ ],
499
+ "outputs": [
500
+ {
501
+ "output_type": "display_data",
502
+ "data": {
503
+ "text/plain": "<IPython.core.display.HTML object>",
504
+ "text/html": "\n <div>\n \n <progress value='987' max='987' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [987/987 01:13]\n </div>\n "
505
+ },
506
+ "metadata": {}
507
+ },
508
+ {
509
+ "output_type": "stream",
510
+ "name": "stderr",
511
+ "text": "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchrisvoncsefalvay\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
512
+ },
513
+ {
514
+ "output_type": "display_data",
515
+ "data": {
516
+ "text/plain": "<IPython.core.display.HTML object>",
517
+ "text/html": "Tracking run with wandb version 0.16.2"
518
+ },
519
+ "metadata": {}
520
+ },
521
+ {
522
+ "output_type": "display_data",
523
+ "data": {
524
+ "text/plain": "<IPython.core.display.HTML object>",
525
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_041615-nnw129w4</code>"
526
+ },
527
+ "metadata": {}
528
+ },
529
+ {
530
+ "output_type": "display_data",
531
+ "data": {
532
+ "text/plain": "<IPython.core.display.HTML object>",
533
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/nnw129w4' target=\"_blank\">grateful-shadow-2</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
534
+ },
535
+ "metadata": {}
536
+ },
537
+ {
538
+ "output_type": "display_data",
539
+ "data": {
540
+ "text/plain": "<IPython.core.display.HTML object>",
541
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>"
542
+ },
543
+ "metadata": {}
544
+ },
545
+ {
546
+ "output_type": "display_data",
547
+ "data": {
548
+ "text/plain": "<IPython.core.display.HTML object>",
549
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/nnw129w4' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/nnw129w4</a>"
550
+ },
551
+ "metadata": {}
552
+ },
553
+ {
554
+ "output_type": "execute_result",
555
+ "execution_count": 18,
556
+ "data": {
557
+ "text/plain": "{'eval_loss': 0.712559163570404,\n 'eval_accuracy_thresh': 0.36481693387031555,\n 'eval_runtime': 76.4156,\n 'eval_samples_per_second': 206.581,\n 'eval_steps_per_second': 12.916}"
558
+ },
559
+ "metadata": {}
560
+ }
561
+ ],
562
+ "execution_count": 18,
563
+ "metadata": {
564
+ "datalore": {
565
+ "node_id": "LO54PlDkWQdFrzV25FvduB",
566
+ "type": "CODE",
567
+ "hide_input_from_viewers": true,
568
+ "hide_output_from_viewers": true
569
+ },
570
+ "gather": {
571
+ "logged": 1706415378024
572
+ }
573
+ }
574
+ },
575
+ {
576
+ "cell_type": "code",
577
+ "source": [
578
+ "multi_label_trainer.train()"
579
+ ],
580
+ "outputs": [
581
+ {
582
+ "output_type": "display_data",
583
+ "data": {
584
+ "text/plain": "<IPython.core.display.HTML object>",
585
+ "text/html": "\n <div>\n \n <progress value='3001' max='4605' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [3001/4605 12:05 < 06:28, 4.13 it/s, Epoch 0.65/1]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
586
+ },
587
+ "metadata": {}
588
+ },
589
+ {
590
+ "output_type": "stream",
591
+ "name": "stderr",
592
+ "text": "Checkpoint destination directory vaers/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-500)... Done. 15.2s\nCheckpoint destination directory vaers/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-1000)... Done. 13.4s\nCheckpoint destination directory vaers/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-1500)... Done. 13.0s\nCheckpoint destination directory vaers/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-2000)... Done. 11.6s\nCheckpoint destination directory vaers/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-2500)... Done. 14.6s\nCheckpoint destination directory vaers/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-3000)... "
593
+ }
594
+ ],
595
+ "execution_count": 19,
596
+ "metadata": {
597
+ "datalore": {
598
+ "node_id": "hf0Ei1QXEYDmBv1VNLZ4Zw",
599
+ "type": "CODE",
600
+ "hide_input_from_viewers": true,
601
+ "hide_output_from_viewers": true
602
+ },
603
+ "gather": {
604
+ "logged": 1706411445752
605
+ }
606
+ }
607
+ },
608
+ {
609
+ "cell_type": "markdown",
610
+ "source": [
611
+ "### Evaluation"
612
+ ],
613
+ "metadata": {
614
+ "collapsed": false
615
+ }
616
+ },
617
+ {
618
+ "cell_type": "markdown",
619
+ "source": [
620
+ "We instantiate a classifier `pipeline` and push it to CUDA."
621
+ ],
622
+ "metadata": {
623
+ "collapsed": false
624
+ }
625
+ },
626
+ {
627
+ "cell_type": "code",
628
+ "source": [
629
+ "classifier = pipeline(\"text-classification\", \n",
630
+ " model, \n",
631
+ " tokenizer=tokenizer, \n",
632
+ " device=\"cuda:0\")"
633
+ ],
634
+ "outputs": [],
635
+ "execution_count": null,
636
+ "metadata": {
637
+ "datalore": {
638
+ "node_id": "kHoUdBeqcyVXDSGv54C4aE",
639
+ "type": "CODE",
640
+ "hide_input_from_viewers": true,
641
+ "hide_output_from_viewers": true
642
+ },
643
+ "gather": {
644
+ "logged": 1706411459928
645
+ }
646
+ }
647
+ },
648
+ {
649
+ "cell_type": "markdown",
650
+ "source": [
651
+ "We use the same tokenizer used for training to tokenize/encode the validation set."
652
+ ],
653
+ "metadata": {
654
+ "collapsed": false
655
+ }
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "source": [
660
+ "test_encodings = tokenizer.batch_encode_plus(dataset[\"val\"][\"text\"], \n",
661
+ " max_length=None, \n",
662
+ " padding='max_length', \n",
663
+ " return_token_type_ids=True, \n",
664
+ " truncation=True)"
665
+ ],
666
+ "outputs": [],
667
+ "execution_count": null,
668
+ "metadata": {
669
+ "datalore": {
670
+ "node_id": "Dr5WCWA6jL51NR1fSrQu6Z",
671
+ "type": "CODE",
672
+ "hide_input_from_viewers": true,
673
+ "hide_output_from_viewers": true
674
+ },
675
+ "gather": {
676
+ "logged": 1706411523285
677
+ }
678
+ }
679
+ },
680
+ {
681
+ "cell_type": "markdown",
682
+ "source": [
683
+ "Once we've made the data loadable by putting it into a `DataLoader`, we "
684
+ ],
685
+ "metadata": {
686
+ "collapsed": false
687
+ }
688
+ },
689
+ {
690
+ "cell_type": "code",
691
+ "source": [
692
+ "test_data = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), \n",
693
+ " torch.tensor(test_encodings['attention_mask']), \n",
694
+ " torch.tensor(ds_enc[\"val\"][\"labels\"]), \n",
695
+ " torch.tensor(test_encodings['token_type_ids']))\n",
696
+ "test_dataloader = torch.utils.data.DataLoader(test_data, \n",
697
+ " sampler=torch.utils.data.SequentialSampler(test_data), \n",
698
+ " batch_size=BATCH_SIZE)"
699
+ ],
700
+ "outputs": [],
701
+ "execution_count": null,
702
+ "metadata": {
703
+ "datalore": {
704
+ "node_id": "MWfGq2tTkJNzFiDoUPq2X7",
705
+ "type": "CODE",
706
+ "hide_input_from_viewers": true,
707
+ "hide_output_from_viewers": true
708
+ },
709
+ "gather": {
710
+ "logged": 1706411543379
711
+ }
712
+ }
713
+ },
714
+ {
715
+ "cell_type": "code",
716
+ "source": [
717
+ "model.eval()\n",
718
+ "\n",
719
+ "logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []\n",
720
+ "\n",
721
+ "for i, batch in enumerate(test_dataloader):\n",
722
+ " batch = tuple(t.to(device) for t in batch)\n",
723
+ " \n",
724
+ " # Unpack the inputs from our dataloader\n",
725
+ " b_input_ids, b_input_mask, b_labels, b_token_types = batch\n",
726
+ " \n",
727
+ " with torch.no_grad():\n",
728
+ " outs = model(b_input_ids, attention_mask=b_input_mask)\n",
729
+ " b_logit_pred = outs[0]\n",
730
+ " pred_label = torch.sigmoid(b_logit_pred)\n",
731
+ "\n",
732
+ " b_logit_pred = b_logit_pred.detach().cpu().numpy()\n",
733
+ " pred_label = pred_label.to('cpu').numpy()\n",
734
+ " b_labels = b_labels.to('cpu').numpy()\n",
735
+ "\n",
736
+ " tokenized_texts.append(b_input_ids)\n",
737
+ " logit_preds.append(b_logit_pred)\n",
738
+ " true_labels.append(b_labels)\n",
739
+ " pred_labels.append(pred_label)\n",
740
+ "\n",
741
+ "# Flatten outputs\n",
742
+ "tokenized_texts = [item for sublist in tokenized_texts for item in sublist]\n",
743
+ "pred_labels = [item for sublist in pred_labels for item in sublist]\n",
744
+ "true_labels = [item for sublist in true_labels for item in sublist]\n",
745
+ "\n",
746
+ "# Converting flattened binary values to boolean values\n",
747
+ "true_bools = [tl == 1 for tl in true_labels]\n",
748
+ "pred_bools = [pl > 0.50 for pl in pred_labels] "
749
+ ],
750
+ "outputs": [],
751
+ "execution_count": null,
752
+ "metadata": {
753
+ "datalore": {
754
+ "node_id": "1SJCSrQTRCexFCNCIyRrzL",
755
+ "type": "CODE",
756
+ "hide_input_from_viewers": true,
757
+ "hide_output_from_viewers": true
758
+ },
759
+ "gather": {
760
+ "logged": 1706411587843
761
+ }
762
+ }
763
+ },
764
+ {
765
+ "cell_type": "markdown",
766
+ "source": [
767
+ "We create a classification report:"
768
+ ],
769
+ "metadata": {
770
+ "collapsed": false
771
+ }
772
+ },
773
+ {
774
+ "cell_type": "code",
775
+ "source": [
776
+ "print('Test micro-F1 score: ', f1_score(true_bools, pred_bools, average='micro'))\n",
777
+ "print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools), '\\n')\n",
778
+ "clf_report = classification_report(true_bools, pred_bools, target_names=CLASS_NAMES)\n",
779
+ "print(clf_report)"
780
+ ],
781
+ "outputs": [],
782
+ "execution_count": null,
783
+ "metadata": {
784
+ "datalore": {
785
+ "node_id": "eBprrgF086mznPbPVBpOLS",
786
+ "type": "CODE",
787
+ "hide_input_from_viewers": true,
788
+ "hide_output_from_viewers": true
789
+ },
790
+ "gather": {
791
+ "logged": 1706411588249
792
+ }
793
+ }
794
+ },
795
+ {
796
+ "cell_type": "markdown",
797
+ "source": [
798
+ "Finally, we render a 'head-to-head' comparison table that maps each report text to its actual and predicted labels."
799
+ ],
800
+ "metadata": {
801
+ "collapsed": false
802
+ }
803
+ },
804
+ {
805
+ "cell_type": "code",
806
+ "source": [
807
+ "# Creating a map of class names from class numbers\n",
808
+ "idx2label = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))"
809
+ ],
810
+ "outputs": [],
811
+ "execution_count": null,
812
+ "metadata": {
813
+ "datalore": {
814
+ "node_id": "yELHY0IEwMlMw3x6e7hoD1",
815
+ "type": "CODE",
816
+ "hide_input_from_viewers": true,
817
+ "hide_output_from_viewers": true
818
+ },
819
+ "gather": {
820
+ "logged": 1706411588638
821
+ }
822
+ }
823
+ },
824
+ {
825
+ "cell_type": "code",
826
+ "source": [
827
+ "true_label_idxs, pred_label_idxs = [], []\n",
828
+ "\n",
829
+ "for vals in true_bools:\n",
830
+ " true_label_idxs.append(np.where(vals)[0].flatten().tolist())\n",
831
+ "for vals in pred_bools:\n",
832
+ " pred_label_idxs.append(np.where(vals)[0].flatten().tolist())"
833
+ ],
834
+ "outputs": [],
835
+ "execution_count": null,
836
+ "metadata": {
837
+ "datalore": {
838
+ "node_id": "jH0S35dDteUch01sa6me6e",
839
+ "type": "CODE",
840
+ "hide_input_from_viewers": true,
841
+ "hide_output_from_viewers": true
842
+ },
843
+ "gather": {
844
+ "logged": 1706411589004
845
+ }
846
+ }
847
+ },
848
+ {
849
+ "cell_type": "code",
850
+ "source": [
851
+ "true_label_texts, pred_label_texts = [], []\n",
852
+ "\n",
853
+ "for vals in true_label_idxs:\n",
854
+ " if vals:\n",
855
+ " true_label_texts.append([idx2label[val] for val in vals])\n",
856
+ " else:\n",
857
+ " true_label_texts.append(vals)\n",
858
+ "\n",
859
+ "for vals in pred_label_idxs:\n",
860
+ " if vals:\n",
861
+ " pred_label_texts.append([idx2label[val] for val in vals])\n",
862
+ " else:\n",
863
+ " pred_label_texts.append(vals)"
864
+ ],
865
+ "outputs": [],
866
+ "execution_count": null,
867
+ "metadata": {
868
+ "datalore": {
869
+ "node_id": "h4vHL8XdGpayZ6xLGJUF6F",
870
+ "type": "CODE",
871
+ "hide_input_from_viewers": true,
872
+ "hide_output_from_viewers": true
873
+ },
874
+ "gather": {
875
+ "logged": 1706411589301
876
+ }
877
+ }
878
+ },
879
+ {
880
+ "cell_type": "code",
881
+ "source": [
882
+ "symptom_texts = [tokenizer.decode(text,\n",
883
+ " skip_special_tokens=True,\n",
884
+ " clean_up_tokenization_spaces=False) for text in tokenized_texts]"
885
+ ],
886
+ "outputs": [],
887
+ "execution_count": null,
888
+ "metadata": {
889
+ "datalore": {
890
+ "node_id": "SxUmVHfQISEeptg1SawOmB",
891
+ "type": "CODE",
892
+ "hide_input_from_viewers": true,
893
+ "hide_output_from_viewers": true
894
+ },
895
+ "gather": {
896
+ "logged": 1706411591952
897
+ }
898
+ }
899
+ },
900
+ {
901
+ "cell_type": "code",
902
+ "source": [
903
+ "comparisons_df = pd.DataFrame({'symptom_text': symptom_texts, \n",
904
+ " 'true_labels': true_label_texts, \n",
905
+ " 'pred_labels':pred_label_texts})\n",
906
+ "comparisons_df.to_csv('comparisons.csv')\n",
907
+ "comparisons_df"
908
+ ],
909
+ "outputs": [],
910
+ "execution_count": null,
911
+ "metadata": {
912
+ "datalore": {
913
+ "node_id": "BxFNigNGRLTOqraI55BPSH",
914
+ "type": "CODE",
915
+ "hide_input_from_viewers": true,
916
+ "hide_output_from_viewers": true
917
+ },
918
+ "gather": {
919
+ "logged": 1706411592512
920
+ }
921
+ }
922
+ },
923
+ {
924
+ "cell_type": "markdown",
925
+ "source": [
926
+ "### Shapley analysis"
927
+ ],
928
+ "metadata": {
929
+ "collapsed": false
930
+ }
931
+ },
932
+ {
933
+ "cell_type": "code",
934
+ "source": [
935
+ "explainer = shap.Explainer(classifier, output_names=CLASS_NAMES)"
936
+ ],
937
+ "outputs": [],
938
+ "execution_count": null,
939
+ "metadata": {
940
+ "datalore": {
941
+ "node_id": "OpdZcoenX2HwzLdai7K5UA",
942
+ "type": "CODE",
943
+ "hide_input_from_viewers": true,
944
+ "hide_output_from_viewers": true
945
+ },
946
+ "gather": {
947
+ "logged": 1706415109071
948
+ }
949
+ }
950
+ },
951
+ {
952
+ "cell_type": "markdown",
953
+ "source": [
954
+ "#### Sampling correct predictions\n",
955
+ "\n",
956
+ "First, let's look at some correct predictions of deaths:"
957
+ ],
958
+ "metadata": {
959
+ "nteract": {
960
+ "transient": {
961
+ "deleting": false
962
+ }
963
+ }
964
+ }
965
+ },
966
+ {
967
+ "cell_type": "code",
968
+ "source": [
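+ "# Keep rows whose true label is exactly ['DIED']; note this filters on the true label only, not on prediction correctness\n",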
969
+ "correct_death_predictions = comparisons_df[comparisons_df['true_labels'].astype(str) == \"['DIED']\"]"
970
+ ],
971
+ "outputs": [],
972
+ "execution_count": null,
973
+ "metadata": {
974
+ "jupyter": {
975
+ "source_hidden": false,
976
+ "outputs_hidden": false
977
+ },
978
+ "nteract": {
979
+ "transient": {
980
+ "deleting": false
981
+ }
982
+ },
983
+ "gather": {
984
+ "logged": 1706414973990
985
+ }
986
+ }
987
+ },
988
+ {
989
+ "cell_type": "code",
990
+ "source": [
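+ "# Sample six of these reports and truncate each to its first 512 characters\n",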
991
+ "texts = [i[:512] for i in correct_death_predictions.sample(n=6).symptom_text]\n",
992
+ "idxs = [i for i in range(len(texts))]\n",
993
+ "\n",
994
+ "d_s = Dataset(Table.from_arrays([idxs, texts], names=[\"idx\", \"texts\"]))"
995
+ ],
996
+ "outputs": [],
997
+ "execution_count": null,
998
+ "metadata": {
999
+ "jupyter": {
1000
+ "source_hidden": false,
1001
+ "outputs_hidden": false
1002
+ },
1003
+ "nteract": {
1004
+ "transient": {
1005
+ "deleting": false
1006
+ }
1007
+ },
1008
+ "gather": {
1009
+ "logged": 1706415114683
1010
+ }
1011
+ }
1012
+ },
1013
+ {
1014
+ "cell_type": "code",
1015
+ "source": [
1016
+ "shap_values = explainer(d_s[\"texts\"])"
1017
+ ],
1018
+ "outputs": [],
1019
+ "execution_count": null,
1020
+ "metadata": {
1021
+ "jupyter": {
1022
+ "source_hidden": false,
1023
+ "outputs_hidden": false
1024
+ },
1025
+ "nteract": {
1026
+ "transient": {
1027
+ "deleting": false
1028
+ }
1029
+ },
1030
+ "gather": {
1031
+ "logged": 1706415129229
1032
+ }
1033
+ }
1034
+ },
1035
+ {
1036
+ "cell_type": "code",
1037
+ "source": [
1038
+ "shap.plots.text(shap_values)"
1039
+ ],
1040
+ "outputs": [],
1041
+ "execution_count": null,
1042
+ "metadata": {
1043
+ "jupyter": {
1044
+ "source_hidden": false,
1045
+ "outputs_hidden": false
1046
+ },
1047
+ "nteract": {
1048
+ "transient": {
1049
+ "deleting": false
1050
+ }
1051
+ },
1052
+ "gather": {
1053
+ "logged": 1706415151494
1054
+ }
1055
+ }
1056
+ },
1057
+ {
1058
+ "cell_type": "code",
1059
+ "source": [],
1060
+ "outputs": [],
1061
+ "execution_count": null,
1062
+ "metadata": {
1063
+ "jupyter": {
1064
+ "source_hidden": false,
1065
+ "outputs_hidden": false
1066
+ },
1067
+ "nteract": {
1068
+ "transient": {
1069
+ "deleting": false
1070
+ }
1071
+ }
1072
+ }
1073
+ }
1074
+ ],
1075
+ "metadata": {
1076
+ "kernelspec": {
1077
+ "name": "python3",
1078
+ "language": "python",
1079
+ "display_name": "Python 3 (ipykernel)"
1080
+ },
1081
+ "datalore": {
1082
+ "computation_mode": "JUPYTER",
1083
+ "package_manager": "pip",
1084
+ "base_environment": "default",
1085
+ "packages": [
1086
+ {
1087
+ "name": "datasets",
1088
+ "version": "2.16.1",
1089
+ "source": "PIP"
1090
+ },
1091
+ {
1092
+ "name": "torch",
1093
+ "version": "2.1.2",
1094
+ "source": "PIP"
1095
+ },
1096
+ {
1097
+ "name": "accelerate",
1098
+ "version": "0.26.1",
1099
+ "source": "PIP"
1100
+ }
1101
+ ],
1102
+ "report_row_ids": [
1103
+ "un8W7ez7ZwoGb5Co6nydEV",
1104
+ "40nN9Hvgi1clHNV5RAemI5",
1105
+ "TgRD90H5NSPpKS41OeXI1w",
1106
+ "ZOm5BfUs3h1EGLaUkBGeEB",
1107
+ "kOP0CZWNSk6vqE3wkPp7Vc",
1108
+ "W4PWcOu2O2pRaZyoE2W80h",
1109
+ "RolbOnQLIftk0vy9mIcz5M",
1110
+ "8OPhUgbaNJmOdiq5D3a6vK",
1111
+ "5Qrt3jSvSrpK6Ne1hS6shL",
1112
+ "hTq7nFUrovN5Ao4u6dIYWZ",
1113
+ "I8WNZLpJ1DVP2wiCW7YBIB",
1114
+ "SawhU3I9BewSE1XBPstpNJ",
1115
+ "80EtLEl2FIE4FqbWnUD3nT"
1116
+ ],
1117
+ "version": 3
1118
+ },
1119
+ "microsoft": {
1120
+ "ms_spell_check": {
1121
+ "ms_spell_check_language": "en"
1122
+ },
1123
+ "host": {
1124
+ "AzureML": {
1125
+ "notebookHasBeenCompleted": true
1126
+ }
1127
+ }
1128
+ },
1129
+ "language_info": {
1130
+ "name": "python",
1131
+ "version": "3.8.5",
1132
+ "mimetype": "text/x-python",
1133
+ "codemirror_mode": {
1134
+ "name": "ipython",
1135
+ "version": 3
1136
+ },
1137
+ "pygments_lexer": "ipython3",
1138
+ "nbconvert_exporter": "python",
1139
+ "file_extension": ".py"
1140
+ },
1141
+ "nteract": {
1142
+ "version": "nteract-front-end@1.0.0"
1143
+ }
1144
+ },
1145
+ "nbformat": 4,
1146
+ "nbformat_minor": 4
1147
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-15-7-36Z.ipynb ADDED
@@ -0,0 +1,1452 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
8
+ "\n",
9
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Designed to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "metadata": {
16
+ "nteract": {
17
+ "transient": {
18
+ "deleting": false
19
+ }
20
+ },
21
+ "tags": []
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "# %pip install accelerate -U"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 2,
31
+ "metadata": {
32
+ "nteract": {
33
+ "transient": {
34
+ "deleting": false
35
+ }
36
+ }
37
+ },
38
+ "outputs": [],
39
+ "source": [
40
+ "# %pip install transformers datasets shap watermark wandb"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 3,
46
+ "metadata": {
47
+ "datalore": {
48
+ "hide_input_from_viewers": false,
49
+ "hide_output_from_viewers": false,
50
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
51
+ "report_properties": {
52
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
53
+ },
54
+ "type": "CODE"
55
+ },
56
+ "gather": {
57
+ "logged": 1706449625034
58
+ },
59
+ "tags": []
60
+ },
61
+ "outputs": [
62
+ {
63
+ "name": "stderr",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
67
+ " from .autonotebook import tqdm as notebook_tqdm\n",
68
+ "2024-01-28 14:18:31.729214: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
69
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
70
+ "2024-01-28 14:18:32.746966: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
71
+ "2024-01-28 14:18:32.747096: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
72
+ "2024-01-28 14:18:32.747111: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "import pandas as pd\n",
78
+ "import numpy as np\n",
79
+ "import torch\n",
80
+ "import os\n",
81
+ "from typing import List\n",
82
+ "from sklearn.metrics import f1_score, accuracy_score, classification_report\n",
83
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline\n",
84
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
85
+ "from pyarrow import Table\n",
86
+ "import shap\n",
87
+ "import wandb\n",
88
+ "\n",
89
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
90
+ "\n",
91
+ "%load_ext watermark"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 4,
97
+ "metadata": {
98
+ "collapsed": false,
99
+ "gather": {
100
+ "logged": 1706449721319
101
+ },
102
+ "jupyter": {
103
+ "outputs_hidden": false
104
+ }
105
+ },
106
+ "outputs": [],
107
+ "source": [
108
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
109
+ "\n",
110
+ "SEED: int = 42\n",
111
+ "\n",
112
+ "BATCH_SIZE: int = 16\n",
113
+ "EPOCHS: int = 3\n",
114
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
115
+ "\n",
116
+ "CLASS_NAMES: List[str] = [\"DIED\",\n",
117
+ " \"ER_VISIT\",\n",
118
+ " \"HOSPITAL\",\n",
119
+ " \"OFC_VISIT\",\n",
120
+ " #\"X_STAY\", # pruned\n",
121
+ " #\"DISABLE\", # pruned\n",
122
+ " #\"D_PRESENTED\" # pruned\n",
123
+ " ]\n",
124
+ "\n",
125
+ "\n",
126
+ "\n",
127
+ "\n",
128
+ "# WandB configuration\n",
129
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA model training\" # name your W&B project\n",
130
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
131
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": 5,
137
+ "metadata": {
138
+ "collapsed": false,
139
+ "jupyter": {
140
+ "outputs_hidden": false
141
+ }
142
+ },
143
+ "outputs": [
144
+ {
145
+ "name": "stdout",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "torch : 1.12.0\n",
149
+ "pandas : 2.0.2\n",
150
+ "numpy : 1.23.5\n",
151
+ "shap : 0.44.1\n",
152
+ "re : 2.2.1\n",
153
+ "wandb : 0.16.2\n",
154
+ "logging: 0.5.1.2\n",
155
+ "\n"
156
+ ]
157
+ }
158
+ ],
159
+ "source": [
160
+ "%watermark --iversion"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": 6,
166
+ "metadata": {
167
+ "datalore": {
168
+ "hide_input_from_viewers": true,
169
+ "hide_output_from_viewers": true,
170
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
171
+ "type": "CODE"
172
+ }
173
+ },
174
+ "outputs": [
175
+ {
176
+ "name": "stdout",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "Sun Jan 28 14:18:35 2024 \n",
180
+ "+---------------------------------------------------------------------------------------+\n",
181
+ "| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\n",
182
+ "|-----------------------------------------+----------------------+----------------------+\n",
183
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
184
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
185
+ "| | | MIG M. |\n",
186
+ "|=========================================+======================+======================|\n",
187
+ "| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\n",
188
+ "| N/A 29C P0 27W / 250W | 4MiB / 16384MiB | 0% Default |\n",
189
+ "| | | N/A |\n",
190
+ "+-----------------------------------------+----------------------+----------------------+\n",
191
+ "| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\n",
192
+ "| N/A 29C P0 24W / 250W | 4MiB / 16384MiB | 0% Default |\n",
193
+ "| | | N/A |\n",
194
+ "+-----------------------------------------+----------------------+----------------------+\n",
195
+ " \n",
196
+ "+---------------------------------------------------------------------------------------+\n",
197
+ "| Processes: |\n",
198
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
199
+ "| ID ID Usage |\n",
200
+ "|=======================================================================================|\n",
201
+ "| No running processes found |\n",
202
+ "+---------------------------------------------------------------------------------------+\n"
203
+ ]
204
+ }
205
+ ],
206
+ "source": [
207
+ "!nvidia-smi"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "markdown",
212
+ "metadata": {
213
+ "datalore": {
214
+ "hide_input_from_viewers": false,
215
+ "hide_output_from_viewers": false,
216
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
217
+ "report_properties": {
218
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
219
+ },
220
+ "type": "MD"
221
+ }
222
+ },
223
+ "source": [
224
+ "## Loading the data set"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 7,
230
+ "metadata": {
231
+ "collapsed": false,
232
+ "gather": {
233
+ "logged": 1706449040507
234
+ },
235
+ "jupyter": {
236
+ "outputs_hidden": false
237
+ }
238
+ },
239
+ "outputs": [],
240
+ "source": [
241
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 8,
247
+ "metadata": {
248
+ "collapsed": false,
249
+ "gather": {
250
+ "logged": 1706449044205
251
+ },
252
+ "jupyter": {
253
+ "outputs_hidden": false,
254
+ "source_hidden": false
255
+ },
256
+ "nteract": {
257
+ "transient": {
258
+ "deleting": false
259
+ }
260
+ }
261
+ },
262
+ "outputs": [
263
+ {
264
+ "data": {
265
+ "text/plain": [
266
+ "DatasetDict({\n",
267
+ " train: Dataset({\n",
268
+ " features: ['id', 'text', 'labels'],\n",
269
+ " num_rows: 1270444\n",
270
+ " })\n",
271
+ " test: Dataset({\n",
272
+ " features: ['id', 'text', 'labels'],\n",
273
+ " num_rows: 272238\n",
274
+ " })\n",
275
+ " val: Dataset({\n",
276
+ " features: ['id', 'text', 'labels'],\n",
277
+ " num_rows: 272238\n",
278
+ " })\n",
279
+ "})"
280
+ ]
281
+ },
282
+ "execution_count": 8,
283
+ "metadata": {},
284
+ "output_type": "execute_result"
285
+ }
286
+ ],
287
+ "source": [
288
+ "dataset"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": 9,
294
+ "metadata": {},
295
+ "outputs": [],
296
+ "source": [
297
+ "SUBSAMPLING: float = 0.1"
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": 10,
303
+ "metadata": {
304
+ "collapsed": false,
305
+ "gather": {
306
+ "logged": 1706449378281
307
+ },
308
+ "jupyter": {
309
+ "outputs_hidden": false,
310
+ "source_hidden": false
311
+ },
312
+ "nteract": {
313
+ "transient": {
314
+ "deleting": false
315
+ }
316
+ }
317
+ },
318
+ "outputs": [],
319
+ "source": [
320
+ "def minisample(ds: DatasetDict, fraction: float) -> DatasetDict:\n",
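+ "    # Shuffle each split, then keep the first fraction of rows as a random subsample\n",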
321
+ " res = DatasetDict()\n",
322
+ "\n",
323
+ " res[\"train\"] = Dataset.from_dict(ds[\"train\"].shuffle()[:round(len(ds[\"train\"]) * fraction)])\n",
324
+ " res[\"test\"] = Dataset.from_dict(ds[\"test\"].shuffle()[:round(len(ds[\"test\"]) * fraction)])\n",
325
+ " res[\"val\"] = Dataset.from_dict(ds[\"val\"].shuffle()[:round(len(ds[\"val\"]) * fraction)])\n",
326
+ " \n",
327
+ " return res"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": 11,
333
+ "metadata": {
334
+ "collapsed": false,
335
+ "gather": {
336
+ "logged": 1706449384162
337
+ },
338
+ "jupyter": {
339
+ "outputs_hidden": false,
340
+ "source_hidden": false
341
+ },
342
+ "nteract": {
343
+ "transient": {
344
+ "deleting": false
345
+ }
346
+ }
347
+ },
348
+ "outputs": [],
349
+ "source": [
350
+ "dataset = minisample(dataset, SUBSAMPLING)"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": 12,
356
+ "metadata": {
357
+ "collapsed": false,
358
+ "gather": {
359
+ "logged": 1706449387981
360
+ },
361
+ "jupyter": {
362
+ "outputs_hidden": false,
363
+ "source_hidden": false
364
+ },
365
+ "nteract": {
366
+ "transient": {
367
+ "deleting": false
368
+ }
369
+ }
370
+ },
371
+ "outputs": [
372
+ {
373
+ "data": {
374
+ "text/plain": [
375
+ "DatasetDict({\n",
376
+ " train: Dataset({\n",
377
+ " features: ['id', 'text', 'labels'],\n",
378
+ " num_rows: 127044\n",
379
+ " })\n",
380
+ " test: Dataset({\n",
381
+ " features: ['id', 'text', 'labels'],\n",
382
+ " num_rows: 27224\n",
383
+ " })\n",
384
+ " val: Dataset({\n",
385
+ " features: ['id', 'text', 'labels'],\n",
386
+ " num_rows: 27224\n",
387
+ " })\n",
388
+ "})"
389
+ ]
390
+ },
391
+ "execution_count": 12,
392
+ "metadata": {},
393
+ "output_type": "execute_result"
394
+ }
395
+ ],
396
+ "source": [
397
+ "dataset"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "markdown",
402
+ "metadata": {
403
+ "nteract": {
404
+ "transient": {
405
+ "deleting": false
406
+ }
407
+ }
408
+ },
409
+ "source": [
410
+ "We prune the label set down to its first four classes: `DIED`, `ER_VISIT`, `HOSPITAL`, `OFC_VISIT`."
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": 13,
416
+ "metadata": {
417
+ "collapsed": false,
418
+ "gather": {
419
+ "logged": 1706449443055
420
+ },
421
+ "jupyter": {
422
+ "outputs_hidden": false,
423
+ "source_hidden": false
424
+ },
425
+ "nteract": {
426
+ "transient": {
427
+ "deleting": false
428
+ }
429
+ }
430
+ },
431
+ "outputs": [],
432
+ "source": [
433
+ "ds = DatasetDict()\n",
434
+ "\n",
435
+ "for i in [\"test\", \"train\", \"val\"]:\n",
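+ "    # The comprehension's i shadows the split name; i[:4] keeps the first four entries of each label vector (DIED, ER_VISIT, HOSPITAL, OFC_VISIT)\n",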
436
+ " tab = Table.from_arrays([dataset[i][\"id\"], dataset[i][\"text\"], [i[:4] for i in dataset[i][\"labels\"]]], names=[\"id\", \"text\", \"labels\"])\n",
437
+ " ds[i] = Dataset(tab)\n",
438
+ "\n",
439
+ "dataset = ds"
440
+ ]
441
+ },
442
+ {
443
+ "cell_type": "markdown",
444
+ "metadata": {},
445
+ "source": [
446
+ "### Tokenisation and encoding"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": 14,
452
+ "metadata": {
453
+ "datalore": {
454
+ "hide_input_from_viewers": true,
455
+ "hide_output_from_viewers": true,
456
+ "node_id": "I7n646PIscsUZRoHu6m7zm",
457
+ "type": "CODE"
458
+ },
459
+ "gather": {
460
+ "logged": 1706449638377
461
+ }
462
+ },
463
+ "outputs": [],
464
+ "source": [
465
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 15,
471
+ "metadata": {
472
+ "datalore": {
473
+ "hide_input_from_viewers": true,
474
+ "hide_output_from_viewers": true,
475
+ "node_id": "QBLOSI0yVIslV7v7qX9ZC3",
476
+ "type": "CODE"
477
+ },
478
+ "gather": {
479
+ "logged": 1706449642580
480
+ }
481
+ },
482
+ "outputs": [],
483
+ "source": [
484
+ "def tokenize_and_encode(examples):\n",
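+ "    # Truncate each report to the model's maximum sequence length; padding is applied dynamically at batch time\n",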
485
+ " return tokenizer(examples[\"text\"], truncation=True)"
486
+ ]
487
+ },
488
+ {
489
+ "cell_type": "code",
490
+ "execution_count": 16,
491
+ "metadata": {
492
+ "datalore": {
493
+ "hide_input_from_viewers": true,
494
+ "hide_output_from_viewers": true,
495
+ "node_id": "slHeNysZOX9uWS9PB7jFDb",
496
+ "type": "CODE"
497
+ },
498
+ "gather": {
499
+ "logged": 1706449721161
500
+ }
501
+ },
502
+ "outputs": [
503
+ {
504
+ "name": "stderr",
505
+ "output_type": "stream",
506
+ "text": [
507
+ "Map: 100%|██████████| 27224/27224 [00:10<00:00, 2638.52 examples/s]\n",
508
+ "Map: 100%|██████████| 127044/127044 [00:48<00:00, 2633.40 examples/s]\n",
509
+ "Map: 100%|██████████| 27224/27224 [00:10<00:00, 2613.19 examples/s]\n"
510
+ ]
511
+ }
512
+ ],
513
+ "source": [
514
+ "cols = dataset[\"train\"].column_names\n",
515
+ "cols.remove(\"labels\")\n",
516
+ "ds_enc = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)"
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "markdown",
521
+ "metadata": {},
522
+ "source": [
523
+ "### Training"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": 17,
529
+ "metadata": {
530
+ "datalore": {
531
+ "hide_input_from_viewers": true,
532
+ "hide_output_from_viewers": true,
533
+ "node_id": "itXWkbDw9sqbkMuDP84QoT",
534
+ "type": "CODE"
535
+ },
536
+ "gather": {
537
+ "logged": 1706449743072
538
+ }
539
+ },
540
+ "outputs": [],
541
+ "source": [
542
+ "class MultiLabelTrainer(Trainer):\n",
543
+ " def compute_loss(self, model, inputs, return_outputs=False):\n",
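+ "        # Multi-label objective: binary cross-entropy with logits over all classes, rather than softmax cross-entropy\n",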
544
+ " labels = inputs.pop(\"labels\")\n",
545
+ " outputs = model(**inputs)\n",
546
+ " logits = outputs.logits\n",
547
+ " loss_fct = torch.nn.BCEWithLogitsLoss()\n",
548
+ " loss = loss_fct(logits.view(-1, self.model.config.num_labels),\n",
549
+ " labels.float().view(-1, self.model.config.num_labels))\n",
550
+ " return (loss, outputs) if return_outputs else loss"
551
+ ]
552
+ },
553
+ {
554
+ "cell_type": "code",
555
+ "execution_count": 18,
556
+ "metadata": {
557
+ "datalore": {
558
+ "hide_input_from_viewers": true,
559
+ "hide_output_from_viewers": true,
560
+ "node_id": "ZQU7aW6TV45VmhHOQRzcnF",
561
+ "type": "CODE"
562
+ },
563
+ "gather": {
564
+ "logged": 1706449761205
565
+ }
566
+ },
567
+ "outputs": [
568
+ {
569
+ "name": "stderr",
570
+ "output_type": "stream",
571
+ "text": [
572
+ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
573
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
574
+ ]
575
+ }
576
+ ],
577
+ "source": [
578
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(CLASS_NAMES)).to(\"cuda\")"
579
+ ]
580
+ },
581
+ {
582
+ "cell_type": "code",
583
+ "execution_count": 19,
584
+ "metadata": {
585
+ "datalore": {
586
+ "hide_input_from_viewers": true,
587
+ "hide_output_from_viewers": true,
588
+ "node_id": "swhgyyyxoGL8HjnXJtMuSW",
589
+ "type": "CODE"
590
+ },
591
+ "gather": {
592
+ "logged": 1706449761541
593
+ }
594
+ },
595
+ "outputs": [],
596
+ "source": [
597
+ "def accuracy_threshold(y_pred, y_true, threshold=.5, sigmoid=True):\n",
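+ "    # Share of (sample, class) pairs where the thresholded probability matches the true binary label\n",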
598
+ " y_pred = torch.from_numpy(y_pred)\n",
599
+ " y_true = torch.from_numpy(y_true)\n",
600
+ "\n",
601
+ " if sigmoid:\n",
602
+ " y_pred = y_pred.sigmoid()\n",
603
+ "\n",
604
+ " return ((y_pred > threshold) == y_true.bool()).float().mean().item()"
605
+ ]
606
+ },
607
+ {
608
+ "cell_type": "code",
609
+ "execution_count": 20,
610
+ "metadata": {
611
+ "datalore": {
612
+ "hide_input_from_viewers": true,
613
+ "hide_output_from_viewers": true,
614
+ "node_id": "1Uq3HtkaBxtHNAnSwit5cI",
615
+ "type": "CODE"
616
+ },
617
+ "gather": {
618
+ "logged": 1706449761720
619
+ }
620
+ },
621
+ "outputs": [],
622
+ "source": [
623
+ "def compute_metrics(eval_pred):\n",
624
+ " predictions, labels = eval_pred\n",
625
+ " return {'accuracy_thresh': accuracy_threshold(predictions, labels)}"
626
+ ]
627
+ },
628
+ {
629
+ "cell_type": "code",
630
+ "execution_count": 21,
631
+ "metadata": {
632
+ "datalore": {
633
+ "hide_input_from_viewers": true,
634
+ "hide_output_from_viewers": true,
635
+ "node_id": "1iPZOTKPwSkTgX5dORqT89",
636
+ "type": "CODE"
637
+ },
638
+ "gather": {
639
+ "logged": 1706449761893
640
+ }
641
+ },
642
+ "outputs": [],
643
+ "source": [
644
+ "args = TrainingArguments(\n",
645
+ " output_dir=\"vaers\",\n",
646
+ " evaluation_strategy=\"epoch\",\n",
647
+ " learning_rate=2e-5,\n",
648
+ " per_device_train_batch_size=BATCH_SIZE,\n",
649
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
650
+ " num_train_epochs=EPOCHS,\n",
651
+ " weight_decay=.01,\n",
652
+ " logging_steps=1,\n",
653
+ " run_name=f\"daedra-training\",\n",
654
+ " report_to=[\"wandb\"]\n",
655
+ ")"
656
+ ]
657
+ },
658
+ {
659
+ "cell_type": "code",
660
+ "execution_count": 22,
661
+ "metadata": {
662
+ "datalore": {
663
+ "hide_input_from_viewers": true,
664
+ "hide_output_from_viewers": true,
665
+ "node_id": "bnRkNvRYltLun6gCEgL7v0",
666
+ "type": "CODE"
667
+ },
668
+ "gather": {
669
+ "logged": 1706449769103
670
+ }
671
+ },
672
+ "outputs": [],
673
+ "source": [
674
+ "multi_label_trainer = MultiLabelTrainer(\n",
675
+ " model, \n",
676
+ " args, \n",
677
+ " train_dataset=ds_enc[\"train\"], \n",
678
+ " eval_dataset=ds_enc[\"test\"], \n",
679
+ " compute_metrics=compute_metrics, \n",
680
+ " tokenizer=tokenizer\n",
681
+ ")"
682
+ ]
683
+ },
684
+ {
685
+ "cell_type": "code",
686
+ "execution_count": 23,
687
+ "metadata": {
688
+ "datalore": {
689
+ "hide_input_from_viewers": true,
690
+ "hide_output_from_viewers": true,
691
+ "node_id": "LO54PlDkWQdFrzV25FvduB",
692
+ "type": "CODE"
693
+ },
694
+ "gather": {
695
+ "logged": 1706449880674
696
+ }
697
+ },
698
+ "outputs": [
699
+ {
700
+ "name": "stderr",
701
+ "output_type": "stream",
702
+ "text": [
703
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchrisvoncsefalvay\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
704
+ "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m wandb.init() arguments ignored because wandb magic has already been initialized\n"
705
+ ]
706
+ },
707
+ {
708
+ "data": {
709
+ "text/html": [
710
+ "Tracking run with wandb version 0.16.2"
711
+ ],
712
+ "text/plain": [
713
+ "<IPython.core.display.HTML object>"
714
+ ]
715
+ },
716
+ "metadata": {},
717
+ "output_type": "display_data"
718
+ },
719
+ {
720
+ "data": {
721
+ "text/html": [
722
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_141956-9lniqjvz</code>"
723
+ ],
724
+ "text/plain": [
725
+ "<IPython.core.display.HTML object>"
726
+ ]
727
+ },
728
+ "metadata": {},
729
+ "output_type": "display_data"
730
+ },
731
+ {
732
+ "data": {
733
+ "text/html": [
734
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz' target=\"_blank\">init_evaluation_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
735
+ ],
736
+ "text/plain": [
737
+ "<IPython.core.display.HTML object>"
738
+ ]
739
+ },
740
+ "metadata": {},
741
+ "output_type": "display_data"
742
+ },
743
+ {
744
+ "data": {
745
+ "text/html": [
746
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>"
747
+ ],
748
+ "text/plain": [
749
+ "<IPython.core.display.HTML object>"
750
+ ]
751
+ },
752
+ "metadata": {},
753
+ "output_type": "display_data"
754
+ },
755
+ {
756
+ "data": {
757
+ "text/html": [
758
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz</a>"
759
+ ],
760
+ "text/plain": [
761
+ "<IPython.core.display.HTML object>"
762
+ ]
763
+ },
764
+ "metadata": {},
765
+ "output_type": "display_data"
766
+ },
767
+ {
768
+ "data": {
769
+ "text/html": [
770
+ "Finishing last run (ID:9lniqjvz) before initializing another..."
771
+ ],
772
+ "text/plain": [
773
+ "<IPython.core.display.HTML object>"
774
+ ]
775
+ },
776
+ "metadata": {},
777
+ "output_type": "display_data"
778
+ },
779
+ {
780
+ "data": {
781
+ "text/html": [
782
+ " View run <strong style=\"color:#cdcd00\">init_evaluation_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
783
+ ],
784
+ "text/plain": [
785
+ "<IPython.core.display.HTML object>"
786
+ ]
787
+ },
788
+ "metadata": {},
789
+ "output_type": "display_data"
790
+ },
791
+ {
792
+ "data": {
793
+ "text/html": [
794
+ "Find logs at: <code>./wandb/run-20240128_141956-9lniqjvz/logs</code>"
795
+ ],
796
+ "text/plain": [
797
+ "<IPython.core.display.HTML object>"
798
+ ]
799
+ },
800
+ "metadata": {},
801
+ "output_type": "display_data"
802
+ },
803
+ {
804
+ "data": {
805
+ "text/html": [
806
+ "Successfully finished last run (ID:9lniqjvz). Initializing new run:<br/>"
807
+ ],
808
+ "text/plain": [
809
+ "<IPython.core.display.HTML object>"
810
+ ]
811
+ },
812
+ "metadata": {},
813
+ "output_type": "display_data"
814
+ },
815
+ {
816
+ "data": {
817
+ "text/html": [
818
+ "Tracking run with wandb version 0.16.2"
819
+ ],
820
+ "text/plain": [
821
+ "<IPython.core.display.HTML object>"
822
+ ]
823
+ },
824
+ "metadata": {},
825
+ "output_type": "display_data"
826
+ },
827
+ {
828
+ "data": {
829
+ "text/html": [
830
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_141958-5idmkcie</code>"
831
+ ],
832
+ "text/plain": [
833
+ "<IPython.core.display.HTML object>"
834
+ ]
835
+ },
836
+ "metadata": {},
837
+ "output_type": "display_data"
838
+ },
839
+ {
840
+ "data": {
841
+ "text/html": [
842
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie' target=\"_blank\">init_evaluation_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
843
+ ],
844
+ "text/plain": [
845
+ "<IPython.core.display.HTML object>"
846
+ ]
847
+ },
848
+ "metadata": {},
849
+ "output_type": "display_data"
850
+ },
851
+ {
852
+ "data": {
853
+ "text/html": [
854
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>"
855
+ ],
856
+ "text/plain": [
857
+ "<IPython.core.display.HTML object>"
858
+ ]
859
+ },
860
+ "metadata": {},
861
+ "output_type": "display_data"
862
+ },
863
+ {
864
+ "data": {
865
+ "text/html": [
866
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie</a>"
867
+ ],
868
+ "text/plain": [
869
+ "<IPython.core.display.HTML object>"
870
+ ]
871
+ },
872
+ "metadata": {},
873
+ "output_type": "display_data"
874
+ },
875
+ {
876
+ "data": {
877
+ "text/html": [
878
+ "\n",
879
+ " <div>\n",
880
+ " \n",
881
+ " <progress value='1003' max='851' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
882
+ " [851/851 26:26]\n",
883
+ " </div>\n",
884
+ " "
885
+ ],
886
+ "text/plain": [
887
+ "<IPython.core.display.HTML object>"
888
+ ]
889
+ },
890
+ "metadata": {},
891
+ "output_type": "display_data"
892
+ },
893
+ {
894
+ "data": {
895
+ "text/html": [
896
+ "<style>\n",
897
+ " table.wandb td:nth-child(1) { padding: 0 10px; text-align: left ; width: auto;} td:nth-child(2) {text-align: left ; width: 100%}\n",
898
+ " .wandb-row { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: flex-start; width: 100% }\n",
899
+ " .wandb-col { display: flex; flex-direction: column; flex-basis: 100%; flex: 1; padding: 10px; }\n",
900
+ " </style>\n",
901
+ "<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>eval/accuracy_thresh</td><td>▁</td></tr><tr><td>eval/loss</td><td>▁</td></tr><tr><td>eval/runtime</td><td>▁</td></tr><tr><td>eval/samples_per_second</td><td>▁</td></tr><tr><td>eval/steps_per_second</td><td>▁</td></tr><tr><td>train/global_step</td><td>▁</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>eval/accuracy_thresh</td><td>0.55198</td></tr><tr><td>eval/loss</td><td>0.68442</td></tr><tr><td>eval/runtime</td><td>105.0436</td></tr><tr><td>eval/samples_per_second</td><td>259.168</td></tr><tr><td>eval/steps_per_second</td><td>8.101</td></tr><tr><td>train/global_step</td><td>0</td></tr></table><br/></div></div>"
902
+ ],
903
+ "text/plain": [
904
+ "<IPython.core.display.HTML object>"
905
+ ]
906
+ },
907
+ "metadata": {},
908
+ "output_type": "display_data"
909
+ },
910
+ {
911
+ "data": {
912
+ "text/html": [
913
+ " View run <strong style=\"color:#cdcd00\">init_evaluation_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
914
+ ],
915
+ "text/plain": [
916
+ "<IPython.core.display.HTML object>"
917
+ ]
918
+ },
919
+ "metadata": {},
920
+ "output_type": "display_data"
921
+ },
922
+ {
923
+ "data": {
924
+ "text/html": [
925
+ "Find logs at: <code>./wandb/run-20240128_141958-5idmkcie/logs</code>"
926
+ ],
927
+ "text/plain": [
928
+ "<IPython.core.display.HTML object>"
929
+ ]
930
+ },
931
+ "metadata": {},
932
+ "output_type": "display_data"
933
+ }
934
+ ],
935
+ "source": [
936
+ "if SUBSAMPLING != 1.0:\n",
937
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
938
+ "else:\n",
939
+ "    wandb_tag: List[str] = [\"full_sample\"]\n",
940
+ " \n",
941
+ "wandb.init(name=\"init_evaluation_run\", tags=wandb_tag, magic=True)\n",
942
+ "\n",
943
+ "multi_label_trainer.evaluate()\n",
944
+ "wandb.finish()"
945
+ ]
946
+ },
947
+ {
948
+ "cell_type": "code",
949
+ "execution_count": null,
950
+ "metadata": {
951
+ "datalore": {
952
+ "hide_input_from_viewers": true,
953
+ "hide_output_from_viewers": true,
954
+ "node_id": "hf0Ei1QXEYDmBv1VNLZ4Zw",
955
+ "type": "CODE"
956
+ },
957
+ "gather": {
958
+ "logged": 1706449934637
959
+ }
960
+ },
961
+ "outputs": [
962
+ {
963
+ "data": {
964
+ "text/html": [
965
+ "Tracking run with wandb version 0.16.2"
966
+ ],
967
+ "text/plain": [
968
+ "<IPython.core.display.HTML object>"
969
+ ]
970
+ },
971
+ "metadata": {},
972
+ "output_type": "display_data"
973
+ },
974
+ {
975
+ "data": {
976
+ "text/html": [
977
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_142151-2mcc0ibc</code>"
978
+ ],
979
+ "text/plain": [
980
+ "<IPython.core.display.HTML object>"
981
+ ]
982
+ },
983
+ "metadata": {},
984
+ "output_type": "display_data"
985
+ },
986
+ {
987
+ "data": {
988
+ "text/html": [
989
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/2mcc0ibc' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
990
+ ],
991
+ "text/plain": [
992
+ "<IPython.core.display.HTML object>"
993
+ ]
994
+ },
995
+ "metadata": {},
996
+ "output_type": "display_data"
997
+ },
998
+ {
999
+ "data": {
1000
+ "text/html": [
1001
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>"
1002
+ ],
1003
+ "text/plain": [
1004
+ "<IPython.core.display.HTML object>"
1005
+ ]
1006
+ },
1007
+ "metadata": {},
1008
+ "output_type": "display_data"
1009
+ },
1010
+ {
1011
+ "data": {
1012
+ "text/html": [
1013
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/2mcc0ibc' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/2mcc0ibc</a>"
1014
+ ],
1015
+ "text/plain": [
1016
+ "<IPython.core.display.HTML object>"
1017
+ ]
1018
+ },
1019
+ "metadata": {},
1020
+ "output_type": "display_data"
1021
+ },
1022
+ {
1023
+ "data": {
1024
+ "text/html": [
1025
+ "\n",
1026
+ " <div>\n",
1027
+ " \n",
1028
+ " <progress value='3972' max='11913' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
1029
+ " [ 3972/11913 24:20 < 48:40, 2.72 it/s, Epoch 1/3]\n",
1030
+ " </div>\n",
1031
+ " <table border=\"1\" class=\"dataframe\">\n",
1032
+ " <thead>\n",
1033
+ " <tr style=\"text-align: left;\">\n",
1034
+ " <th>Epoch</th>\n",
1035
+ " <th>Training Loss</th>\n",
1036
+ " <th>Validation Loss</th>\n",
1037
+ " </tr>\n",
1038
+ " </thead>\n",
1039
+ " <tbody>\n",
1040
+ " </tbody>\n",
1041
+ "</table><p>"
1042
+ ],
1043
+ "text/plain": [
1044
+ "<IPython.core.display.HTML object>"
1045
+ ]
1046
+ },
1047
+ "metadata": {},
1048
+ "output_type": "display_data"
1049
+ },
1050
+ {
1051
+ "name": "stderr",
1052
+ "output_type": "stream",
1053
+ "text": [
1054
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-500)... Done. 15.6s\n",
1055
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-1000)... Done. 22.7s\n",
1056
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-1500)... Done. 14.0s\n",
1057
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-2000)... Done. 15.2s\n",
1058
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-2500)... Done. 14.0s\n",
1059
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-3000)... Done. 12.4s\n",
1060
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-3500)... Done. 13.4s\n"
1061
+ ]
1062
+ }
1063
+ ],
1064
+ "source": [
1065
+ "if SUBSAMPLING != 1.0:\n",
1066
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
1067
+ "else:\n",
1068
+ "    wandb_tag: List[str] = [\"full_sample\"]\n",
1069
+ " \n",
1070
+ "wandb.init(name=\"daedra_training_run\", tags=wandb_tag, magic=True)\n",
1071
+ "\n",
1072
+ "multi_label_trainer.train()\n",
1073
+ "wandb.finish()"
1074
+ ]
1075
+ },
1076
+ {
1077
+ "cell_type": "markdown",
1078
+ "metadata": {},
1079
+ "source": [
1080
+ "### Evaluation"
1081
+ ]
1082
+ },
1083
+ {
1084
+ "cell_type": "markdown",
1085
+ "metadata": {},
1086
+ "source": [
1087
+ "We instantiate a classifier `pipeline` and push it to CUDA."
1088
+ ]
1089
+ },
1090
+ {
1091
+ "cell_type": "code",
1092
+ "execution_count": null,
1093
+ "metadata": {
1094
+ "datalore": {
1095
+ "hide_input_from_viewers": true,
1096
+ "hide_output_from_viewers": true,
1097
+ "node_id": "kHoUdBeqcyVXDSGv54C4aE",
1098
+ "type": "CODE"
1099
+ },
1100
+ "gather": {
1101
+ "logged": 1706411459928
1102
+ }
1103
+ },
1104
+ "outputs": [],
1105
+ "source": [
1106
+ "classifier = pipeline(\"text-classification\", \n",
1107
+ " model, \n",
1108
+ " tokenizer=tokenizer, \n",
1109
+ " device=\"cuda:0\")"
1110
+ ]
1111
+ },
1112
+ {
1113
+ "cell_type": "markdown",
1114
+ "metadata": {},
1115
+ "source": [
1116
+ "We use the same tokenizer used for training to tokenize/encode the validation set."
1117
+ ]
1118
+ },
1119
+ {
1120
+ "cell_type": "code",
1121
+ "execution_count": null,
1122
+ "metadata": {
1123
+ "datalore": {
1124
+ "hide_input_from_viewers": true,
1125
+ "hide_output_from_viewers": true,
1126
+ "node_id": "Dr5WCWA6jL51NR1fSrQu6Z",
1127
+ "type": "CODE"
1128
+ },
1129
+ "gather": {
1130
+ "logged": 1706411523285
1131
+ }
1132
+ },
1133
+ "outputs": [],
1134
+ "source": [
1135
+ "test_encodings = tokenizer.batch_encode_plus(dataset[\"val\"][\"text\"], \n",
1136
+ " max_length=None, \n",
1137
+ " padding='max_length', \n",
1138
+ " return_token_type_ids=True, \n",
1139
+ " truncation=True)"
1140
+ ]
1141
+ },
1142
+ {
1143
+ "cell_type": "markdown",
1144
+ "metadata": {},
1145
+ "source": [
1146
+ "Once we've made the data loadable by putting it into a `DataLoader`, we iterate over it batch by batch, collecting the logits, sigmoid probabilities and true labels for evaluation."
1147
+ ]
1148
+ },
1149
+ {
1150
+ "cell_type": "code",
1151
+ "execution_count": null,
1152
+ "metadata": {
1153
+ "datalore": {
1154
+ "hide_input_from_viewers": true,
1155
+ "hide_output_from_viewers": true,
1156
+ "node_id": "MWfGq2tTkJNzFiDoUPq2X7",
1157
+ "type": "CODE"
1158
+ },
1159
+ "gather": {
1160
+ "logged": 1706411543379
1161
+ }
1162
+ },
1163
+ "outputs": [],
1164
+ "source": [
1165
+ "test_data = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), \n",
1166
+ " torch.tensor(test_encodings['attention_mask']), \n",
1167
+ " torch.tensor(ds_enc[\"val\"][\"labels\"]), \n",
1168
+ " torch.tensor(test_encodings['token_type_ids']))\n",
1169
+ "test_dataloader = torch.utils.data.DataLoader(test_data, \n",
1170
+ " sampler=torch.utils.data.SequentialSampler(test_data), \n",
1171
+ " batch_size=BATCH_SIZE)"
1172
+ ]
1173
+ },
1174
+ {
1175
+ "cell_type": "code",
1176
+ "execution_count": null,
1177
+ "metadata": {
1178
+ "datalore": {
1179
+ "hide_input_from_viewers": true,
1180
+ "hide_output_from_viewers": true,
1181
+ "node_id": "1SJCSrQTRCexFCNCIyRrzL",
1182
+ "type": "CODE"
1183
+ },
1184
+ "gather": {
1185
+ "logged": 1706411587843
1186
+ }
1187
+ },
1188
+ "outputs": [],
1189
+ "source": [
1190
+ "model.eval()\n",
1191
+ "\n",
1192
+ "logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []\n",
1193
+ "\n",
1194
+ "for i, batch in enumerate(test_dataloader):\n",
1195
+ " batch = tuple(t.to(device) for t in batch)\n",
1196
+ " \n",
1197
+ " # Unpack the inputs from our dataloader\n",
1198
+ " b_input_ids, b_input_mask, b_labels, b_token_types = batch\n",
1199
+ " \n",
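+ "    # Disable autograd for inference; sigmoid converts the per-class logits to probabilities\n",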
1200
+ " with torch.no_grad():\n",
1201
+ " outs = model(b_input_ids, attention_mask=b_input_mask)\n",
1202
+ " b_logit_pred = outs[0]\n",
1203
+ " pred_label = torch.sigmoid(b_logit_pred)\n",
1204
+ "\n",
1205
+ " b_logit_pred = b_logit_pred.detach().cpu().numpy()\n",
1206
+ " pred_label = pred_label.to('cpu').numpy()\n",
1207
+ " b_labels = b_labels.to('cpu').numpy()\n",
1208
+ "\n",
1209
+ " tokenized_texts.append(b_input_ids)\n",
1210
+ " logit_preds.append(b_logit_pred)\n",
1211
+ " true_labels.append(b_labels)\n",
1212
+ " pred_labels.append(pred_label)\n",
1213
+ "\n",
1214
+ "# Flatten outputs\n",
1215
+ "tokenized_texts = [item for sublist in tokenized_texts for item in sublist]\n",
1216
+ "pred_labels = [item for sublist in pred_labels for item in sublist]\n",
1217
+ "true_labels = [item for sublist in true_labels for item in sublist]\n",
1218
+ "\n",
1219
+ "# Converting flattened binary values to boolean values\n",
1220
+ "true_bools = [tl == 1 for tl in true_labels]\n",
1221
+ "pred_bools = [pl > 0.50 for pl in pred_labels] "
1222
+ ]
1223
+ },
1224
+ {
1225
+ "cell_type": "markdown",
1226
+ "metadata": {},
1227
+ "source": [
1228
+ "We create a classification report:"
1229
+ ]
1230
+ },
1231
+ {
1232
+ "cell_type": "code",
1233
+ "execution_count": null,
1234
+ "metadata": {
1235
+ "datalore": {
1236
+ "hide_input_from_viewers": true,
1237
+ "hide_output_from_viewers": true,
1238
+ "node_id": "eBprrgF086mznPbPVBpOLS",
1239
+ "type": "CODE"
1240
+ },
1241
+ "gather": {
1242
+ "logged": 1706411588249
1243
+ }
1244
+ },
1245
+ "outputs": [],
1246
+ "source": [
1247
+ "print('Test micro-F1 score: ', f1_score(true_bools, pred_bools, average='micro'))\n",
1248
+ "print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools), '\\n')\n",
1249
+ "clf_report = classification_report(true_bools, pred_bools, target_names=CLASS_NAMES)\n",
1250
+ "print(clf_report)"
1251
+ ]
1252
+ },
1253
+ {
1254
+ "cell_type": "markdown",
1255
+ "metadata": {},
1256
+ "source": [
1257
+ "Finally, we render a 'head-to-head' comparison table that maps each report text to its actual and predicted labels."
1258
+ ]
1259
+ },
1260
+ {
1261
+ "cell_type": "code",
1262
+ "execution_count": null,
1263
+ "metadata": {
1264
+ "datalore": {
1265
+ "hide_input_from_viewers": true,
1266
+ "hide_output_from_viewers": true,
1267
+ "node_id": "yELHY0IEwMlMw3x6e7hoD1",
1268
+ "type": "CODE"
1269
+ },
1270
+ "gather": {
1271
+ "logged": 1706411588638
1272
+ }
1273
+ },
1274
+ "outputs": [],
1275
+ "source": [
1276
+ "# Creating a map of class names from class numbers\n",
1277
+ "idx2label = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))"
1278
+ ]
1279
+ },
1280
+ {
1281
+ "cell_type": "code",
1282
+ "execution_count": null,
1283
+ "metadata": {
1284
+ "datalore": {
1285
+ "hide_input_from_viewers": true,
1286
+ "hide_output_from_viewers": true,
1287
+ "node_id": "jH0S35dDteUch01sa6me6e",
1288
+ "type": "CODE"
1289
+ },
1290
+ "gather": {
1291
+ "logged": 1706411589004
1292
+ }
1293
+ },
1294
+ "outputs": [],
1295
+ "source": [
1296
+ "true_label_idxs, pred_label_idxs = [], []\n",
1297
+ "\n",
1298
+ "for vals in true_bools:\n",
1299
+ " true_label_idxs.append(np.where(vals)[0].flatten().tolist())\n",
1300
+ "for vals in pred_bools:\n",
1301
+ " pred_label_idxs.append(np.where(vals)[0].flatten().tolist())"
1302
+ ]
1303
+ },
1304
+ {
1305
+ "cell_type": "code",
1306
+ "execution_count": null,
1307
+ "metadata": {
1308
+ "datalore": {
1309
+ "hide_input_from_viewers": true,
1310
+ "hide_output_from_viewers": true,
1311
+ "node_id": "h4vHL8XdGpayZ6xLGJUF6F",
1312
+ "type": "CODE"
1313
+ },
1314
+ "gather": {
1315
+ "logged": 1706411589301
1316
+ }
1317
+ },
1318
+ "outputs": [],
1319
+ "source": [
1320
+ "true_label_texts, pred_label_texts = [], []\n",
1321
+ "\n",
1322
+ "for vals in true_label_idxs:\n",
1323
+ " if vals:\n",
1324
+ " true_label_texts.append([idx2label[val] for val in vals])\n",
1325
+ " else:\n",
1326
+ " true_label_texts.append(vals)\n",
1327
+ "\n",
1328
+ "for vals in pred_label_idxs:\n",
1329
+ " if vals:\n",
1330
+ " pred_label_texts.append([idx2label[val] for val in vals])\n",
1331
+ " else:\n",
1332
+ " pred_label_texts.append(vals)"
1333
+ ]
1334
+ },
1335
+ {
1336
+ "cell_type": "code",
1337
+ "execution_count": null,
1338
+ "metadata": {
1339
+ "datalore": {
1340
+ "hide_input_from_viewers": true,
1341
+ "hide_output_from_viewers": true,
1342
+ "node_id": "SxUmVHfQISEeptg1SawOmB",
1343
+ "type": "CODE"
1344
+ },
1345
+ "gather": {
1346
+ "logged": 1706411591952
1347
+ }
1348
+ },
1349
+ "outputs": [],
1350
+ "source": [
1351
+ "symptom_texts = [tokenizer.decode(text,\n",
1352
+ " skip_special_tokens=True,\n",
1353
+ " clean_up_tokenization_spaces=False) for text in tokenized_texts]"
1354
+ ]
1355
+ },
1356
+ {
1357
+ "cell_type": "code",
1358
+ "execution_count": null,
1359
+ "metadata": {
1360
+ "datalore": {
1361
+ "hide_input_from_viewers": true,
1362
+ "hide_output_from_viewers": true,
1363
+ "node_id": "BxFNigNGRLTOqraI55BPSH",
1364
+ "type": "CODE"
1365
+ },
1366
+ "gather": {
1367
+ "logged": 1706411592512
1368
+ }
1369
+ },
1370
+ "outputs": [],
1371
+ "source": [
1372
+ "comparisons_df = pd.DataFrame({'symptom_text': symptom_texts, \n",
1373
+ " 'true_labels': true_label_texts, \n",
1374
+ " 'pred_labels':pred_label_texts})\n",
1375
+ "comparisons_df.to_csv('comparisons.csv')\n",
1376
+ "comparisons_df"
1377
+ ]
1378
+ }
1379
+ ],
1380
+ "metadata": {
1381
+ "datalore": {
1382
+ "base_environment": "default",
1383
+ "computation_mode": "JUPYTER",
1384
+ "package_manager": "pip",
1385
+ "packages": [
1386
+ {
1387
+ "name": "datasets",
1388
+ "source": "PIP",
1389
+ "version": "2.16.1"
1390
+ },
1391
+ {
1392
+ "name": "torch",
1393
+ "source": "PIP",
1394
+ "version": "2.1.2"
1395
+ },
1396
+ {
1397
+ "name": "accelerate",
1398
+ "source": "PIP",
1399
+ "version": "0.26.1"
1400
+ }
1401
+ ],
1402
+ "report_row_ids": [
1403
+ "un8W7ez7ZwoGb5Co6nydEV",
1404
+ "40nN9Hvgi1clHNV5RAemI5",
1405
+ "TgRD90H5NSPpKS41OeXI1w",
1406
+ "ZOm5BfUs3h1EGLaUkBGeEB",
1407
+ "kOP0CZWNSk6vqE3wkPp7Vc",
1408
+ "W4PWcOu2O2pRaZyoE2W80h",
1409
+ "RolbOnQLIftk0vy9mIcz5M",
1410
+ "8OPhUgbaNJmOdiq5D3a6vK",
1411
+ "5Qrt3jSvSrpK6Ne1hS6shL",
1412
+ "hTq7nFUrovN5Ao4u6dIYWZ",
1413
+ "I8WNZLpJ1DVP2wiCW7YBIB",
1414
+ "SawhU3I9BewSE1XBPstpNJ",
1415
+ "80EtLEl2FIE4FqbWnUD3nT"
1416
+ ],
1417
+ "version": 3
1418
+ },
1419
+ "kernelspec": {
1420
+ "display_name": "Python 3.8 - Pytorch and Tensorflow",
1421
+ "language": "python",
1422
+ "name": "python38-azureml-pt-tf"
1423
+ },
1424
+ "language_info": {
1425
+ "codemirror_mode": {
1426
+ "name": "ipython",
1427
+ "version": 3
1428
+ },
1429
+ "file_extension": ".py",
1430
+ "mimetype": "text/x-python",
1431
+ "name": "python",
1432
+ "nbconvert_exporter": "python",
1433
+ "pygments_lexer": "ipython3",
1434
+ "version": "3.8.5"
1435
+ },
1436
+ "microsoft": {
1437
+ "host": {
1438
+ "AzureML": {
1439
+ "notebookHasBeenCompleted": true
1440
+ }
1441
+ },
1442
+ "ms_spell_check": {
1443
+ "ms_spell_check_language": "en"
1444
+ }
1445
+ },
1446
+ "nteract": {
1447
+ "version": "nteract-front-end@1.0.0"
1448
+ }
1449
+ },
1450
+ "nbformat": 4,
1451
+ "nbformat_minor": 4
1452
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-16-26-9Z.ipynb ADDED
@@ -0,0 +1,1246 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
7
+ "\n",
8
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
9
+ ],
10
+ "metadata": {}
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "source": [
15
+ "# %pip install accelerate -U"
16
+ ],
17
+ "outputs": [],
18
+ "execution_count": 1,
19
+ "metadata": {
20
+ "nteract": {
21
+ "transient": {
22
+ "deleting": false
23
+ }
24
+ },
25
+ "tags": []
26
+ }
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "source": [
31
+ "%pip install transformers datasets shap watermark wandb scikit-multilearn"
32
+ ],
33
+ "outputs": [
34
+ {
35
+ "output_type": "stream",
36
+ "name": "stdout",
37
+ "text": "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\nRequirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\nRequirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\nRequirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\nRequirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\nCollecting scikit-multilearn\n Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.4/89.4 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\nRequirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\nRequirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\nRequirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\nRequirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\nRequirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\nRequirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\nRequirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\nRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\nRequirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\nRequirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\nRequirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\nRequirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\nRequirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\nRequirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\nRequirement already satisfied: numba in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\nRequirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\nRequirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\nRequirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\nRequirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\nRequirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\nRequirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\nRequirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\nRequirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\nRequirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\nRequirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\nRequirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\nRequirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\nRequirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\nRequirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\nRequirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\nRequirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\nRequirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\nRequirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\nRequirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\nRequirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\nRequirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\nRequirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\nRequirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\nRequirement already satisfied: stack-data in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\nRequirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\nRequirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\nRequirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\nRequirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\nRequirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\nRequirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\nRequirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\nRequirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\nRequirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\nRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\nRequirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: python-dateutil>=2.8.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\nRequirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\nRequirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\nRequirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\nRequirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\nRequirement already satisfied: wcwidth in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\nRequirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\nRequirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\nRequirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\nInstalling collected packages: scikit-multilearn\nSuccessfully installed scikit-multilearn-0.2.0\nNote: you may need to restart the kernel to use updated packages.\n"
38
+ }
39
+ ],
40
+ "execution_count": 1,
41
+ "metadata": {
42
+ "nteract": {
43
+ "transient": {
44
+ "deleting": false
45
+ }
46
+ }
47
+ }
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "source": [
52
+ "import pandas as pd\n",
53
+ "import numpy as np\n",
54
+ "import torch\n",
55
+ "import os\n",
56
+ "from typing import List\n",
57
+ "from sklearn.metrics import f1_score, accuracy_score, classification_report\n",
58
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline\n",
59
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
60
+ "from pyarrow import Table\n",
61
+ "import shap\n",
62
+ "import wandb\n",
63
+ "from skmultilearn.problem_transform import LabelPowerset\n",
64
+ "\n",
65
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
66
+ "\n",
67
+ "%load_ext watermark"
68
+ ],
69
+ "outputs": [
70
+ {
71
+ "output_type": "stream",
72
+ "name": "stderr",
73
+ "text": "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\n2024-01-28 15:09:42.856486: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\nTo enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n2024-01-28 15:09:43.818179: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n2024-01-28 15:09:43.818307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n2024-01-28 15:09:43.818321: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
74
+ }
75
+ ],
76
+ "execution_count": 2,
77
+ "metadata": {
78
+ "datalore": {
79
+ "hide_input_from_viewers": false,
80
+ "hide_output_from_viewers": false,
81
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
82
+ "report_properties": {
83
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
84
+ },
85
+ "type": "CODE"
86
+ },
87
+ "gather": {
88
+ "logged": 1706454586481
89
+ },
90
+ "tags": []
91
+ }
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "source": [
96
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
97
+ "\n",
98
+ "SEED: int = 42\n",
99
+ "\n",
100
+ "BATCH_SIZE: int = 16\n",
101
+ "EPOCHS: int = 3\n",
102
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
103
+ "\n",
104
+ "CLASS_NAMES: List[str] = [\"DIED\",\n",
105
+ " \"ER_VISIT\",\n",
106
+ " \"HOSPITAL\",\n",
107
+ " \"OFC_VISIT\",\n",
108
+ " #\"X_STAY\", # pruned\n",
109
+ " #\"DISABLE\", # pruned\n",
110
+ " #\"D_PRESENTED\" # pruned\n",
111
+ " ]\n",
112
+ "\n",
113
+ "\n",
114
+ "\n",
115
+ "\n",
116
+ "# WandB configuration\n",
117
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA model training\" # name your W&B project\n",
118
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
119
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
120
+ ],
121
+ "outputs": [],
122
+ "execution_count": 3,
123
+ "metadata": {
124
+ "collapsed": false,
125
+ "gather": {
126
+ "logged": 1706454586654
127
+ },
128
+ "jupyter": {
129
+ "outputs_hidden": false
130
+ }
131
+ }
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "source": [
136
+ "%watermark --iversion"
137
+ ],
138
+ "outputs": [
139
+ {
140
+ "output_type": "stream",
141
+ "name": "stdout",
142
+ "text": "shap : 0.44.1\nlogging: 0.5.1.2\npandas : 2.0.2\nnumpy : 1.23.5\ntorch : 1.12.0\nwandb : 0.16.2\nre : 2.2.1\n\n"
143
+ }
144
+ ],
145
+ "execution_count": 4,
146
+ "metadata": {
147
+ "collapsed": false,
148
+ "jupyter": {
149
+ "outputs_hidden": false
150
+ }
151
+ }
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "source": [
156
+ "!nvidia-smi"
157
+ ],
158
+ "outputs": [
159
+ {
160
+ "output_type": "stream",
161
+ "name": "stdout",
162
+ "text": "Sun Jan 28 15:09:47 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. |\r\n|=========================================+======================+======================|\r\n| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n| N/A 30C P0 38W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n| N/A 29C P0 38W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n"
163
+ }
164
+ ],
165
+ "execution_count": 5,
166
+ "metadata": {
167
+ "datalore": {
168
+ "hide_input_from_viewers": true,
169
+ "hide_output_from_viewers": true,
170
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
171
+ "type": "CODE"
172
+ }
173
+ }
174
+ },
175
+ {
176
+ "cell_type": "markdown",
177
+ "source": [
178
+ "## Loading the data set"
179
+ ],
180
+ "metadata": {
181
+ "datalore": {
182
+ "hide_input_from_viewers": false,
183
+ "hide_output_from_viewers": false,
184
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
185
+ "report_properties": {
186
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
187
+ },
188
+ "type": "MD"
189
+ }
190
+ }
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "source": [
195
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
196
+ ],
197
+ "outputs": [],
198
+ "execution_count": 7,
199
+ "metadata": {
200
+ "collapsed": false,
201
+ "gather": {
202
+ "logged": 1706449040507
203
+ },
204
+ "jupyter": {
205
+ "outputs_hidden": false
206
+ }
207
+ }
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "source": [
212
+ "dataset"
213
+ ],
214
+ "outputs": [
215
+ {
216
+ "output_type": "execute_result",
217
+ "execution_count": 8,
218
+ "data": {
219
+ "text/plain": "DatasetDict({\n train: Dataset({\n features: ['id', 'text', 'labels'],\n num_rows: 1270444\n })\n test: Dataset({\n features: ['id', 'text', 'labels'],\n num_rows: 272238\n })\n val: Dataset({\n features: ['id', 'text', 'labels'],\n num_rows: 272238\n })\n})"
220
+ },
221
+ "metadata": {}
222
+ }
223
+ ],
224
+ "execution_count": 8,
225
+ "metadata": {
226
+ "collapsed": false,
227
+ "gather": {
228
+ "logged": 1706449044205
229
+ },
230
+ "jupyter": {
231
+ "outputs_hidden": false,
232
+ "source_hidden": false
233
+ },
234
+ "nteract": {
235
+ "transient": {
236
+ "deleting": false
237
+ }
238
+ }
239
+ }
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "source": [
244
+ "SUBSAMPLING: float = 0.1"
245
+ ],
246
+ "outputs": [],
247
+ "execution_count": 9,
248
+ "metadata": {}
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "source": [
253
+ "def minisample(ds: DatasetDict, fraction: float) -> DatasetDict:\n",
254
+ " res = DatasetDict()\n",
255
+ "\n",
256
+ " res[\"train\"] = Dataset.from_dict(ds[\"train\"].shuffle()[:round(len(ds[\"train\"]) * fraction)])\n",
257
+ " res[\"test\"] = Dataset.from_dict(ds[\"test\"].shuffle()[:round(len(ds[\"test\"]) * fraction)])\n",
258
+ " res[\"val\"] = Dataset.from_dict(ds[\"val\"].shuffle()[:round(len(ds[\"val\"]) * fraction)])\n",
259
+ " \n",
260
+ " return res"
261
+ ],
262
+ "outputs": [],
263
+ "execution_count": 10,
264
+ "metadata": {
265
+ "collapsed": false,
266
+ "gather": {
267
+ "logged": 1706449378281
268
+ },
269
+ "jupyter": {
270
+ "outputs_hidden": false,
271
+ "source_hidden": false
272
+ },
273
+ "nteract": {
274
+ "transient": {
275
+ "deleting": false
276
+ }
277
+ }
278
+ }
279
+ },
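Note that `shuffle()` above runs unseeded, so each invocation draws a different subsample even though a `SEED` constant is defined earlier. A minimal sketch of a reproducible variant, assuming the `datasets` objects defined above (the `minisample_seeded` name is hypothetical):

```python
# Seeded variant of minisample(): same logic, but reproducible draws.
def minisample_seeded(ds: DatasetDict, fraction: float, seed: int = SEED) -> DatasetDict:
    res = DatasetDict()
    for split in ("train", "test", "val"):
        n = round(len(ds[split]) * fraction)
        res[split] = Dataset.from_dict(ds[split].shuffle(seed=seed)[:n])
    return res
```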
280
+ {
281
+ "cell_type": "code",
282
+ "source": [
283
+ "dataset = minisample(dataset, SUBSAMPLING)"
284
+ ],
285
+ "outputs": [],
286
+ "execution_count": 11,
287
+ "metadata": {
288
+ "collapsed": false,
289
+ "gather": {
290
+ "logged": 1706449384162
291
+ },
292
+ "jupyter": {
293
+ "outputs_hidden": false,
294
+ "source_hidden": false
295
+ },
296
+ "nteract": {
297
+ "transient": {
298
+ "deleting": false
299
+ }
300
+ }
301
+ }
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "source": [
306
+ "dataset"
307
+ ],
308
+ "outputs": [
309
+ {
310
+ "output_type": "execute_result",
311
+ "execution_count": 12,
312
+ "data": {
313
+ "text/plain": "DatasetDict({\n train: Dataset({\n features: ['id', 'text', 'labels'],\n num_rows: 127044\n })\n test: Dataset({\n features: ['id', 'text', 'labels'],\n num_rows: 27224\n })\n val: Dataset({\n features: ['id', 'text', 'labels'],\n num_rows: 27224\n })\n})"
314
+ },
315
+ "metadata": {}
316
+ }
317
+ ],
318
+ "execution_count": 12,
319
+ "metadata": {
320
+ "collapsed": false,
321
+ "gather": {
322
+ "logged": 1706449387981
323
+ },
324
+ "jupyter": {
325
+ "outputs_hidden": false,
326
+ "source_hidden": false
327
+ },
328
+ "nteract": {
329
+ "transient": {
330
+ "deleting": false
331
+ }
332
+ }
333
+ }
334
+ },
335
+ {
336
+ "cell_type": "markdown",
337
+ "source": [
338
+ "We prune things down to the first four keys: `DIED`, `ER_VISIT`, `HOSPITAL`, `OFC_VISIT`."
339
+ ],
340
+ "metadata": {
341
+ "nteract": {
342
+ "transient": {
343
+ "deleting": false
344
+ }
345
+ }
346
+ }
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "source": [
351
+ "ds = DatasetDict()\n",
352
+ "\n",
353
+ "for i in [\"test\", \"train\", \"val\"]:\n",
354
+ " tab = Table.from_arrays([dataset[i][\"id\"], dataset[i][\"text\"], [i[:4] for i in dataset[i][\"labels\"]]], names=[\"id\", \"text\", \"labels\"])\n",
355
+ " ds[i] = Dataset(tab)\n",
356
+ "\n",
357
+ "dataset = ds"
358
+ ],
359
+ "outputs": [],
360
+ "execution_count": 13,
361
+ "metadata": {
362
+ "collapsed": false,
363
+ "gather": {
364
+ "logged": 1706449443055
365
+ },
366
+ "jupyter": {
367
+ "outputs_hidden": false,
368
+ "source_hidden": false
369
+ },
370
+ "nteract": {
371
+ "transient": {
372
+ "deleting": false
373
+ }
374
+ }
375
+ }
376
+ },
377
+ {
378
+ "cell_type": "markdown",
379
+ "source": [
380
+ "### Tokenisation and encoding"
381
+ ],
382
+ "metadata": {}
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "source": [
387
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
388
+ ],
389
+ "outputs": [],
390
+ "execution_count": 14,
391
+ "metadata": {
392
+ "datalore": {
393
+ "hide_input_from_viewers": true,
394
+ "hide_output_from_viewers": true,
395
+ "node_id": "I7n646PIscsUZRoHu6m7zm",
396
+ "type": "CODE"
397
+ },
398
+ "gather": {
399
+ "logged": 1706449638377
400
+ }
401
+ }
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "source": [
406
+ "def tokenize_and_encode(examples):\n",
407
+ " return tokenizer(examples[\"text\"], truncation=True)"
408
+ ],
409
+ "outputs": [],
410
+ "execution_count": 15,
411
+ "metadata": {
412
+ "datalore": {
413
+ "hide_input_from_viewers": true,
414
+ "hide_output_from_viewers": true,
415
+ "node_id": "QBLOSI0yVIslV7v7qX9ZC3",
416
+ "type": "CODE"
417
+ },
418
+ "gather": {
419
+ "logged": 1706449642580
420
+ }
421
+ }
422
+ },
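As a quick sanity check, `tokenize_and_encode` can be exercised on a toy report (the text below is illustrative, not drawn from the dataset) before mapping it over the whole corpus:

```python
# Batched input mirrors what dataset.map(..., batched=True) passes in.
sample = tokenize_and_encode({"text": ["Patient reported mild fever after vaccination."]})
print(list(sample.keys()))         # ['input_ids', 'attention_mask'] for DistilBERT
print(sample["input_ids"][0][:8])  # first few token ids, starting with [CLS]
```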
423
+ {
424
+ "cell_type": "code",
425
+ "source": [
426
+ "cols = dataset[\"train\"].column_names\n",
427
+ "cols.remove(\"labels\")\n",
428
+ "ds_enc = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)"
429
+ ],
430
+ "outputs": [
431
+ {
432
+ "output_type": "stream",
433
+ "name": "stderr",
434
+ "text": "Map: 100%|██████████| 27224/27224 [00:10<00:00, 2638.52 examples/s]\nMap: 100%|██████████| 127044/127044 [00:48<00:00, 2633.40 examples/s]\nMap: 100%|██████████| 27224/27224 [00:10<00:00, 2613.19 examples/s]\n"
435
+ }
436
+ ],
437
+ "execution_count": 16,
438
+ "metadata": {
439
+ "datalore": {
440
+ "hide_input_from_viewers": true,
441
+ "hide_output_from_viewers": true,
442
+ "node_id": "slHeNysZOX9uWS9PB7jFDb",
443
+ "type": "CODE"
444
+ },
445
+ "gather": {
446
+ "logged": 1706449721161
447
+ }
448
+ }
449
+ },
450
+ {
451
+ "cell_type": "markdown",
452
+ "source": [
453
+ "### Training"
454
+ ],
455
+ "metadata": {}
456
+ },
457
+ {
458
+ "cell_type": "code",
459
+ "source": [
460
+ "class MultiLabelTrainer(Trainer):\n",
461
+ " def compute_loss(self, model, inputs, return_outputs=False):\n",
462
+ " labels = inputs.pop(\"labels\")\n",
463
+ " outputs = model(**inputs)\n",
464
+ " logits = outputs.logits\n",
465
+ " loss_fct = torch.nn.BCEWithLogitsLoss()\n",
466
+ " loss = loss_fct(logits.view(-1, self.model.config.num_labels),\n",
467
+ " labels.float().view(-1, self.model.config.num_labels))\n",
468
+ " return (loss, outputs) if return_outputs else loss"
469
+ ],
470
+ "outputs": [],
471
+ "execution_count": 17,
472
+ "metadata": {
473
+ "datalore": {
474
+ "hide_input_from_viewers": true,
475
+ "hide_output_from_viewers": true,
476
+ "node_id": "itXWkbDw9sqbkMuDP84QoT",
477
+ "type": "CODE"
478
+ },
479
+ "gather": {
480
+ "logged": 1706449743072
481
+ }
482
+ }
483
+ },
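The custom `compute_loss` treats the task as multi-label: `BCEWithLogitsLoss` applies a per-label sigmoid and scores each of the four classes independently, rather than softmaxing across classes. A minimal sketch on a toy batch (shapes and values are assumptions for illustration):

```python
import torch

# Toy batch: 2 reports x 4 labels, raw logits and multi-hot targets.
logits = torch.tensor([[ 1.2, -0.3, 0.8, -2.0],
                       [-0.5,  2.1, 0.0,  0.3]])
labels = torch.tensor([[1., 0., 1., 0.],
                       [0., 1., 0., 1.]])

loss = torch.nn.BCEWithLogitsLoss()(logits, labels)
print(loss.item())  # mean binary cross-entropy over all 8 label slots
```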
484
+ {
485
+ "cell_type": "code",
486
+ "source": [
487
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(CLASS_NAMES)).to(\"cuda\")"
488
+ ],
489
+ "outputs": [
490
+ {
491
+ "output_type": "stream",
492
+ "name": "stderr",
493
+ "text": "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
494
+ }
495
+ ],
496
+ "execution_count": 18,
497
+ "metadata": {
498
+ "datalore": {
499
+ "hide_input_from_viewers": true,
500
+ "hide_output_from_viewers": true,
501
+ "node_id": "ZQU7aW6TV45VmhHOQRzcnF",
502
+ "type": "CODE"
503
+ },
504
+ "gather": {
505
+ "logged": 1706449761205
506
+ }
507
+ }
508
+ },
509
+ {
510
+ "cell_type": "code",
511
+ "source": [
512
+ "def accuracy_threshold(y_pred, y_true, threshold=.5, sigmoid=True):\n",
513
+ " y_pred = torch.from_numpy(y_pred)\n",
514
+ " y_true = torch.from_numpy(y_true)\n",
515
+ "\n",
516
+ " if sigmoid:\n",
517
+ " y_pred = y_pred.sigmoid()\n",
518
+ "\n",
519
+ " return ((y_pred > threshold) == y_true.bool()).float().mean().item()"
520
+ ],
521
+ "outputs": [],
522
+ "execution_count": 19,
523
+ "metadata": {
524
+ "datalore": {
525
+ "hide_input_from_viewers": true,
526
+ "hide_output_from_viewers": true,
527
+ "node_id": "swhgyyyxoGL8HjnXJtMuSW",
528
+ "type": "CODE"
529
+ },
530
+ "gather": {
531
+ "logged": 1706449761541
532
+ }
533
+ }
534
+ },
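A quick worked example of `accuracy_threshold` on toy values (one report, four labels), reusing the function defined above:

```python
import numpy as np

y_pred = np.array([[2.0, -1.0, 0.5, -3.0]])  # raw logits for one report
y_true = np.array([[1.0,  0.0, 0.0,  0.0]])

# After the sigmoid, the 0.5-thresholded predictions are [1, 0, 1, 0],
# matching 3 of the 4 true labels.
print(accuracy_threshold(y_pred, y_true))  # 0.75
```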
535
+ {
536
+ "cell_type": "code",
537
+ "source": [
538
+ "def compute_metrics(eval_pred):\n",
539
+ " predictions, labels = eval_pred\n",
540
+ " return {'accuracy_thresh': accuracy_threshold(predictions, labels)}"
541
+ ],
542
+ "outputs": [],
543
+ "execution_count": 20,
544
+ "metadata": {
545
+ "datalore": {
546
+ "hide_input_from_viewers": true,
547
+ "hide_output_from_viewers": true,
548
+ "node_id": "1Uq3HtkaBxtHNAnSwit5cI",
549
+ "type": "CODE"
550
+ },
551
+ "gather": {
552
+ "logged": 1706449761720
553
+ }
554
+ }
555
+ },
556
+ {
557
+ "cell_type": "code",
558
+ "source": [
559
+ "args = TrainingArguments(\n",
560
+ " output_dir=\"vaers\",\n",
561
+ " evaluation_strategy=\"epoch\",\n",
562
+ " learning_rate=2e-5,\n",
563
+ " per_device_train_batch_size=BATCH_SIZE,\n",
564
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
565
+ " num_train_epochs=EPOCHS,\n",
566
+ " weight_decay=.01,\n",
567
+ " logging_steps=1,\n",
568
+ " run_name=f\"daedra-training\",\n",
569
+ " report_to=[\"wandb\"]\n",
570
+ ")"
571
+ ],
572
+ "outputs": [],
573
+ "execution_count": 21,
574
+ "metadata": {
575
+ "datalore": {
576
+ "hide_input_from_viewers": true,
577
+ "hide_output_from_viewers": true,
578
+ "node_id": "1iPZOTKPwSkTgX5dORqT89",
579
+ "type": "CODE"
580
+ },
581
+ "gather": {
582
+ "logged": 1706449761893
583
+ }
584
+ }
585
+ },
586
+ {
587
+ "cell_type": "code",
588
+ "source": [
589
+ "multi_label_trainer = MultiLabelTrainer(\n",
590
+ " model, \n",
591
+ " args, \n",
592
+ " train_dataset=ds_enc[\"train\"], \n",
593
+ " eval_dataset=ds_enc[\"test\"], \n",
594
+ " compute_metrics=compute_metrics, \n",
595
+ " tokenizer=tokenizer\n",
596
+ ")"
597
+ ],
598
+ "outputs": [],
599
+ "execution_count": 22,
600
+ "metadata": {
601
+ "datalore": {
602
+ "hide_input_from_viewers": true,
603
+ "hide_output_from_viewers": true,
604
+ "node_id": "bnRkNvRYltLun6gCEgL7v0",
605
+ "type": "CODE"
606
+ },
607
+ "gather": {
608
+ "logged": 1706449769103
609
+ }
610
+ }
611
+ },
612
+ {
613
+ "cell_type": "code",
614
+ "source": [
615
+ "if SUBSAMPLING != 1.0:\n",
616
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
617
+ "else:\n",
618
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
619
+ " \n",
620
+ "wandb.init(name=\"init_evaluation_run\", tags=wandb_tag, magic=True)\n",
621
+ "\n",
622
+ "multi_label_trainer.evaluate()\n",
623
+ "wandb.finish()"
624
+ ],
625
+ "outputs": [
626
+ {
627
+ "output_type": "stream",
628
+ "name": "stderr",
629
+ "text": "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchrisvoncsefalvay\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m wandb.init() arguments ignored because wandb magic has already been initialized\n"
630
+ },
631
+ {
632
+ "output_type": "display_data",
633
+ "data": {
634
+ "text/html": "Tracking run with wandb version 0.16.2",
635
+ "text/plain": "<IPython.core.display.HTML object>"
636
+ },
637
+ "metadata": {}
638
+ },
639
+ {
640
+ "output_type": "display_data",
641
+ "data": {
642
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_141956-9lniqjvz</code>",
643
+ "text/plain": "<IPython.core.display.HTML object>"
644
+ },
645
+ "metadata": {}
646
+ },
647
+ {
648
+ "output_type": "display_data",
649
+ "data": {
650
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz' target=\"_blank\">init_evaluation_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>",
651
+ "text/plain": "<IPython.core.display.HTML object>"
652
+ },
653
+ "metadata": {}
654
+ },
655
+ {
656
+ "output_type": "display_data",
657
+ "data": {
658
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>",
659
+ "text/plain": "<IPython.core.display.HTML object>"
660
+ },
661
+ "metadata": {}
662
+ },
663
+ {
664
+ "output_type": "display_data",
665
+ "data": {
666
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz</a>",
667
+ "text/plain": "<IPython.core.display.HTML object>"
668
+ },
669
+ "metadata": {}
670
+ },
671
+ {
672
+ "output_type": "display_data",
673
+ "data": {
674
+ "text/html": "Finishing last run (ID:9lniqjvz) before initializing another...",
675
+ "text/plain": "<IPython.core.display.HTML object>"
676
+ },
677
+ "metadata": {}
678
+ },
679
+ {
680
+ "output_type": "display_data",
681
+ "data": {
682
+ "text/html": " View run <strong style=\"color:#cdcd00\">init_evaluation_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/9lniqjvz</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)",
683
+ "text/plain": "<IPython.core.display.HTML object>"
684
+ },
685
+ "metadata": {}
686
+ },
687
+ {
688
+ "output_type": "display_data",
689
+ "data": {
690
+ "text/html": "Find logs at: <code>./wandb/run-20240128_141956-9lniqjvz/logs</code>",
691
+ "text/plain": "<IPython.core.display.HTML object>"
692
+ },
693
+ "metadata": {}
694
+ },
695
+ {
696
+ "output_type": "display_data",
697
+ "data": {
698
+ "text/html": "Successfully finished last run (ID:9lniqjvz). Initializing new run:<br/>",
699
+ "text/plain": "<IPython.core.display.HTML object>"
700
+ },
701
+ "metadata": {}
702
+ },
703
+ {
704
+ "output_type": "display_data",
705
+ "data": {
706
+ "text/html": "Tracking run with wandb version 0.16.2",
707
+ "text/plain": "<IPython.core.display.HTML object>"
708
+ },
709
+ "metadata": {}
710
+ },
711
+ {
712
+ "output_type": "display_data",
713
+ "data": {
714
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_141958-5idmkcie</code>",
715
+ "text/plain": "<IPython.core.display.HTML object>"
716
+ },
717
+ "metadata": {}
718
+ },
719
+ {
720
+ "output_type": "display_data",
721
+ "data": {
722
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie' target=\"_blank\">init_evaluation_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>",
723
+ "text/plain": "<IPython.core.display.HTML object>"
724
+ },
725
+ "metadata": {}
726
+ },
727
+ {
728
+ "output_type": "display_data",
729
+ "data": {
730
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>",
731
+ "text/plain": "<IPython.core.display.HTML object>"
732
+ },
733
+ "metadata": {}
734
+ },
735
+ {
736
+ "output_type": "display_data",
737
+ "data": {
738
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie</a>",
739
+ "text/plain": "<IPython.core.display.HTML object>"
740
+ },
741
+ "metadata": {}
742
+ },
743
+ {
744
+ "output_type": "display_data",
745
+ "data": {
746
+ "text/html": "\n <div>\n \n <progress value='1003' max='851' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [851/851 26:26]\n </div>\n ",
747
+ "text/plain": "<IPython.core.display.HTML object>"
748
+ },
749
+ "metadata": {}
750
+ },
751
+ {
752
+ "output_type": "display_data",
753
+ "data": {
754
+ "text/html": "<style>\n table.wandb td:nth-child(1) { padding: 0 10px; text-align: left ; width: auto;} td:nth-child(2) {text-align: left ; width: 100%}\n .wandb-row { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: flex-start; width: 100% }\n .wandb-col { display: flex; flex-direction: column; flex-basis: 100%; flex: 1; padding: 10px; }\n </style>\n<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>eval/accuracy_thresh</td><td>▁</td></tr><tr><td>eval/loss</td><td>▁</td></tr><tr><td>eval/runtime</td><td>▁</td></tr><tr><td>eval/samples_per_second</td><td>▁</td></tr><tr><td>eval/steps_per_second</td><td>▁</td></tr><tr><td>train/global_step</td><td>▁</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>eval/accuracy_thresh</td><td>0.55198</td></tr><tr><td>eval/loss</td><td>0.68442</td></tr><tr><td>eval/runtime</td><td>105.0436</td></tr><tr><td>eval/samples_per_second</td><td>259.168</td></tr><tr><td>eval/steps_per_second</td><td>8.101</td></tr><tr><td>train/global_step</td><td>0</td></tr></table><br/></div></div>",
755
+ "text/plain": "<IPython.core.display.HTML object>"
756
+ },
757
+ "metadata": {}
758
+ },
759
+ {
760
+ "output_type": "display_data",
761
+ "data": {
762
+ "text/html": " View run <strong style=\"color:#cdcd00\">init_evaluation_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/5idmkcie</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)",
763
+ "text/plain": "<IPython.core.display.HTML object>"
764
+ },
765
+ "metadata": {}
766
+ },
767
+ {
768
+ "output_type": "display_data",
769
+ "data": {
770
+ "text/html": "Find logs at: <code>./wandb/run-20240128_141958-5idmkcie/logs</code>",
771
+ "text/plain": "<IPython.core.display.HTML object>"
772
+ },
773
+ "metadata": {}
774
+ }
775
+ ],
776
+ "execution_count": 23,
777
+ "metadata": {
778
+ "datalore": {
779
+ "hide_input_from_viewers": true,
780
+ "hide_output_from_viewers": true,
781
+ "node_id": "LO54PlDkWQdFrzV25FvduB",
782
+ "type": "CODE"
783
+ },
784
+ "gather": {
785
+ "logged": 1706449880674
786
+ }
787
+ }
788
+ },
789
+ {
790
+ "cell_type": "code",
791
+ "source": [
792
+ "if SUBSAMPLING != 1.0:\n",
793
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
794
+ "else:\n",
795
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
796
+ " \n",
797
+ "wandb.init(name=\"daedra_training_run\", tags=wandb_tag, magic=True)\n",
798
+ "\n",
799
+ "multi_label_trainer.train()\n",
800
+ "wandb.finish()"
801
+ ],
802
+ "outputs": [
803
+ {
804
+ "output_type": "display_data",
805
+ "data": {
806
+ "text/html": "Tracking run with wandb version 0.16.2",
807
+ "text/plain": "<IPython.core.display.HTML object>"
808
+ },
809
+ "metadata": {}
810
+ },
811
+ {
812
+ "output_type": "display_data",
813
+ "data": {
814
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_142151-2mcc0ibc</code>",
815
+ "text/plain": "<IPython.core.display.HTML object>"
816
+ },
817
+ "metadata": {}
818
+ },
819
+ {
820
+ "output_type": "display_data",
821
+ "data": {
822
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/2mcc0ibc' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>",
823
+ "text/plain": "<IPython.core.display.HTML object>"
824
+ },
825
+ "metadata": {}
826
+ },
827
+ {
828
+ "output_type": "display_data",
829
+ "data": {
830
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>",
831
+ "text/plain": "<IPython.core.display.HTML object>"
832
+ },
833
+ "metadata": {}
834
+ },
835
+ {
836
+ "output_type": "display_data",
837
+ "data": {
838
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/2mcc0ibc' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/2mcc0ibc</a>",
839
+ "text/plain": "<IPython.core.display.HTML object>"
840
+ },
841
+ "metadata": {}
842
+ },
843
+ {
844
+ "output_type": "display_data",
845
+ "data": {
846
+ "text/html": "\n <div>\n \n <progress value='3972' max='11913' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [ 3972/11913 24:20 < 48:40, 2.72 it/s, Epoch 1/3]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>",
847
+ "text/plain": "<IPython.core.display.HTML object>"
848
+ },
849
+ "metadata": {}
850
+ },
851
+ {
852
+ "output_type": "stream",
853
+ "name": "stderr",
854
+ "text": "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-500)... Done. 15.6s\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-1000)... Done. 22.7s\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-1500)... Done. 14.0s\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-2000)... Done. 15.2s\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-2500)... Done. 14.0s\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-3000)... Done. 12.4s\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-3500)... Done. 13.4s\n"
855
+ }
856
+ ],
857
+ "execution_count": null,
858
+ "metadata": {
859
+ "datalore": {
860
+ "hide_input_from_viewers": true,
861
+ "hide_output_from_viewers": true,
862
+ "node_id": "hf0Ei1QXEYDmBv1VNLZ4Zw",
863
+ "type": "CODE"
864
+ },
865
+ "gather": {
866
+ "logged": 1706449934637
867
+ }
868
+ }
869
+ },
870
+ {
871
+ "cell_type": "markdown",
872
+ "source": [
873
+ "### Evaluation"
874
+ ],
875
+ "metadata": {}
876
+ },
877
+ {
878
+ "cell_type": "markdown",
879
+ "source": [
880
+ "We instantiate a classifier `pipeline` and push it to CUDA."
881
+ ],
882
+ "metadata": {}
883
+ },
884
+ {
885
+ "cell_type": "code",
886
+ "source": [
887
+ "classifier = pipeline(\"text-classification\", \n",
888
+ " model, \n",
889
+ " tokenizer=tokenizer, \n",
890
+ " device=\"cuda:0\")"
891
+ ],
892
+ "outputs": [],
893
+ "execution_count": null,
894
+ "metadata": {
895
+ "datalore": {
896
+ "hide_input_from_viewers": true,
897
+ "hide_output_from_viewers": true,
898
+ "node_id": "kHoUdBeqcyVXDSGv54C4aE",
899
+ "type": "CODE"
900
+ },
901
+ "gather": {
902
+ "logged": 1706411459928
903
+ }
904
+ }
905
+ },
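Once instantiated, the pipeline can be called on a single report; the texts below are made-up examples, and passing `top_k=None` returns a score for every class rather than only the best one:

```python
# Illustrative inference on toy report texts (not from VAERS).
classifier("Patient experienced injection-site soreness and a mild fever for two days.")
classifier("Patient was admitted overnight for observation.", top_k=None)
```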
906
+ {
907
+ "cell_type": "markdown",
908
+ "source": [
909
+ "We use the same tokenizer used for training to tokenize/encode the validation set."
910
+ ],
911
+ "metadata": {}
912
+ },
913
+ {
914
+ "cell_type": "code",
915
+ "source": [
916
+ "test_encodings = tokenizer.batch_encode_plus(dataset[\"val\"][\"text\"], \n",
917
+ " max_length=None, \n",
918
+ " padding='max_length', \n",
919
+ " return_token_type_ids=True, \n",
920
+ " truncation=True)"
921
+ ],
922
+ "outputs": [],
923
+ "execution_count": null,
924
+ "metadata": {
925
+ "datalore": {
926
+ "hide_input_from_viewers": true,
927
+ "hide_output_from_viewers": true,
928
+ "node_id": "Dr5WCWA6jL51NR1fSrQu6Z",
929
+ "type": "CODE"
930
+ },
931
+ "gather": {
932
+ "logged": 1706411523285
933
+ }
934
+ }
935
+ },
936
+ {
937
+ "cell_type": "markdown",
938
+ "source": [
939
+ "Once we've made the data loadable by putting it into a `DataLoader`, we "
940
+ ],
941
+ "metadata": {}
942
+ },
943
+ {
944
+ "cell_type": "code",
945
+ "source": [
946
+ "test_data = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), \n",
947
+ " torch.tensor(test_encodings['attention_mask']), \n",
948
+ " torch.tensor(ds_enc[\"val\"][\"labels\"]), \n",
949
+ " torch.tensor(test_encodings['token_type_ids']))\n",
950
+ "test_dataloader = torch.utils.data.DataLoader(test_data, \n",
951
+ " sampler=torch.utils.data.SequentialSampler(test_data), \n",
952
+ " batch_size=BATCH_SIZE)"
953
+ ],
954
+ "outputs": [],
955
+ "execution_count": null,
956
+ "metadata": {
957
+ "datalore": {
958
+ "hide_input_from_viewers": true,
959
+ "hide_output_from_viewers": true,
960
+ "node_id": "MWfGq2tTkJNzFiDoUPq2X7",
961
+ "type": "CODE"
962
+ },
963
+ "gather": {
964
+ "logged": 1706411543379
965
+ }
966
+ }
967
+ },
968
+ {
969
+ "cell_type": "code",
970
+ "source": [
971
+ "model.eval()\n",
972
+ "\n",
973
+ "logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []\n",
974
+ "\n",
975
+ "for i, batch in enumerate(test_dataloader):\n",
976
+ " batch = tuple(t.to(device) for t in batch)\n",
977
+ " \n",
978
+ " # Unpack the inputs from our dataloader\n",
979
+ " b_input_ids, b_input_mask, b_labels, b_token_types = batch\n",
980
+ " \n",
981
+ " with torch.no_grad():\n",
982
+ " outs = model(b_input_ids, attention_mask=b_input_mask)\n",
983
+ " b_logit_pred = outs[0]\n",
984
+ " pred_label = torch.sigmoid(b_logit_pred)\n",
985
+ "\n",
986
+ " b_logit_pred = b_logit_pred.detach().cpu().numpy()\n",
987
+ " pred_label = pred_label.to('cpu').numpy()\n",
988
+ " b_labels = b_labels.to('cpu').numpy()\n",
989
+ "\n",
990
+ " tokenized_texts.append(b_input_ids)\n",
991
+ " logit_preds.append(b_logit_pred)\n",
992
+ " true_labels.append(b_labels)\n",
993
+ " pred_labels.append(pred_label)\n",
994
+ "\n",
995
+ "# Flatten outputs\n",
996
+ "tokenized_texts = [item for sublist in tokenized_texts for item in sublist]\n",
997
+ "pred_labels = [item for sublist in pred_labels for item in sublist]\n",
998
+ "true_labels = [item for sublist in true_labels for item in sublist]\n",
999
+ "\n",
1000
+ "# Converting flattened binary values to boolean values\n",
1001
+ "true_bools = [tl == 1 for tl in true_labels]\n",
1002
+ "pred_bools = [pl > 0.50 for pl in pred_labels] "
1003
+ ],
1004
+ "outputs": [],
1005
+ "execution_count": null,
1006
+ "metadata": {
1007
+ "datalore": {
1008
+ "hide_input_from_viewers": true,
1009
+ "hide_output_from_viewers": true,
1010
+ "node_id": "1SJCSrQTRCexFCNCIyRrzL",
1011
+ "type": "CODE"
1012
+ },
1013
+ "gather": {
1014
+ "logged": 1706411587843
1015
+ }
1016
+ }
1017
+ },
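The 0.50 cut-off behind `pred_bools` is a fixed assumption; a rough sketch of sweeping the threshold to see its effect on micro-F1, reusing the arrays gathered above:

```python
import numpy as np
from sklearn.metrics import f1_score

preds = np.array(pred_labels)              # sigmoid outputs, (n_samples, n_labels)
trues = np.array(true_labels).astype(int)  # multi-hot ground truth

for t in (0.3, 0.4, 0.5, 0.6):
    print(f"threshold={t}: micro-F1={f1_score(trues, preds > t, average='micro'):.4f}")
```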
1018
+ {
1019
+ "cell_type": "markdown",
1020
+ "source": [
1021
+ "We create a classification report:"
1022
+ ],
1023
+ "metadata": {}
1024
+ },
1025
+ {
1026
+ "cell_type": "code",
1027
+ "source": [
1028
+ "print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools, average='micro'))\n",
1029
+ "print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools), '\\n')\n",
1030
+ "clf_report = classification_report(true_bools, pred_bools, target_names=CLASS_NAMES)\n",
1031
+ "print(clf_report)"
1032
+ ],
1033
+ "outputs": [],
1034
+ "execution_count": null,
1035
+ "metadata": {
1036
+ "datalore": {
1037
+ "hide_input_from_viewers": true,
1038
+ "hide_output_from_viewers": true,
1039
+ "node_id": "eBprrgF086mznPbPVBpOLS",
1040
+ "type": "CODE"
1041
+ },
1042
+ "gather": {
1043
+ "logged": 1706411588249
1044
+ }
1045
+ }
1046
+ },
1047
+ {
1048
+ "cell_type": "markdown",
1049
+ "source": [
1050
+ "Finally, we render a 'head to head' comparison table that maps each text prediction to actual and predicted labels."
1051
+ ],
1052
+ "metadata": {}
1053
+ },
1054
+ {
1055
+ "cell_type": "code",
1056
+ "source": [
1057
+ "# Creating a map of class names from class numbers\n",
1058
+ "idx2label = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))"
1059
+ ],
1060
+ "outputs": [],
1061
+ "execution_count": null,
1062
+ "metadata": {
1063
+ "datalore": {
1064
+ "hide_input_from_viewers": true,
1065
+ "hide_output_from_viewers": true,
1066
+ "node_id": "yELHY0IEwMlMw3x6e7hoD1",
1067
+ "type": "CODE"
1068
+ },
1069
+ "gather": {
1070
+ "logged": 1706411588638
1071
+ }
1072
+ }
1073
+ },
1074
+ {
1075
+ "cell_type": "code",
1076
+ "source": [
1077
+ "true_label_idxs, pred_label_idxs = [], []\n",
1078
+ "\n",
1079
+ "for vals in true_bools:\n",
1080
+ " true_label_idxs.append(np.where(vals)[0].flatten().tolist())\n",
1081
+ "for vals in pred_bools:\n",
1082
+ " pred_label_idxs.append(np.where(vals)[0].flatten().tolist())"
1083
+ ],
1084
+ "outputs": [],
1085
+ "execution_count": null,
1086
+ "metadata": {
1087
+ "datalore": {
1088
+ "hide_input_from_viewers": true,
1089
+ "hide_output_from_viewers": true,
1090
+ "node_id": "jH0S35dDteUch01sa6me6e",
1091
+ "type": "CODE"
1092
+ },
1093
+ "gather": {
1094
+ "logged": 1706411589004
1095
+ }
1096
+ }
1097
+ },
1098
+ {
1099
+ "cell_type": "code",
1100
+ "source": [
1101
+ "true_label_texts, pred_label_texts = [], []\n",
1102
+ "\n",
1103
+ "for vals in true_label_idxs:\n",
1104
+ " if vals:\n",
1105
+ " true_label_texts.append([idx2label[val] for val in vals])\n",
1106
+ " else:\n",
1107
+ " true_label_texts.append(vals)\n",
1108
+ "\n",
1109
+ "for vals in pred_label_idxs:\n",
1110
+ " if vals:\n",
1111
+ " pred_label_texts.append([idx2label[val] for val in vals])\n",
1112
+ " else:\n",
1113
+ " pred_label_texts.append(vals)"
1114
+ ],
1115
+ "outputs": [],
1116
+ "execution_count": null,
1117
+ "metadata": {
1118
+ "datalore": {
1119
+ "hide_input_from_viewers": true,
1120
+ "hide_output_from_viewers": true,
1121
+ "node_id": "h4vHL8XdGpayZ6xLGJUF6F",
1122
+ "type": "CODE"
1123
+ },
1124
+ "gather": {
1125
+ "logged": 1706411589301
1126
+ }
1127
+ }
1128
+ },
1129
+ {
1130
+ "cell_type": "code",
1131
+ "source": [
1132
+ "symptom_texts = [tokenizer.decode(text,\n",
1133
+ " skip_special_tokens=True,\n",
1134
+ " clean_up_tokenization_spaces=False) for text in tokenized_texts]"
1135
+ ],
1136
+ "outputs": [],
1137
+ "execution_count": null,
1138
+ "metadata": {
1139
+ "datalore": {
1140
+ "hide_input_from_viewers": true,
1141
+ "hide_output_from_viewers": true,
1142
+ "node_id": "SxUmVHfQISEeptg1SawOmB",
1143
+ "type": "CODE"
1144
+ },
1145
+ "gather": {
1146
+ "logged": 1706411591952
1147
+ }
1148
+ }
1149
+ },
1150
+ {
1151
+ "cell_type": "code",
1152
+ "source": [
1153
+ "comparisons_df = pd.DataFrame({'symptom_text': symptom_texts, \n",
1154
+ " 'true_labels': true_label_texts, \n",
1155
+ " 'pred_labels':pred_label_texts})\n",
1156
+ "comparisons_df.to_csv('comparisons.csv')\n",
1157
+ "comparisons_df"
1158
+ ],
1159
+ "outputs": [],
1160
+ "execution_count": null,
1161
+ "metadata": {
1162
+ "datalore": {
1163
+ "hide_input_from_viewers": true,
1164
+ "hide_output_from_viewers": true,
1165
+ "node_id": "BxFNigNGRLTOqraI55BPSH",
1166
+ "type": "CODE"
1167
+ },
1168
+ "gather": {
1169
+ "logged": 1706411592512
1170
+ }
1171
+ }
1172
+ }
1173
+ ],
1174
+ "metadata": {
1175
+ "datalore": {
1176
+ "base_environment": "default",
1177
+ "computation_mode": "JUPYTER",
1178
+ "package_manager": "pip",
1179
+ "packages": [
1180
+ {
1181
+ "name": "datasets",
1182
+ "source": "PIP",
1183
+ "version": "2.16.1"
1184
+ },
1185
+ {
1186
+ "name": "torch",
1187
+ "source": "PIP",
1188
+ "version": "2.1.2"
1189
+ },
1190
+ {
1191
+ "name": "accelerate",
1192
+ "source": "PIP",
1193
+ "version": "0.26.1"
1194
+ }
1195
+ ],
1196
+ "report_row_ids": [
1197
+ "un8W7ez7ZwoGb5Co6nydEV",
1198
+ "40nN9Hvgi1clHNV5RAemI5",
1199
+ "TgRD90H5NSPpKS41OeXI1w",
1200
+ "ZOm5BfUs3h1EGLaUkBGeEB",
1201
+ "kOP0CZWNSk6vqE3wkPp7Vc",
1202
+ "W4PWcOu2O2pRaZyoE2W80h",
1203
+ "RolbOnQLIftk0vy9mIcz5M",
1204
+ "8OPhUgbaNJmOdiq5D3a6vK",
1205
+ "5Qrt3jSvSrpK6Ne1hS6shL",
1206
+ "hTq7nFUrovN5Ao4u6dIYWZ",
1207
+ "I8WNZLpJ1DVP2wiCW7YBIB",
1208
+ "SawhU3I9BewSE1XBPstpNJ",
1209
+ "80EtLEl2FIE4FqbWnUD3nT"
1210
+ ],
1211
+ "version": 3
1212
+ },
1213
+ "kernelspec": {
1214
+ "display_name": "Python 3.8 - Pytorch and Tensorflow",
1215
+ "language": "python",
1216
+ "name": "python38-azureml-pt-tf"
1217
+ },
1218
+ "language_info": {
1219
+ "name": "python",
1220
+ "version": "3.8.5",
1221
+ "mimetype": "text/x-python",
1222
+ "codemirror_mode": {
1223
+ "name": "ipython",
1224
+ "version": 3
1225
+ },
1226
+ "pygments_lexer": "ipython3",
1227
+ "nbconvert_exporter": "python",
1228
+ "file_extension": ".py"
1229
+ },
1230
+ "microsoft": {
1231
+ "host": {
1232
+ "AzureML": {
1233
+ "notebookHasBeenCompleted": true
1234
+ }
1235
+ },
1236
+ "ms_spell_check": {
1237
+ "ms_spell_check_language": "en"
1238
+ }
1239
+ },
1240
+ "nteract": {
1241
+ "version": "nteract-front-end@1.0.0"
1242
+ }
1243
+ },
1244
+ "nbformat": 4,
1245
+ "nbformat_minor": 4
1246
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-20-56-58Z.ipynb ADDED
@@ -0,0 +1,993 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
8
+ "\n",
9
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "metadata": {
16
+ "nteract": {
17
+ "transient": {
18
+ "deleting": false
19
+ }
20
+ },
21
+ "tags": []
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "# %pip install accelerate -U"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 2,
31
+ "metadata": {
32
+ "nteract": {
33
+ "transient": {
34
+ "deleting": false
35
+ }
36
+ }
37
+ },
38
+ "outputs": [
39
+ {
40
+ "name": "stdout",
41
+ "output_type": "stream",
42
+ "text": [
43
+ "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\n",
44
+ "Requirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\n",
45
+ "Requirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\n",
46
+ "Requirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\n",
47
+ "Requirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\n",
48
+ "Requirement already satisfied: scikit-multilearn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.2.0)\n",
49
+ "Requirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\n",
50
+ "Requirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\n",
51
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\n",
52
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\n",
53
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\n",
54
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\n",
55
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\n",
56
+ "Requirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\n",
57
+ "Requirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\n",
58
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\n",
59
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\n",
60
+ "Requirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\n",
61
+ "Requirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\n",
62
+ "Requirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\n",
63
+ "Requirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\n",
64
+ "Requirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\n",
65
+ "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\n",
66
+ "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\n",
67
+ "Requirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\n",
68
+ "Requirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\n",
69
+ "Requirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\n",
70
+ "Requirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\n",
71
+ "Requirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\n",
72
+ "Requirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\n",
73
+ "Requirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\n",
74
+ "Requirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\n",
75
+ "Requirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\n",
76
+ "Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\n",
77
+ "Requirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\n",
78
+ "Requirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\n",
79
+ "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\n",
80
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\n",
81
+ "Requirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\n",
82
+ "Requirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\n",
83
+ "Requirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\n",
84
+ "Requirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\n",
85
+ "Requirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\n",
86
+ "Requirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
87
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\n",
88
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\n",
89
+ "Requirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\n",
90
+ "Requirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\n",
91
+ "Requirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\n",
92
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\n",
93
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\n",
94
+ "Requirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\n",
95
+ "Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\n",
96
+ "Requirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\n",
97
+ "Requirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\n",
98
+ "Requirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\n",
99
+ "Requirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\n",
100
+ "Requirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\n",
101
+ "Requirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\n",
102
+ "Requirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\n",
103
+ "Requirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\n",
104
+ "Requirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\n",
105
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\n",
106
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\n",
107
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\n",
108
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\n",
109
+ "Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\n",
110
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\n",
111
+ "Requirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
112
+ "Requirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
113
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\n",
114
+ "Requirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\n",
115
+ "Requirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\n",
116
+ "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\n",
117
+ "Requirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\n",
118
+ "Requirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\n",
119
+ "Requirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\n",
120
+ "Requirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\n",
121
+ "Requirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\n",
122
+ "Note: you may need to restart the kernel to use updated packages.\n"
123
+ ]
124
+ }
125
+ ],
126
+ "source": [
127
+ "%pip install transformers datasets shap watermark wandb scikit-multilearn evaluate codecarbon"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 3,
133
+ "metadata": {
134
+ "datalore": {
135
+ "hide_input_from_viewers": false,
136
+ "hide_output_from_viewers": false,
137
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
138
+ "report_properties": {
139
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
140
+ },
141
+ "type": "CODE"
142
+ },
143
+ "gather": {
144
+ "logged": 1706454586481
145
+ },
146
+ "tags": []
147
+ },
148
+ "outputs": [
149
+ {
150
+ "name": "stderr",
151
+ "output_type": "stream",
152
+ "text": [
153
+ "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
154
+ " from .autonotebook import tqdm as notebook_tqdm\n",
155
+ "2024-01-28 19:47:15.508449: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
156
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
157
+ "2024-01-28 19:47:16.502791: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
158
+ "2024-01-28 19:47:16.502915: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
159
+ "2024-01-28 19:47:16.502928: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
160
+ ]
161
+ }
162
+ ],
163
+ "source": [
164
+ "import pandas as pd\n",
165
+ "import numpy as np\n",
166
+ "import torch\n",
167
+ "import os\n",
168
+ "from typing import List, Union\n",
169
+ "from sklearn.metrics import f1_score, accuracy_score, classification_report\n",
170
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline\n",
171
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
172
+ "from pyarrow import Table\n",
173
+ "import shap\n",
174
+ "import wandb\n",
175
+ "import evaluate\n",
176
+ "from codecarbon import EmissionsTracker\n",
177
+ "\n",
178
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
179
+ "tracker = EmissionsTracker()\n",
180
+ "\n",
181
+ "%load_ext watermark"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 4,
187
+ "metadata": {
188
+ "collapsed": false,
189
+ "gather": {
190
+ "logged": 1706454586654
191
+ },
192
+ "jupyter": {
193
+ "outputs_hidden": false
194
+ }
195
+ },
196
+ "outputs": [],
197
+ "source": [
198
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
199
+ "\n",
200
+ "SEED: int = 42\n",
201
+ "\n",
202
+ "BATCH_SIZE: int = 16\n",
203
+ "EPOCHS: int = 3\n",
204
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
205
+ "\n",
206
+ "# WandB configuration\n",
207
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
208
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
209
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": 5,
215
+ "metadata": {
216
+ "collapsed": false,
217
+ "jupyter": {
218
+ "outputs_hidden": false
219
+ }
220
+ },
221
+ "outputs": [
222
+ {
223
+ "name": "stdout",
224
+ "output_type": "stream",
225
+ "text": [
226
+ "numpy : 1.23.5\n",
227
+ "re : 2.2.1\n",
228
+ "evaluate: 0.4.1\n",
229
+ "pandas : 2.0.2\n",
230
+ "wandb : 0.16.2\n",
231
+ "shap : 0.44.1\n",
232
+ "torch : 1.12.0\n",
233
+ "logging : 0.5.1.2\n",
234
+ "\n"
235
+ ]
236
+ }
237
+ ],
238
+ "source": [
239
+ "%watermark --iversion"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 6,
245
+ "metadata": {
246
+ "datalore": {
247
+ "hide_input_from_viewers": true,
248
+ "hide_output_from_viewers": true,
249
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
250
+ "type": "CODE"
251
+ }
252
+ },
253
+ "outputs": [
254
+ {
255
+ "name": "stdout",
256
+ "output_type": "stream",
257
+ "text": [
258
+ "Sun Jan 28 19:47:19 2024 \n",
259
+ "+---------------------------------------------------------------------------------------+\n",
260
+ "| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\n",
261
+ "|-----------------------------------------+----------------------+----------------------+\n",
262
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
263
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
264
+ "| | | MIG M. |\n",
265
+ "|=========================================+======================+======================|\n",
266
+ "| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\n",
267
+ "| N/A 29C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
268
+ "| | | N/A |\n",
269
+ "+-----------------------------------------+----------------------+----------------------+\n",
270
+ "| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\n",
271
+ "| N/A 29C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
272
+ "| | | N/A |\n",
273
+ "+-----------------------------------------+----------------------+----------------------+\n",
274
+ " \n",
275
+ "+---------------------------------------------------------------------------------------+\n",
276
+ "| Processes: |\n",
277
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
278
+ "| ID ID Usage |\n",
279
+ "|=======================================================================================|\n",
280
+ "| No running processes found |\n",
281
+ "+---------------------------------------------------------------------------------------+\n"
282
+ ]
283
+ }
284
+ ],
285
+ "source": [
286
+ "!nvidia-smi"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "markdown",
291
+ "metadata": {
292
+ "datalore": {
293
+ "hide_input_from_viewers": false,
294
+ "hide_output_from_viewers": false,
295
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
296
+ "report_properties": {
297
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
298
+ },
299
+ "type": "MD"
300
+ }
301
+ },
302
+ "source": [
303
+ "## Loading the data set"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 7,
309
+ "metadata": {
310
+ "collapsed": false,
311
+ "gather": {
312
+ "logged": 1706449040507
313
+ },
314
+ "jupyter": {
315
+ "outputs_hidden": false
316
+ }
317
+ },
318
+ "outputs": [],
319
+ "source": [
320
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "code",
325
+ "execution_count": 8,
326
+ "metadata": {
327
+ "collapsed": false,
328
+ "gather": {
329
+ "logged": 1706449044205
330
+ },
331
+ "jupyter": {
332
+ "outputs_hidden": false,
333
+ "source_hidden": false
334
+ },
335
+ "nteract": {
336
+ "transient": {
337
+ "deleting": false
338
+ }
339
+ }
340
+ },
341
+ "outputs": [
342
+ {
343
+ "data": {
344
+ "text/plain": [
345
+ "DatasetDict({\n",
346
+ " train: Dataset({\n",
347
+ " features: ['id', 'text', 'label'],\n",
348
+ " num_rows: 1270444\n",
349
+ " })\n",
350
+ " test: Dataset({\n",
351
+ " features: ['id', 'text', 'label'],\n",
352
+ " num_rows: 272238\n",
353
+ " })\n",
354
+ " val: Dataset({\n",
355
+ " features: ['id', 'text', 'label'],\n",
356
+ " num_rows: 272238\n",
357
+ " })\n",
358
+ "})"
359
+ ]
360
+ },
361
+ "execution_count": 8,
362
+ "metadata": {},
363
+ "output_type": "execute_result"
364
+ }
365
+ ],
366
+ "source": [
367
+ "dataset"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": 9,
373
+ "metadata": {},
374
+ "outputs": [],
375
+ "source": [
376
+ "SUBSAMPLING = 0.1\n",
377
+ "\n",
378
+ "if SUBSAMPLING < 1:\n",
379
+ " _ = DatasetDict()\n",
380
+ " for each in dataset.keys():\n",
381
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
382
+ "\n",
383
+ " dataset = _"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "markdown",
388
+ "metadata": {},
389
+ "source": [
390
+ "## Tokenisation and encoding"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 10,
396
+ "metadata": {},
397
+ "outputs": [],
398
+ "source": [
399
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
400
+ " return ds_enc"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "markdown",
405
+ "metadata": {},
406
+ "source": [
407
+ "## Evaluation metrics"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "code",
412
+ "execution_count": 11,
413
+ "metadata": {},
414
+ "outputs": [],
415
+ "source": [
416
+ "accuracy = evaluate.load(\"accuracy\")\n",
417
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
418
+ "f1 = evaluate.load(\"f1\")"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": 12,
424
+ "metadata": {},
425
+ "outputs": [],
426
+ "source": [
427
+ "def compute_metrics(eval_pred):\n",
428
+ " predictions, labels = eval_pred\n",
429
+ " predictions = np.argmax(predictions, axis=1)\n",
430
+ " return {\n",
431
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
432
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
433
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
434
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
435
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
436
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
437
+ " }"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "markdown",
442
+ "metadata": {},
443
+ "source": [
444
+ "## Training"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "markdown",
449
+ "metadata": {},
450
+ "source": [
451
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 13,
457
+ "metadata": {},
458
+ "outputs": [],
459
+ "source": [
460
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
461
+ ]
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "execution_count": 14,
466
+ "metadata": {},
467
+ "outputs": [
468
+ {
469
+ "name": "stderr",
470
+ "output_type": "stream",
471
+ "text": [
472
+ "Map: 100%|██████████| 127044/127044 [00:53<00:00, 2384.54 examples/s]\n",
473
+ "Map: 100%|██████████| 27223/27223 [00:11<00:00, 2396.71 examples/s]\n",
474
+ "Map: 100%|██████████| 27223/27223 [00:11<00:00, 2375.38 examples/s]\n",
475
+ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
476
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
477
+ ]
478
+ }
479
+ ],
480
+ "source": [
481
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
482
+ "\n",
483
+ "cols = dataset[\"train\"].column_names\n",
484
+ "cols.remove(\"label\")\n",
485
+ "ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True), batched=True, remove_columns=cols)\n",
486
+ "\n",
487
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
488
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
489
+ " id2label=label_map, \n",
490
+ " label2id={v:k for k,v in label_map.items()})\n",
491
+ "\n",
492
+ "args = TrainingArguments(\n",
493
+ " output_dir=\"vaers\",\n",
494
+ " evaluation_strategy=\"epoch\",\n",
495
+ " save_strategy=\"epoch\",\n",
496
+ " learning_rate=2e-5,\n",
497
+ " per_device_train_batch_size=BATCH_SIZE,\n",
498
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
499
+ " num_train_epochs=EPOCHS,\n",
500
+ " weight_decay=.01,\n",
501
+ " logging_steps=1,\n",
502
+ " load_best_model_at_end=True,\n",
503
+ " run_name=f\"daedra-training\",\n",
504
+ " report_to=[\"wandb\"])\n",
505
+ "\n",
506
+ "trainer = Trainer(\n",
507
+ " model=model,\n",
508
+ " args=args,\n",
509
+ " train_dataset=ds_enc[\"train\"],\n",
510
+ " eval_dataset=ds_enc[\"test\"],\n",
511
+ " tokenizer=tokenizer,\n",
512
+ " compute_metrics=compute_metrics)"
513
+ ]
514
+ },
515
+ {
516
+ "cell_type": "code",
517
+ "execution_count": 15,
518
+ "metadata": {},
519
+ "outputs": [
520
+ {
521
+ "name": "stderr",
522
+ "output_type": "stream",
523
+ "text": [
524
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchrisvoncsefalvay\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
525
+ "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m wandb.init() arguments ignored because wandb magic has already been initialized\n"
526
+ ]
527
+ },
528
+ {
529
+ "data": {
530
+ "text/html": [
531
+ "Tracking run with wandb version 0.16.2"
532
+ ],
533
+ "text/plain": [
534
+ "<IPython.core.display.HTML object>"
535
+ ]
536
+ },
537
+ "metadata": {},
538
+ "output_type": "display_data"
539
+ },
540
+ {
541
+ "data": {
542
+ "text/html": [
543
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_194842-yvxddyg6</code>"
544
+ ],
545
+ "text/plain": [
546
+ "<IPython.core.display.HTML object>"
547
+ ]
548
+ },
549
+ "metadata": {},
550
+ "output_type": "display_data"
551
+ },
552
+ {
553
+ "data": {
554
+ "text/html": [
555
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/yvxddyg6' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
556
+ ],
557
+ "text/plain": [
558
+ "<IPython.core.display.HTML object>"
559
+ ]
560
+ },
561
+ "metadata": {},
562
+ "output_type": "display_data"
563
+ },
564
+ {
565
+ "data": {
566
+ "text/html": [
567
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
568
+ ],
569
+ "text/plain": [
570
+ "<IPython.core.display.HTML object>"
571
+ ]
572
+ },
573
+ "metadata": {},
574
+ "output_type": "display_data"
575
+ },
576
+ {
577
+ "data": {
578
+ "text/html": [
579
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/yvxddyg6' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/yvxddyg6</a>"
580
+ ],
581
+ "text/plain": [
582
+ "<IPython.core.display.HTML object>"
583
+ ]
584
+ },
585
+ "metadata": {},
586
+ "output_type": "display_data"
587
+ },
588
+ {
589
+ "data": {
590
+ "text/html": [
591
+ "Finishing last run (ID:yvxddyg6) before initializing another..."
592
+ ],
593
+ "text/plain": [
594
+ "<IPython.core.display.HTML object>"
595
+ ]
596
+ },
597
+ "metadata": {},
598
+ "output_type": "display_data"
599
+ },
600
+ {
601
+ "data": {
602
+ "text/html": [
603
+ " View run <strong style=\"color:#cdcd00\">daedra_training_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/yvxddyg6' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/yvxddyg6</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
604
+ ],
605
+ "text/plain": [
606
+ "<IPython.core.display.HTML object>"
607
+ ]
608
+ },
609
+ "metadata": {},
610
+ "output_type": "display_data"
611
+ },
612
+ {
613
+ "data": {
614
+ "text/html": [
615
+ "Find logs at: <code>./wandb/run-20240128_194842-yvxddyg6/logs</code>"
616
+ ],
617
+ "text/plain": [
618
+ "<IPython.core.display.HTML object>"
619
+ ]
620
+ },
621
+ "metadata": {},
622
+ "output_type": "display_data"
623
+ },
624
+ {
625
+ "data": {
626
+ "text/html": [
627
+ "Successfully finished last run (ID:yvxddyg6). Initializing new run:<br/>"
628
+ ],
629
+ "text/plain": [
630
+ "<IPython.core.display.HTML object>"
631
+ ]
632
+ },
633
+ "metadata": {},
634
+ "output_type": "display_data"
635
+ },
636
+ {
637
+ "data": {
638
+ "text/html": [
639
+ "Tracking run with wandb version 0.16.2"
640
+ ],
641
+ "text/plain": [
642
+ "<IPython.core.display.HTML object>"
643
+ ]
644
+ },
645
+ "metadata": {},
646
+ "output_type": "display_data"
647
+ },
648
+ {
649
+ "data": {
650
+ "text/html": [
651
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_194845-9g8te2gf</code>"
652
+ ],
653
+ "text/plain": [
654
+ "<IPython.core.display.HTML object>"
655
+ ]
656
+ },
657
+ "metadata": {},
658
+ "output_type": "display_data"
659
+ },
660
+ {
661
+ "data": {
662
+ "text/html": [
663
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/9g8te2gf' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
664
+ ],
665
+ "text/plain": [
666
+ "<IPython.core.display.HTML object>"
667
+ ]
668
+ },
669
+ "metadata": {},
670
+ "output_type": "display_data"
671
+ },
672
+ {
673
+ "data": {
674
+ "text/html": [
675
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
676
+ ],
677
+ "text/plain": [
678
+ "<IPython.core.display.HTML object>"
679
+ ]
680
+ },
681
+ "metadata": {},
682
+ "output_type": "display_data"
683
+ },
684
+ {
685
+ "data": {
686
+ "text/html": [
687
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/9g8te2gf' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/9g8te2gf</a>"
688
+ ],
689
+ "text/plain": [
690
+ "<IPython.core.display.HTML object>"
691
+ ]
692
+ },
693
+ "metadata": {},
694
+ "output_type": "display_data"
695
+ },
696
+ {
697
+ "data": {
698
+ "text/html": [
699
+ "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/9g8te2gf?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
700
+ ],
701
+ "text/plain": [
702
+ "<wandb.sdk.wandb_run.Run at 0x7fb8b483bf40>"
703
+ ]
704
+ },
705
+ "execution_count": 15,
706
+ "metadata": {},
707
+ "output_type": "execute_result"
708
+ }
709
+ ],
710
+ "source": [
711
+ "if SUBSAMPLING != 1.0:\n",
712
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
713
+ "else:\n",
714
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
715
+ "\n",
716
+ "wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
717
+ "wandb_tag.append(f\"base:{model_ckpt}\")\n",
718
+ " \n",
719
+ "wandb.init(name=\"daedra_training_run\", tags=wandb_tag, magic=True)"
720
+ ]
721
+ },
722
+ {
723
+ "cell_type": "code",
724
+ "execution_count": 16,
725
+ "metadata": {},
726
+ "outputs": [
727
+ {
728
+ "name": "stderr",
729
+ "output_type": "stream",
730
+ "text": [
731
+ "Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"
732
+ ]
733
+ },
734
+ {
735
+ "data": {
736
+ "text/html": [
737
+ "\n",
738
+ " <div>\n",
739
+ " \n",
740
+ " <progress value='7943' max='11913' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
741
+ " [ 7943/11913 43:43 < 21:51, 3.03 it/s, Epoch 2/3]\n",
742
+ " </div>\n",
743
+ " <table border=\"1\" class=\"dataframe\">\n",
744
+ " <thead>\n",
745
+ " <tr style=\"text-align: left;\">\n",
746
+ " <th>Epoch</th>\n",
747
+ " <th>Training Loss</th>\n",
748
+ " <th>Validation Loss</th>\n",
749
+ " <th>Accuracy</th>\n",
750
+ " <th>Precision Macroaverage</th>\n",
751
+ " <th>Precision Microaverage</th>\n",
752
+ " <th>Recall Macroaverage</th>\n",
753
+ " <th>Recall Microaverage</th>\n",
754
+ " <th>F1 Microaverage</th>\n",
755
+ " </tr>\n",
756
+ " </thead>\n",
757
+ " <tbody>\n",
758
+ " <tr>\n",
759
+ " <td>1</td>\n",
760
+ " <td>0.251300</td>\n",
761
+ " <td>0.362917</td>\n",
762
+ " <td>0.865775</td>\n",
763
+ " <td>0.701081</td>\n",
764
+ " <td>0.865775</td>\n",
765
+ " <td>0.556570</td>\n",
766
+ " <td>0.865775</td>\n",
767
+ " <td>0.865775</td>\n",
768
+ " </tr>\n",
769
+ " <tr>\n",
770
+ " <td>2</td>\n",
771
+ " <td>0.036000</td>\n",
772
+ " <td>0.352118</td>\n",
773
+ " <td>0.870551</td>\n",
774
+ " <td>0.728051</td>\n",
775
+ " <td>0.870551</td>\n",
776
+ " <td>0.609787</td>\n",
777
+ " <td>0.870551</td>\n",
778
+ " <td>0.870551</td>\n",
779
+ " </tr>\n",
780
+ " </tbody>\n",
781
+ "</table><p>"
782
+ ],
783
+ "text/plain": [
784
+ "<IPython.core.display.HTML object>"
785
+ ]
786
+ },
787
+ "metadata": {},
788
+ "output_type": "display_data"
789
+ },
790
+ {
791
+ "name": "stderr",
792
+ "output_type": "stream",
793
+ "text": [
794
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-3971)... Done. 18.2s\n",
795
+ "Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
796
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-7942)... "
797
+ ]
798
+ }
799
+ ],
800
+ "source": [
801
+ "tracker.start()\n",
802
+ "trainer.train()\n",
803
+ "tracker.stop()\n"
804
+ ]
805
+ },
806
+ {
807
+ "cell_type": "code",
808
+ "execution_count": null,
809
+ "metadata": {},
810
+ "outputs": [
811
+ {
812
+ "data": {
813
+ "text/html": [
814
+ "<style>\n",
815
+ " table.wandb td:nth-child(1) { padding: 0 10px; text-align: left ; width: auto;} td:nth-child(2) {text-align: left ; width: 100%}\n",
816
+ " .wandb-row { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: flex-start; width: 100% }\n",
817
+ " .wandb-col { display: flex; flex-direction: column; flex-basis: 100%; flex: 1; padding: 10px; }\n",
818
+ " </style>\n",
819
+ "<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>eval/accuracy</td><td>▁▇█</td></tr><tr><td>eval/f1_microaverage</td><td>▁▇█</td></tr><tr><td>eval/loss</td><td>█▃▁</td></tr><tr><td>eval/precision_macroaverage</td><td>▁▇█</td></tr><tr><td>eval/precision_microaverage</td><td>▁▇█</td></tr><tr><td>eval/recall_macroaverage</td><td>▁▇█</td></tr><tr><td>eval/recall_microaverage</td><td>▁▇█</td></tr><tr><td>eval/runtime</td><td>▁▃█</td></tr><tr><td>eval/samples_per_second</td><td>█▆▁</td></tr><tr><td>eval/steps_per_second</td><td>█▆▁</td></tr><tr><td>train/epoch</td><td>▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███</td></tr><tr><td>train/global_step</td><td>▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███</td></tr><tr><td>train/learning_rate</td><td>████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁</td></tr><tr><td>train/loss</td><td>█▅▆▆▅▄▄▃▆▅▃▃▅▄▆▄▄▄▂▄▄▅▄▃▄▄▁▄▂▂▃▃▃▂▂▃▂▃▃▂</td></tr><tr><td>train/total_flos</td><td>▁</td></tr><tr><td>train/train_loss</td><td>▁</td></tr><tr><td>train/train_runtime</td><td>▁</td></tr><tr><td>train/train_samples_per_second</td><td>▁</td></tr><tr><td>train/train_steps_per_second</td><td>▁</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>eval/accuracy</td><td>0.84019</td></tr><tr><td>eval/f1_microaverage</td><td>0.84019</td></tr><tr><td>eval/loss</td><td>0.44011</td></tr><tr><td>eval/precision_macroaverage</td><td>0.415</td></tr><tr><td>eval/precision_microaverage</td><td>0.84019</td></tr><tr><td>eval/recall_macroaverage</td><td>0.40704</td></tr><tr><td>eval/recall_microaverage</td><td>0.84019</td></tr><tr><td>eval/runtime</td><td>10.0118</td></tr><tr><td>eval/samples_per_second</td><td>271.878</td></tr><tr><td>eval/steps_per_second</td><td>8.59</td></tr><tr><td>train/epoch</td><td>3.0</td></tr><tr><td>train/global_step</td><td>1191</td></tr><tr><td>train/learning_rate</td><td>0.0</td></tr><tr><td>train/loss</td><td>0.1782</td></tr><tr><td>train/total_flos</td><td>4885522962505728.0</td></tr><tr><td>train/train_loss</td><td>0.4724</td></tr><tr><td>train/train_runtime</td><td>483.5027</td></tr><tr><td>train/train_samples_per_second</td><td>78.825</td></tr><tr><td>train/train_steps_per_second</td><td>2.463</td></tr></table><br/></div></div>"
820
+ ],
821
+ "text/plain": [
822
+ "<IPython.core.display.HTML object>"
823
+ ]
824
+ },
825
+ "metadata": {},
826
+ "output_type": "display_data"
827
+ },
828
+ {
829
+ "data": {
830
+ "text/html": [
831
+ " View run <strong style=\"color:#cdcd00\">daedra_training_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/3xvt3c2y' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/3xvt3c2y</a><br/>Synced 5 W&B file(s), 0 media file(s), 40 artifact file(s) and 0 other file(s)"
832
+ ],
833
+ "text/plain": [
834
+ "<IPython.core.display.HTML object>"
835
+ ]
836
+ },
837
+ "metadata": {},
838
+ "output_type": "display_data"
839
+ },
840
+ {
841
+ "data": {
842
+ "text/html": [
843
+ "Find logs at: <code>./wandb/run-20240128_192000-3xvt3c2y/logs</code>"
844
+ ],
845
+ "text/plain": [
846
+ "<IPython.core.display.HTML object>"
847
+ ]
848
+ },
849
+ "metadata": {},
850
+ "output_type": "display_data"
851
+ }
852
+ ],
853
+ "source": [
854
+ "wandb.finish()"
855
+ ]
856
+ },
857
+ {
858
+ "cell_type": "code",
859
+ "execution_count": null,
860
+ "metadata": {},
861
+ "outputs": [
862
+ {
863
+ "data": {
864
+ "text/plain": [
865
+ "CommitInfo(commit_url='https://huggingface.co/chrisvoncsefalvay/daedra/commit/c482ca6c8520142a3e67df4be25a408e6b557053', commit_message='DAEDRA model trained on 1.0% of the full sample of the VAERS dataset (training set size: 12,704)', commit_description='', oid='c482ca6c8520142a3e67df4be25a408e6b557053', pr_url=None, pr_revision=None, pr_num=None)"
866
+ ]
867
+ },
868
+ "execution_count": 31,
869
+ "metadata": {},
870
+ "output_type": "execute_result"
871
+ }
872
+ ],
873
+ "source": [
874
+ "variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
875
+ "tokenizer._tokenizer.save(\"tokenizer.json\")\n",
876
+ "tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
877
+ "sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
878
+ "\n",
879
+ "model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
880
+ " variant=variant,\n",
881
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,})\")"
882
+ ]
883
+ },
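+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To sanity-check the upload, the model can be loaded back through the `pipeline` API. The cell below is a minimal sketch, assuming the push above succeeded and the Hub repository is accessible; the sample report text is invented for illustration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal inference sketch: reload the model just pushed to the Hub (assumes the repository is accessible)\n",
+ "classifier = pipeline(\"text-classification\", model=\"chrisvoncsefalvay/daedra\", tokenizer=tokenizer)\n",
+ "\n",
+ "# Hypothetical report text, for illustration only\n",
+ "classifier(\"Patient developed a mild fever and myalgia the day after vaccination.\")"
+ ]
+ },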
884
+ {
885
+ "cell_type": "code",
886
+ "execution_count": null,
887
+ "metadata": {},
888
+ "outputs": [],
889
+ "source": [
890
+ "from collections import Counter\n",
891
+ "\n",
892
+ "def get_most_frequent_unknown_tokens(tokenizer, dataset):\n",
893
+ " unknown_tokens = []\n",
894
+ " \n",
895
+ " # Tokenize each text in the dataset\n",
896
+ " for example in dataset:\n",
897
+ " tokens = tokenizer.tokenize(example['text'])\n",
898
+ " \n",
899
+ " # Check if each token is the 'unknown' special token\n",
900
+ " for token in tokens:\n",
901
+ " if token == tokenizer.unk_token:\n",
902
+ " unknown_tokens.append(token)\n",
903
+ " \n",
904
+ " # Count the frequency of each unique unknown token\n",
905
+ " token_counts = Counter(unknown_tokens)\n",
906
+ " \n",
907
+ " # Sort the tokens based on their frequency in descending order\n",
908
+ " most_frequent_tokens = token_counts.most_common()\n",
909
+ " \n",
910
+ " return most_frequent_tokens\n",
911
+ "\n",
912
+ "# Example usage\n",
913
+ "tokenizer = YourTokenizer() # Replace with your tokenizer\n",
914
+ "dataset = YourDataset() # Replace with your dataset\n",
915
+ "\n",
916
+ "most_frequent_unknown_tokens = get_most_frequent_unknown_tokens(tokenizer, dataset)\n",
917
+ "print(most_frequent_unknown_tokens)\n"
918
+ ]
919
+ }
920
+ ],
921
+ "metadata": {
922
+ "datalore": {
923
+ "base_environment": "default",
924
+ "computation_mode": "JUPYTER",
925
+ "package_manager": "pip",
926
+ "packages": [
927
+ {
928
+ "name": "datasets",
929
+ "source": "PIP",
930
+ "version": "2.16.1"
931
+ },
932
+ {
933
+ "name": "torch",
934
+ "source": "PIP",
935
+ "version": "2.1.2"
936
+ },
937
+ {
938
+ "name": "accelerate",
939
+ "source": "PIP",
940
+ "version": "0.26.1"
941
+ }
942
+ ],
943
+ "report_row_ids": [
944
+ "un8W7ez7ZwoGb5Co6nydEV",
945
+ "40nN9Hvgi1clHNV5RAemI5",
946
+ "TgRD90H5NSPpKS41OeXI1w",
947
+ "ZOm5BfUs3h1EGLaUkBGeEB",
948
+ "kOP0CZWNSk6vqE3wkPp7Vc",
949
+ "W4PWcOu2O2pRaZyoE2W80h",
950
+ "RolbOnQLIftk0vy9mIcz5M",
951
+ "8OPhUgbaNJmOdiq5D3a6vK",
952
+ "5Qrt3jSvSrpK6Ne1hS6shL",
953
+ "hTq7nFUrovN5Ao4u6dIYWZ",
954
+ "I8WNZLpJ1DVP2wiCW7YBIB",
955
+ "SawhU3I9BewSE1XBPstpNJ",
956
+ "80EtLEl2FIE4FqbWnUD3nT"
957
+ ],
958
+ "version": 3
959
+ },
960
+ "kernelspec": {
961
+ "display_name": "Python 3.8 - Pytorch and Tensorflow",
962
+ "language": "python",
963
+ "name": "python38-azureml-pt-tf"
964
+ },
965
+ "language_info": {
966
+ "codemirror_mode": {
967
+ "name": "ipython",
968
+ "version": 3
969
+ },
970
+ "file_extension": ".py",
971
+ "mimetype": "text/x-python",
972
+ "name": "python",
973
+ "nbconvert_exporter": "python",
974
+ "pygments_lexer": "ipython3",
975
+ "version": "3.8.5"
976
+ },
977
+ "microsoft": {
978
+ "host": {
979
+ "AzureML": {
980
+ "notebookHasBeenCompleted": true
981
+ }
982
+ },
983
+ "ms_spell_check": {
984
+ "ms_spell_check_language": "en"
985
+ }
986
+ },
987
+ "nteract": {
988
+ "version": "nteract-front-end@1.0.0"
989
+ }
990
+ },
991
+ "nbformat": 4,
992
+ "nbformat_minor": 4
993
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-23-54-39Z.ipynb ADDED
@@ -0,0 +1,692 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
7
+ "\n",
8
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
9
+ ],
10
+ "metadata": {}
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "source": [
15
+ "%pip install accelerate -U"
16
+ ],
17
+ "outputs": [
18
+ {
19
+ "output_type": "stream",
20
+ "name": "stdout",
21
+ "text": "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\nRequirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\nRequirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\nRequirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\nRequirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\nRequirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\nRequirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\nRequirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\nNote: you may need to restart the kernel to use updated packages.\n"
22
+ }
23
+ ],
24
+ "execution_count": 1,
25
+ "metadata": {
26
+ "nteract": {
27
+ "transient": {
28
+ "deleting": false
29
+ }
30
+ },
31
+ "tags": [],
32
+ "gather": {
33
+ "logged": 1706475754655
34
+ }
35
+ }
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "source": [
40
+ "%pip install transformers datasets shap watermark wandb evaluate codecarbon"
41
+ ],
42
+ "outputs": [
43
+ {
44
+ "output_type": "stream",
45
+ "name": "stdout",
46
+ "text": "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\nRequirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\nRequirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\nRequirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\nRequirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\nRequirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\nRequirement already satisfied: codecarbon in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.3.3)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\nRequirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\nRequirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\nRequirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\nRequirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\nRequirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\nRequirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\nRequirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\nRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\nRequirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\nRequirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\nRequirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\nRequirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\nRequirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\nRequirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\nRequirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from 
shap) (0.58.1)\nRequirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\nRequirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\nRequirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\nRequirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\nRequirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\nRequirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\nRequirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\nRequirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\nRequirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\nRequirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\nRequirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\nRequirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\nRequirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\nRequirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\nRequirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\nRequirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\nRequirement already satisfied: prometheus-client in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (0.19.0)\nRequirement already satisfied: py-cpuinfo in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (9.0.0)\nRequirement already satisfied: arrow in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (1.3.0)\nRequirement already satisfied: pynvml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (11.5.0)\nRequirement already satisfied: rapidfuzz in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (3.6.1)\nRequirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\nRequirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\nRequirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\nRequirement already satisfied: frozenlist>=1.1.1 in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\nRequirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\nRequirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\nRequirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\nRequirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\nRequirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\nRequirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\nRequirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\nRequirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\nRequirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\nRequirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\nRequirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\nRequirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\nRequirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\nRequirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\nRequirement already satisfied: python-dateutil>=2.7.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.2)\nRequirement already satisfied: types-python-dateutil>=2.8.10 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.19.20240106)\nRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\nRequirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: pytz>=2020.1 in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\nRequirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\nRequirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\nRequirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\nRequirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\nRequirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\nRequirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\nRequirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\nNote: you may need to restart the kernel to use updated packages.\n"
47
+ }
48
+ ],
49
+ "execution_count": 2,
50
+ "metadata": {
51
+ "nteract": {
52
+ "transient": {
53
+ "deleting": false
54
+ }
55
+ }
56
+ }
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "source": [
61
+ "import pandas as pd\n",
62
+ "import numpy as np\n",
63
+ "import torch\n",
64
+ "import os\n",
65
+ "from typing import List, Union\n",
66
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline\n",
67
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
68
+ "import shap\n",
69
+ "import wandb\n",
70
+ "import evaluate\n",
71
+ "from codecarbon import EmissionsTracker\n",
72
+ "\n",
73
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
74
+ "tracker = EmissionsTracker()\n",
75
+ "\n",
76
+ "%load_ext watermark"
77
+ ],
78
+ "outputs": [
79
+ {
80
+ "output_type": "stream",
81
+ "name": "stderr",
82
+ "text": "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\n2024-01-28 21:14:33.562898: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\nTo enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n2024-01-28 21:14:34.581816: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n2024-01-28 21:14:34.581943: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n2024-01-28 21:14:34.581956: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n[codecarbon INFO @ 21:14:37] [setup] RAM Tracking...\n[codecarbon INFO @ 21:14:37] [setup] GPU Tracking...\n[codecarbon INFO @ 21:14:37] Tracking Nvidia GPU via pynvml\n[codecarbon INFO @ 21:14:37] [setup] CPU Tracking...\n[codecarbon WARNING @ 21:14:37] No CPU tracking mode found. Falling back on CPU constant mode.\n[codecarbon WARNING @ 21:14:38] We saw that you have a Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz but we don't know it. Please contact us.\n[codecarbon INFO @ 21:14:38] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\n[codecarbon INFO @ 21:14:38] >>> Tracker's metadata:\n[codecarbon INFO @ 21:14:38] Platform system: Linux-5.15.0-1040-azure-x86_64-with-glibc2.10\n[codecarbon INFO @ 21:14:38] Python version: 3.8.5\n[codecarbon INFO @ 21:14:38] CodeCarbon version: 2.3.3\n[codecarbon INFO @ 21:14:38] Available RAM : 440.883 GB\n[codecarbon INFO @ 21:14:38] CPU count: 24\n[codecarbon INFO @ 21:14:38] CPU model: Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\n[codecarbon INFO @ 21:14:38] GPU count: 4\n[codecarbon INFO @ 21:14:38] GPU model: 4 x Tesla V100-PCIE-16GB\n[codecarbon WARNING @ 21:14:38] Cloud provider 'azure' do not publish electricity carbon intensity. Using country value instead.\n"
83
+ }
84
+ ],
85
+ "execution_count": 3,
86
+ "metadata": {
87
+ "datalore": {
88
+ "hide_input_from_viewers": false,
89
+ "hide_output_from_viewers": false,
90
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
91
+ "report_properties": {
92
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
93
+ },
94
+ "type": "CODE"
95
+ },
96
+ "gather": {
97
+ "logged": 1706476478659
98
+ },
99
+ "tags": []
100
+ }
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "source": [
105
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
106
+ "\n",
107
+ "SEED: int = 42\n",
108
+ "\n",
109
+ "BATCH_SIZE: int = 32\n",
110
+ "EPOCHS: int = 3\n",
111
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
112
+ "\n",
113
+ "# WandB configuration\n",
114
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
115
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
116
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
117
+ ],
118
+ "outputs": [],
119
+ "execution_count": 4,
120
+ "metadata": {
121
+ "collapsed": false,
122
+ "gather": {
123
+ "logged": 1706476478863
124
+ },
125
+ "jupyter": {
126
+ "outputs_hidden": false
127
+ }
128
+ }
129
+ },
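+ {
+ "cell_type": "code",
+ "source": [
+ "# A minimal reproducibility sketch (added; not part of the original run): SEED is used\n",
+ "# below for shuffling, and seeding the framework RNGs as well makes runs repeatable.\n",
+ "from transformers import set_seed\n",
+ "\n",
+ "set_seed(SEED)"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },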
130
+ {
131
+ "cell_type": "code",
132
+ "source": [
133
+ "%watermark --iversion"
134
+ ],
135
+ "outputs": [
136
+ {
137
+ "output_type": "stream",
138
+ "name": "stdout",
139
+ "text": "shap : 0.44.1\nre : 2.2.1\ntorch : 1.12.0\nevaluate: 0.4.1\nwandb : 0.16.2\nlogging : 0.5.1.2\npandas : 2.0.2\nnumpy : 1.23.5\n\n"
140
+ }
141
+ ],
142
+ "execution_count": 5,
143
+ "metadata": {
144
+ "collapsed": false,
145
+ "jupyter": {
146
+ "outputs_hidden": false
147
+ }
148
+ }
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "source": [
153
+ "!nvidia-smi"
154
+ ],
155
+ "outputs": [
156
+ {
157
+ "output_type": "stream",
158
+ "name": "stdout",
159
+ "text": "Sun Jan 28 21:14:38 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. |\r\n|=========================================+======================+======================|\r\n| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n| N/A 25C P0 23W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 2 Tesla V100-PCIE-16GB Off | 00000003:00:00.0 Off | Off |\r\n| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 3 Tesla V100-PCIE-16GB Off | 00000004:00:00.0 Off | Off |\r\n| N/A 27C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n"
160
+ }
161
+ ],
162
+ "execution_count": 6,
163
+ "metadata": {
164
+ "datalore": {
165
+ "hide_input_from_viewers": true,
166
+ "hide_output_from_viewers": true,
167
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
168
+ "type": "CODE"
169
+ }
170
+ }
171
+ },
172
+ {
173
+ "cell_type": "markdown",
174
+ "source": [
175
+ "## Loading the data set"
176
+ ],
177
+ "metadata": {
178
+ "datalore": {
179
+ "hide_input_from_viewers": false,
180
+ "hide_output_from_viewers": false,
181
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
182
+ "report_properties": {
183
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
184
+ },
185
+ "type": "MD"
186
+ }
187
+ }
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "source": [
192
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
193
+ ],
194
+ "outputs": [],
195
+ "execution_count": 7,
196
+ "metadata": {
197
+ "collapsed": false,
198
+ "gather": {
199
+ "logged": 1706476480469
200
+ },
201
+ "jupyter": {
202
+ "outputs_hidden": false
203
+ }
204
+ }
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "source": [
209
+ "dataset"
210
+ ],
211
+ "outputs": [
212
+ {
213
+ "output_type": "execute_result",
214
+ "execution_count": 8,
215
+ "data": {
216
+ "text/plain": "DatasetDict({\n train: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 1270444\n })\n test: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n val: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n})"
217
+ },
218
+ "metadata": {}
219
+ }
220
+ ],
221
+ "execution_count": 8,
222
+ "metadata": {
223
+ "collapsed": false,
224
+ "gather": {
225
+ "logged": 1706476480629
226
+ },
227
+ "jupyter": {
228
+ "outputs_hidden": false,
229
+ "source_hidden": false
230
+ },
231
+ "nteract": {
232
+ "transient": {
233
+ "deleting": false
234
+ }
235
+ }
236
+ }
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "source": [
241
+ "SUBSAMPLING = 0.5\n",
242
+ "\n",
243
+ "if SUBSAMPLING < 1:\n",
244
+ " _ = DatasetDict()\n",
245
+ " for each in dataset.keys():\n",
246
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
247
+ "\n",
248
+ " dataset = _"
249
+ ],
250
+ "outputs": [],
251
+ "execution_count": 9,
252
+ "metadata": {
253
+ "gather": {
254
+ "logged": 1706476480826
255
+ }
256
+ }
257
+ },
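+ {
+ "cell_type": "code",
+ "source": [
+ "# A sanity-check sketch (added; not part of the original run): random subsampling only\n",
+ "# approximately preserves class balance, so it is worth inspecting the label counts\n",
+ "# of each split after selection.\n",
+ "from collections import Counter\n",
+ "\n",
+ "{split: Counter(dataset[split][\"label\"]) for split in dataset.keys()}"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },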
258
+ {
259
+ "cell_type": "markdown",
260
+ "source": [
261
+ "## Tokenisation and encoding"
262
+ ],
263
+ "metadata": {}
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "source": [
268
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
269
+ " return ds_enc"
270
+ ],
271
+ "outputs": [],
272
+ "execution_count": 10,
273
+ "metadata": {
274
+ "gather": {
275
+ "logged": 1706476480944
276
+ }
277
+ }
278
+ },
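+ {
+ "cell_type": "code",
+ "source": [
+ "# A usage sketch for encode_ds (added; hypothetical, since the training cell below\n",
+ "# tokenises inline): encoding adds input_ids and attention_mask columns to each split.\n",
+ "# ds_enc = encode_ds(dataset)\n",
+ "# ds_enc[\"train\"].column_names"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },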
279
+ {
280
+ "cell_type": "markdown",
281
+ "source": [
282
+ "## Evaluation metrics"
283
+ ],
284
+ "metadata": {}
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "source": [
289
+ "accuracy = evaluate.load(\"accuracy\")\n",
290
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
291
+ "f1 = evaluate.load(\"f1\")"
292
+ ],
293
+ "outputs": [],
294
+ "execution_count": 11,
295
+ "metadata": {
296
+ "gather": {
297
+ "logged": 1706476481192
298
+ }
299
+ }
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "source": [
304
+ "def compute_metrics(eval_pred):\n",
305
+ " predictions, labels = eval_pred\n",
306
+ " predictions = np.argmax(predictions, axis=1)\n",
307
+ " return {\n",
308
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
309
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
310
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
311
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
312
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
313
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
314
+ " }"
315
+ ],
316
+ "outputs": [],
317
+ "execution_count": 12,
318
+ "metadata": {
319
+ "gather": {
320
+ "logged": 1706476481346
321
+ }
322
+ }
323
+ },
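+ {
+ "cell_type": "code",
+ "source": [
+ "# A minimal sketch (added; not part of the original run): sanity-check compute_metrics\n",
+ "# on dummy logits whose argmax matches the labels, so every metric should be 1.0.\n",
+ "dummy_logits = np.array([[2.0, 0.1, 0.2], [0.1, 3.0, 0.2]])\n",
+ "dummy_labels = np.array([0, 1])\n",
+ "compute_metrics((dummy_logits, dummy_labels))"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },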
324
+ {
325
+ "cell_type": "markdown",
326
+ "source": [
327
+ "## Training"
328
+ ],
329
+ "metadata": {}
330
+ },
331
+ {
332
+ "cell_type": "markdown",
333
+ "source": [
334
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
335
+ ],
336
+ "metadata": {}
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "source": [
341
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
342
+ ],
343
+ "outputs": [],
344
+ "execution_count": 13,
345
+ "metadata": {
346
+ "gather": {
347
+ "logged": 1706476481593
348
+ }
349
+ }
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "source": [
354
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
355
+ "\n",
356
+ "cols = dataset[\"train\"].column_names\n",
357
+ "cols.remove(\"label\")\n",
358
+ "ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True), batched=True, remove_columns=cols)\n",
359
+ "\n",
360
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
361
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
362
+ " id2label=label_map, \n",
363
+ " label2id={v:k for k,v in label_map.items()})\n",
364
+ "\n",
365
+ "args = TrainingArguments(\n",
366
+ " output_dir=\"vaers\",\n",
367
+ " evaluation_strategy=\"epoch\",\n",
368
+ " save_strategy=\"epoch\",\n",
369
+ " learning_rate=2e-5,\n",
370
+ " per_device_train_batch_size=BATCH_SIZE,\n",
371
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
372
+ " num_train_epochs=EPOCHS,\n",
373
+ " weight_decay=.01,\n",
374
+ " logging_steps=1,\n",
375
+ " load_best_model_at_end=True,\n",
376
+ " run_name=f\"daedra-training\",\n",
377
+ " report_to=[\"wandb\"])\n",
378
+ "\n",
379
+ "trainer = Trainer(\n",
380
+ " model=model,\n",
381
+ " args=args,\n",
382
+ " train_dataset=ds_enc[\"train\"],\n",
383
+ " eval_dataset=ds_enc[\"test\"],\n",
384
+ " tokenizer=tokenizer,\n",
385
+ " compute_metrics=compute_metrics)"
386
+ ],
387
+ "outputs": [
388
+ {
389
+ "output_type": "stream",
390
+ "name": "stderr",
391
+ "text": "Map: 100%|██████████| 635222/635222 [04:25<00:00, 2395.47 examples/s]\nMap: 100%|██████████| 136119/136119 [00:56<00:00, 2405.75 examples/s]\nMap: 100%|██████████| 136119/136119 [00:56<00:00, 2422.27 examples/s]\nSome weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
392
+ }
393
+ ],
394
+ "execution_count": 14,
395
+ "metadata": {
396
+ "gather": {
397
+ "logged": 1706476861739
398
+ }
399
+ }
400
+ },
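+ {
+ "cell_type": "code",
+ "source": [
+ "# Collation note (added; relies on documented Trainer behaviour): with a tokenizer\n",
+ "# passed and no data_collator given, Trainer pads each batch dynamically, i.e.\n",
+ "# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
+ "# which is why DataCollatorWithPadding is imported above."
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },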
401
+ {
402
+ "cell_type": "code",
403
+ "source": [
404
+ "if SUBSAMPLING != 1.0:\n",
405
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
406
+ "else:\n",
407
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
408
+ "\n",
409
+ "wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
410
+ "wandb_tag.append(f\"base:{model_ckpt}\")\n",
411
+ " \n",
412
+ "wandb.init(name=\"daedra_training_run\", tags=wandb_tag, magic=True)"
413
+ ],
414
+ "outputs": [
415
+ {
416
+ "output_type": "stream",
417
+ "name": "stderr",
418
+ "text": "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchrisvoncsefalvay\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m wandb.init() arguments ignored because wandb magic has already been initialized\n"
419
+ },
420
+ {
421
+ "output_type": "display_data",
422
+ "data": {
423
+ "text/plain": "<IPython.core.display.HTML object>",
424
+ "text/html": "Tracking run with wandb version 0.16.2"
425
+ },
426
+ "metadata": {}
427
+ },
428
+ {
429
+ "output_type": "display_data",
430
+ "data": {
431
+ "text/plain": "<IPython.core.display.HTML object>",
432
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_212103-403j5ij5</code>"
433
+ },
434
+ "metadata": {}
435
+ },
436
+ {
437
+ "output_type": "display_data",
438
+ "data": {
439
+ "text/plain": "<IPython.core.display.HTML object>",
440
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/403j5ij5' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
441
+ },
442
+ "metadata": {}
443
+ },
444
+ {
445
+ "output_type": "display_data",
446
+ "data": {
447
+ "text/plain": "<IPython.core.display.HTML object>",
448
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
449
+ },
450
+ "metadata": {}
451
+ },
452
+ {
453
+ "output_type": "display_data",
454
+ "data": {
455
+ "text/plain": "<IPython.core.display.HTML object>",
456
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/403j5ij5' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/403j5ij5</a>"
457
+ },
458
+ "metadata": {}
459
+ },
460
+ {
461
+ "output_type": "display_data",
462
+ "data": {
463
+ "text/plain": "<IPython.core.display.HTML object>",
464
+ "text/html": "Finishing last run (ID:403j5ij5) before initializing another..."
465
+ },
466
+ "metadata": {}
467
+ },
468
+ {
469
+ "output_type": "display_data",
470
+ "data": {
471
+ "text/plain": "<IPython.core.display.HTML object>",
472
+ "text/html": " View run <strong style=\"color:#cdcd00\">daedra_training_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/403j5ij5' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/403j5ij5</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
473
+ },
474
+ "metadata": {}
475
+ },
476
+ {
477
+ "output_type": "display_data",
478
+ "data": {
479
+ "text/plain": "<IPython.core.display.HTML object>",
480
+ "text/html": "Find logs at: <code>./wandb/run-20240128_212103-403j5ij5/logs</code>"
481
+ },
482
+ "metadata": {}
483
+ },
484
+ {
485
+ "output_type": "display_data",
486
+ "data": {
487
+ "text/plain": "<IPython.core.display.HTML object>",
488
+ "text/html": "Successfully finished last run (ID:403j5ij5). Initializing new run:<br/>"
489
+ },
490
+ "metadata": {}
491
+ },
492
+ {
493
+ "output_type": "display_data",
494
+ "data": {
495
+ "text/plain": "<IPython.core.display.HTML object>",
496
+ "text/html": "Tracking run with wandb version 0.16.2"
497
+ },
498
+ "metadata": {}
499
+ },
500
+ {
501
+ "output_type": "display_data",
502
+ "data": {
503
+ "text/plain": "<IPython.core.display.HTML object>",
504
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_212105-q65k78ea</code>"
505
+ },
506
+ "metadata": {}
507
+ },
508
+ {
509
+ "output_type": "display_data",
510
+ "data": {
511
+ "text/plain": "<IPython.core.display.HTML object>",
512
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/q65k78ea' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
513
+ },
514
+ "metadata": {}
515
+ },
516
+ {
517
+ "output_type": "display_data",
518
+ "data": {
519
+ "text/plain": "<IPython.core.display.HTML object>",
520
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
521
+ },
522
+ "metadata": {}
523
+ },
524
+ {
525
+ "output_type": "display_data",
526
+ "data": {
527
+ "text/plain": "<IPython.core.display.HTML object>",
528
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/q65k78ea' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/q65k78ea</a>"
529
+ },
530
+ "metadata": {}
531
+ },
532
+ {
533
+ "output_type": "execute_result",
534
+ "execution_count": 15,
535
+ "data": {
536
+ "text/html": "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/q65k78ea?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>",
537
+ "text/plain": "<wandb.sdk.wandb_run.Run at 0x7fea31898d90>"
538
+ },
539
+ "metadata": {}
540
+ }
541
+ ],
542
+ "execution_count": 15,
543
+ "metadata": {
544
+ "gather": {
545
+ "logged": 1706476872191
546
+ }
547
+ }
548
+ },
549
+ {
550
+ "cell_type": "code",
551
+ "source": [
552
+ "tracker.start()\n",
553
+ "trainer.train()\n",
554
+ "tracker.stop()\n"
555
+ ],
556
+ "outputs": [
557
+ {
558
+ "output_type": "stream",
559
+ "name": "stderr",
560
+ "text": "Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"
561
+ },
562
+ {
563
+ "output_type": "display_data",
564
+ "data": {
565
+ "text/plain": "<IPython.core.display.HTML object>",
566
+ "text/html": "\n <div>\n \n <progress value='2907' max='14889' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [ 2907/14889 31:21 < 2:09:21, 1.54 it/s, Epoch 0.59/3]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
567
+ },
568
+ "metadata": {}
569
+ },
570
+ {
571
+ "output_type": "stream",
572
+ "name": "stderr",
573
+ "text": "[codecarbon INFO @ 21:21:26] Energy consumed for RAM : 0.000690 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:21:26] Energy consumed for all GPUs : 0.001498 kWh. Total GPU Power : 359.01546838188807 W\n[codecarbon INFO @ 21:21:26] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:21:26] 0.002365 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:21:41] Energy consumed for RAM : 0.001378 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:21:41] Energy consumed for all GPUs : 0.004065 kWh. Total GPU Power : 616.9146793770267 W\n[codecarbon INFO @ 21:21:41] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:21:41] 0.005797 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:21:56] Energy consumed for RAM : 0.002066 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:21:56] Energy consumed for all GPUs : 0.006654 kWh. Total GPU Power : 621.6877665436252 W\n[codecarbon INFO @ 21:21:56] Energy consumed for all CPUs : 0.000532 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:21:56] 0.009251 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:22:11] Energy consumed for RAM : 0.002754 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:22:11] Energy consumed for all GPUs : 0.009260 kWh. Total GPU Power : 626.1437572465749 W\n[codecarbon INFO @ 21:22:11] Energy consumed for all CPUs : 0.000709 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:22:11] 0.012723 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:22:26] Energy consumed for RAM : 0.003443 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:22:26] Energy consumed for all GPUs : 0.011865 kWh. Total GPU Power : 625.3693802936192 W\n[codecarbon INFO @ 21:22:26] Energy consumed for all CPUs : 0.000886 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:22:26] 0.016193 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:22:41] Energy consumed for RAM : 0.004131 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:22:41] Energy consumed for all GPUs : 0.014488 kWh. Total GPU Power : 630.2419235639226 W\n[codecarbon INFO @ 21:22:41] Energy consumed for all CPUs : 0.001063 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:22:41] 0.019682 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:22:56] Energy consumed for RAM : 0.004819 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:22:56] Energy consumed for all GPUs : 0.017135 kWh. Total GPU Power : 635.8556506868297 W\n[codecarbon INFO @ 21:22:56] Energy consumed for all CPUs : 0.001240 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:22:56] 0.023194 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:23:11] Energy consumed for RAM : 0.005507 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:23:11] Energy consumed for all GPUs : 0.019738 kWh. Total GPU Power : 625.0758518089303 W\n[codecarbon INFO @ 21:23:11] Energy consumed for all CPUs : 0.001417 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:23:11] 0.026662 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:23:26] Energy consumed for RAM : 0.006195 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:23:26] Energy consumed for all GPUs : 0.022385 kWh. Total GPU Power : 636.0572579593729 W\n[codecarbon INFO @ 21:23:26] Energy consumed for all CPUs : 0.001594 kWh. 
Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:23:26] 0.030175 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:23:41] Energy consumed for RAM : 0.006883 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:23:41] Energy consumed for all GPUs : 0.025031 kWh. Total GPU Power : 635.4132918961806 W\n[codecarbon INFO @ 21:23:41] Energy consumed for all CPUs : 0.001771 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:23:41] 0.033685 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:23:56] Energy consumed for RAM : 0.007572 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:23:56] Energy consumed for all GPUs : 0.027661 kWh. Total GPU Power : 631.8222916777424 W\n[codecarbon INFO @ 21:23:56] Energy consumed for all CPUs : 0.001948 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:23:56] 0.037180 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:24:11] Energy consumed for RAM : 0.008260 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:24:11] Energy consumed for all GPUs : 0.030315 kWh. Total GPU Power : 637.844758085687 W\n[codecarbon INFO @ 21:24:11] Energy consumed for all CPUs : 0.002125 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:24:11] 0.040701 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:24:26] Energy consumed for RAM : 0.008948 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:24:26] Energy consumed for all GPUs : 0.032970 kWh. Total GPU Power : 637.7063667069607 W\n[codecarbon INFO @ 21:24:26] Energy consumed for all CPUs : 0.002302 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:24:26] 0.044220 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:24:41] Energy consumed for RAM : 0.009636 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:24:41] Energy consumed for all GPUs : 0.035629 kWh. Total GPU Power : 638.7595521159491 W\n[codecarbon INFO @ 21:24:41] Energy consumed for all CPUs : 0.002479 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:24:41] 0.047744 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:24:56] Energy consumed for RAM : 0.010324 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:24:56] Energy consumed for all GPUs : 0.038234 kWh. Total GPU Power : 626.0118880295652 W\n[codecarbon INFO @ 21:24:56] Energy consumed for all CPUs : 0.002657 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:24:56] 0.051214 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:25:11] Energy consumed for RAM : 0.011012 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:25:11] Energy consumed for all GPUs : 0.040892 kWh. Total GPU Power : 638.4170631771941 W\n[codecarbon INFO @ 21:25:11] Energy consumed for all CPUs : 0.002834 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:25:11] 0.054738 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:25:26] Energy consumed for RAM : 0.011700 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:25:26] Energy consumed for all GPUs : 0.043524 kWh. Total GPU Power : 632.34394576946 W\n[codecarbon INFO @ 21:25:26] Energy consumed for all CPUs : 0.003011 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:25:26] 0.058235 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:25:41] Energy consumed for RAM : 0.012388 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:25:41] Energy consumed for all GPUs : 0.046182 kWh. 
Total GPU Power : 638.4662389546352 W\n[codecarbon INFO @ 21:25:41] Energy consumed for all CPUs : 0.003188 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:25:41] 0.061758 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:25:56] Energy consumed for RAM : 0.013076 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:25:56] Energy consumed for all GPUs : 0.048838 kWh. Total GPU Power : 638.0871021853263 W\n[codecarbon INFO @ 21:25:56] Energy consumed for all CPUs : 0.003365 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:25:56] 0.065279 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:26:11] Energy consumed for RAM : 0.013765 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:26:11] Energy consumed for all GPUs : 0.051499 kWh. Total GPU Power : 639.0983849678707 W\n[codecarbon INFO @ 21:26:11] Energy consumed for all CPUs : 0.003542 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:26:11] 0.068806 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:26:26] Energy consumed for RAM : 0.014453 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:26:26] Energy consumed for all GPUs : 0.054132 kWh. Total GPU Power : 632.549674567773 W\n[codecarbon INFO @ 21:26:26] Energy consumed for all CPUs : 0.003719 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:26:26] 0.072304 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:26:41] Energy consumed for RAM : 0.015141 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:26:41] Energy consumed for all GPUs : 0.056768 kWh. Total GPU Power : 633.3096652159345 W\n[codecarbon INFO @ 21:26:41] Energy consumed for all CPUs : 0.003896 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:26:41] 0.075804 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:26:56] Energy consumed for RAM : 0.015829 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:26:56] Energy consumed for all GPUs : 0.059404 kWh. Total GPU Power : 633.339846576491 W\n[codecarbon INFO @ 21:26:56] Energy consumed for all CPUs : 0.004073 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:26:56] 0.079306 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:27:11] Energy consumed for RAM : 0.016517 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:27:11] Energy consumed for all GPUs : 0.062068 kWh. Total GPU Power : 639.9100492849137 W\n[codecarbon INFO @ 21:27:11] Energy consumed for all CPUs : 0.004250 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:27:11] 0.082835 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:27:26] Energy consumed for RAM : 0.017205 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:27:26] Energy consumed for all GPUs : 0.064726 kWh. Total GPU Power : 638.6437092393893 W\n[codecarbon INFO @ 21:27:26] Energy consumed for all CPUs : 0.004427 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:27:26] 0.086359 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:27:41] Energy consumed for RAM : 0.017893 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:27:41] Energy consumed for all GPUs : 0.067388 kWh. Total GPU Power : 639.3487979354586 W\n[codecarbon INFO @ 21:27:41] Energy consumed for all CPUs : 0.004604 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:27:41] 0.089885 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:27:56] Energy consumed for RAM : 0.018581 kWh. 
RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:27:56] Energy consumed for all GPUs : 0.070026 kWh. Total GPU Power : 633.6884387646057 W\n[codecarbon INFO @ 21:27:56] Energy consumed for all CPUs : 0.004781 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:27:56] 0.093389 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:28:11] Energy consumed for RAM : 0.019269 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:28:11] Energy consumed for all GPUs : 0.072687 kWh. Total GPU Power : 639.4422525221754 W\n[codecarbon INFO @ 21:28:11] Energy consumed for all CPUs : 0.004958 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:28:11] 0.096915 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:28:26] Energy consumed for RAM : 0.019958 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:28:26] Energy consumed for all GPUs : 0.075296 kWh. Total GPU Power : 626.9464464111006 W\n[codecarbon INFO @ 21:28:26] Energy consumed for all CPUs : 0.005135 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:28:26] 0.100390 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:28:41] Energy consumed for RAM : 0.020646 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:28:41] Energy consumed for all GPUs : 0.077962 kWh. Total GPU Power : 640.3962575270206 W\n[codecarbon INFO @ 21:28:41] Energy consumed for all CPUs : 0.005313 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:28:41] 0.103921 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:28:56] Energy consumed for RAM : 0.021334 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:28:56] Energy consumed for all GPUs : 0.080619 kWh. Total GPU Power : 638.3087387539953 W\n[codecarbon INFO @ 21:28:56] Energy consumed for all CPUs : 0.005490 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:28:56] 0.107443 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:29:11] Energy consumed for RAM : 0.022022 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:29:11] Energy consumed for all GPUs : 0.083270 kWh. Total GPU Power : 636.8708359764104 W\n[codecarbon INFO @ 21:29:11] Energy consumed for all CPUs : 0.005667 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:29:11] 0.110959 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:29:26] Energy consumed for RAM : 0.022710 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:29:26] Energy consumed for all GPUs : 0.085893 kWh. Total GPU Power : 630.0796388169725 W\n[codecarbon INFO @ 21:29:26] Energy consumed for all CPUs : 0.005844 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:29:26] 0.114447 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:29:41] Energy consumed for RAM : 0.023398 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:29:41] Energy consumed for all GPUs : 0.088548 kWh. Total GPU Power : 637.7758378447022 W\n[codecarbon INFO @ 21:29:41] Energy consumed for all CPUs : 0.006021 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:29:41] 0.117968 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:29:56] Energy consumed for RAM : 0.024087 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:29:56] Energy consumed for all GPUs : 0.091193 kWh. Total GPU Power : 634.8550146720521 W\n[codecarbon INFO @ 21:29:56] Energy consumed for all CPUs : 0.006198 kWh. 
Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:29:56] 0.121478 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:30:11] Energy consumed for RAM : 0.024775 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:30:11] Energy consumed for all GPUs : 0.093817 kWh. Total GPU Power : 630.5186457226341 W\n[codecarbon INFO @ 21:30:11] Energy consumed for all CPUs : 0.006375 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:30:11] 0.124967 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:30:26] Energy consumed for RAM : 0.025463 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:30:26] Energy consumed for all GPUs : 0.096471 kWh. Total GPU Power : 637.5849420686613 W\n[codecarbon INFO @ 21:30:26] Energy consumed for all CPUs : 0.006552 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:30:26] 0.128486 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:30:41] Energy consumed for RAM : 0.026151 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:30:41] Energy consumed for all GPUs : 0.099109 kWh. Total GPU Power : 633.6189362439791 W\n[codecarbon INFO @ 21:30:41] Energy consumed for all CPUs : 0.006729 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:30:41] 0.131990 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:30:56] Energy consumed for RAM : 0.026839 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:30:56] Energy consumed for all GPUs : 0.101776 kWh. Total GPU Power : 640.6257944471723 W\n[codecarbon INFO @ 21:30:56] Energy consumed for all CPUs : 0.006906 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:30:56] 0.135522 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:31:11] Energy consumed for RAM : 0.027528 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:31:11] Energy consumed for all GPUs : 0.104439 kWh. Total GPU Power : 639.6643020904513 W\n[codecarbon INFO @ 21:31:11] Energy consumed for all CPUs : 0.007083 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:31:11] 0.139050 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:31:26] Energy consumed for RAM : 0.028216 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:31:26] Energy consumed for all GPUs : 0.107104 kWh. Total GPU Power : 640.0939172348444 W\n[codecarbon INFO @ 21:31:26] Energy consumed for all CPUs : 0.007260 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:31:26] 0.142580 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:31:41] Energy consumed for RAM : 0.028904 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:31:41] Energy consumed for all GPUs : 0.109747 kWh. Total GPU Power : 634.9935430223439 W\n[codecarbon INFO @ 21:31:41] Energy consumed for all CPUs : 0.007438 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:31:41] 0.146088 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:31:56] Energy consumed for RAM : 0.029592 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:31:56] Energy consumed for all GPUs : 0.112378 kWh. Total GPU Power : 632.1091666419065 W\n[codecarbon INFO @ 21:31:56] Energy consumed for all CPUs : 0.007615 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:31:56] 0.149585 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:32:11] Energy consumed for RAM : 0.030281 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:32:11] Energy consumed for all GPUs : 0.115023 kWh. 
Total GPU Power : 633.8442942682154 W\n[codecarbon INFO @ 21:32:11] Energy consumed for all CPUs : 0.007792 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:32:11] 0.153096 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:32:26] Energy consumed for RAM : 0.030968 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:32:26] Energy consumed for all GPUs : 0.117663 kWh. Total GPU Power : 635.0998743598051 W\n[codecarbon INFO @ 21:32:26] Energy consumed for all CPUs : 0.007969 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:32:26] 0.156599 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:32:41] Energy consumed for RAM : 0.031656 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:32:41] Energy consumed for all GPUs : 0.120332 kWh. Total GPU Power : 641.1040333084177 W\n[codecarbon INFO @ 21:32:41] Energy consumed for all CPUs : 0.008146 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:32:41] 0.160134 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:32:56] Energy consumed for RAM : 0.032344 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:32:56] Energy consumed for all GPUs : 0.122995 kWh. Total GPU Power : 639.7462391288783 W\n[codecarbon INFO @ 21:32:56] Energy consumed for all CPUs : 0.008323 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:32:56] 0.163662 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:33:11] Energy consumed for RAM : 0.033033 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:33:11] Energy consumed for all GPUs : 0.125648 kWh. Total GPU Power : 637.3370633888644 W\n[codecarbon INFO @ 21:33:11] Energy consumed for all CPUs : 0.008500 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:33:11] 0.167180 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:33:26] Energy consumed for RAM : 0.033721 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:33:26] Energy consumed for all GPUs : 0.128260 kWh. Total GPU Power : 627.497349520734 W\n[codecarbon INFO @ 21:33:26] Energy consumed for all CPUs : 0.008677 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:33:26] 0.170658 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:33:41] Energy consumed for RAM : 0.034409 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:33:41] Energy consumed for all GPUs : 0.130922 kWh. Total GPU Power : 639.378459827986 W\n[codecarbon INFO @ 21:33:41] Energy consumed for all CPUs : 0.008854 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:33:41] 0.174185 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:33:56] Energy consumed for RAM : 0.035097 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:33:56] Energy consumed for all GPUs : 0.133563 kWh. Total GPU Power : 634.2779963263187 W\n[codecarbon INFO @ 21:33:56] Energy consumed for all CPUs : 0.009031 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:33:56] 0.177692 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:34:11] Energy consumed for RAM : 0.035785 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:34:11] Energy consumed for all GPUs : 0.136211 kWh. Total GPU Power : 636.088462236655 W\n[codecarbon INFO @ 21:34:11] Energy consumed for all CPUs : 0.009208 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:34:11] 0.181205 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:34:26] Energy consumed for RAM : 0.036474 kWh. 
RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:34:26] Energy consumed for all GPUs : 0.138875 kWh. Total GPU Power : 639.8420566736949 W\n[codecarbon INFO @ 21:34:26] Energy consumed for all CPUs : 0.009385 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:34:26] 0.184734 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:34:41] Energy consumed for RAM : 0.037162 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:34:41] Energy consumed for all GPUs : 0.141537 kWh. Total GPU Power : 639.4628459940732 W\n[codecarbon INFO @ 21:34:41] Energy consumed for all CPUs : 0.009563 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:34:41] 0.188261 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:34:56] Energy consumed for RAM : 0.037850 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:34:56] Energy consumed for all GPUs : 0.144193 kWh. Total GPU Power : 638.284138549091 W\n[codecarbon INFO @ 21:34:56] Energy consumed for all CPUs : 0.009740 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:34:56] 0.191782 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:35:11] Energy consumed for RAM : 0.038538 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:35:11] Energy consumed for all GPUs : 0.146807 kWh. Total GPU Power : 627.9721129851367 W\n[codecarbon INFO @ 21:35:11] Energy consumed for all CPUs : 0.009917 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:35:11] 0.195262 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:35:26] Energy consumed for RAM : 0.039226 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:35:26] Energy consumed for all GPUs : 0.149465 kWh. Total GPU Power : 638.5284782703005 W\n[codecarbon INFO @ 21:35:26] Energy consumed for all CPUs : 0.010094 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:35:26] 0.198785 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:35:41] Energy consumed for RAM : 0.039914 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:35:41] Energy consumed for all GPUs : 0.152101 kWh. Total GPU Power : 633.1180716439897 W\n[codecarbon INFO @ 21:35:41] Energy consumed for all CPUs : 0.010271 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:35:41] 0.202286 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:35:56] Energy consumed for RAM : 0.040602 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:35:56] Energy consumed for all GPUs : 0.154764 kWh. Total GPU Power : 640.0670545574203 W\n[codecarbon INFO @ 21:35:56] Energy consumed for all CPUs : 0.010448 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:35:56] 0.205814 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:36:11] Energy consumed for RAM : 0.041290 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:36:11] Energy consumed for all GPUs : 0.157432 kWh. Total GPU Power : 640.6751187111053 W\n[codecarbon INFO @ 21:36:11] Energy consumed for all CPUs : 0.010625 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:36:11] 0.209347 kWh of electricity used since the beginning.\n[codecarbon INFO @ 21:36:26] Energy consumed for RAM : 0.041979 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 21:36:26] Energy consumed for all GPUs : 0.160090 kWh. Total GPU Power : 638.5720494734854 W\n[codecarbon INFO @ 21:36:26] Energy consumed for all CPUs : 0.010802 kWh. 
Total CPU Power : 42.5 W\n[codecarbon INFO @ 21:36:26] 0.212871 kWh of electricity used since the beginning.\n[codecarbon telemetry repeated every 15 s from 21:36:41 through 21:52:41 -- RAM Power : 165.33 W, Total CPU Power : 42.5 W, Total GPU Power : ~628-642 W; cumulative energy rose steadily to 0.441563 kWh of electricity used since the beginning]\n"
574
+ }
575
+ ],
576
+ "execution_count": 16,
577
+ "metadata": {
578
+ "gather": {
579
+ "logged": 1706476228988
580
+ }
581
+ }
582
+ },
583
+ {
584
+ "cell_type": "code",
585
+ "source": [
586
+ "wandb.finish()"
587
+ ],
588
+ "outputs": [],
589
+ "execution_count": null,
590
+ "metadata": {
591
+ "gather": {
592
+ "logged": 1706476229030
593
+ }
594
+ }
595
+ },
596
+ {
597
+ "cell_type": "code",
598
+ "source": [
599
+ "variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
600
+ "tokenizer._tokenizer.save(\"tokenizer.json\")\n",
601
+ "tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
602
+ "sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
603
+ "\n",
604
+ "model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
605
+ " variant=variant,\n",
606
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,})\")"
607
+ ],
608
+ "outputs": [],
609
+ "execution_count": null,
610
+ "metadata": {
611
+ "gather": {
612
+ "logged": 1706476229038
613
+ }
614
+ }
615
+ }
616
+ ],
617
+ "metadata": {
618
+ "datalore": {
619
+ "base_environment": "default",
620
+ "computation_mode": "JUPYTER",
621
+ "package_manager": "pip",
622
+ "packages": [
623
+ {
624
+ "name": "datasets",
625
+ "source": "PIP",
626
+ "version": "2.16.1"
627
+ },
628
+ {
629
+ "name": "torch",
630
+ "source": "PIP",
631
+ "version": "2.1.2"
632
+ },
633
+ {
634
+ "name": "accelerate",
635
+ "source": "PIP",
636
+ "version": "0.26.1"
637
+ }
638
+ ],
639
+ "report_row_ids": [
640
+ "un8W7ez7ZwoGb5Co6nydEV",
641
+ "40nN9Hvgi1clHNV5RAemI5",
642
+ "TgRD90H5NSPpKS41OeXI1w",
643
+ "ZOm5BfUs3h1EGLaUkBGeEB",
644
+ "kOP0CZWNSk6vqE3wkPp7Vc",
645
+ "W4PWcOu2O2pRaZyoE2W80h",
646
+ "RolbOnQLIftk0vy9mIcz5M",
647
+ "8OPhUgbaNJmOdiq5D3a6vK",
648
+ "5Qrt3jSvSrpK6Ne1hS6shL",
649
+ "hTq7nFUrovN5Ao4u6dIYWZ",
650
+ "I8WNZLpJ1DVP2wiCW7YBIB",
651
+ "SawhU3I9BewSE1XBPstpNJ",
652
+ "80EtLEl2FIE4FqbWnUD3nT"
653
+ ],
654
+ "version": 3
655
+ },
656
+ "kernelspec": {
657
+ "name": "python38-azureml-pt-tf",
658
+ "language": "python",
659
+ "display_name": "Python 3.8 - Pytorch and Tensorflow"
660
+ },
661
+ "language_info": {
662
+ "name": "python",
663
+ "version": "3.8.5",
664
+ "mimetype": "text/x-python",
665
+ "codemirror_mode": {
666
+ "name": "ipython",
667
+ "version": 3
668
+ },
669
+ "pygments_lexer": "ipython3",
670
+ "nbconvert_exporter": "python",
671
+ "file_extension": ".py"
672
+ },
673
+ "microsoft": {
674
+ "host": {
675
+ "AzureML": {
676
+ "notebookHasBeenCompleted": true
677
+ }
678
+ },
679
+ "ms_spell_check": {
680
+ "ms_spell_check_language": "en"
681
+ }
682
+ },
683
+ "nteract": {
684
+ "version": "nteract-front-end@1.0.0"
685
+ },
686
+ "kernel_info": {
687
+ "name": "python38-azureml-pt-tf"
688
+ }
689
+ },
690
+ "nbformat": 4,
691
+ "nbformat_minor": 4
692
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-3-12-1Z.ipynb ADDED
@@ -0,0 +1,1053 @@
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
7
+ "\n",
8
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
9
+ ],
10
+ "metadata": {
11
+ "collapsed": false
12
+ }
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "source": [
17
+ "%pip install accelerate -U"
18
+ ],
19
+ "outputs": [
20
+ {
21
+ "output_type": "stream",
22
+ "name": "stdout",
23
+ "text": "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\nRequirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\nRequirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\nRequirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\nRequirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\nRequirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\nRequirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\nRequirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\nNote: you may need to restart the kernel to use updated packages.\n"
24
+ }
25
+ ],
26
+ "execution_count": 1,
27
+ "metadata": {
28
+ "jupyter": {
29
+ "source_hidden": false,
30
+ "outputs_hidden": false
31
+ },
32
+ "nteract": {
33
+ "transient": {
34
+ "deleting": false
35
+ }
36
+ }
37
+ }
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "source": [
42
+ "%pip install transformers datasets shap watermark wandb"
43
+ ],
44
+ "outputs": [
45
+ {
46
+ "output_type": "stream",
47
+ "name": "stderr",
48
+ "text": "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
49
+ },
50
+ {
51
+ "output_type": "stream",
52
+ "name": "stdout",
53
+ "text": "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\nRequirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\nRequirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\nRequirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\nCollecting wandb\n Using cached wandb-0.16.2-py3-none-any.whl (2.2 MB)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\nRequirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\nRequirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\nRequirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\nRequirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\nRequirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\nRequirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\nRequirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\nRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\nRequirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\nRequirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\nRequirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\nRequirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\nRequirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\nRequirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\nRequirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\nRequirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\nRequirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) 
(1.2.2)\nRequirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\nRequirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\nRequirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\nRequirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\nCollecting sentry-sdk>=1.0.0\n Using cached sentry_sdk-1.39.2-py2.py3-none-any.whl (254 kB)\nRequirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\nCollecting docker-pycreds>=0.4.0\n Using cached docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\nRequirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\nCollecting setproctitle\n Using cached setproctitle-1.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31 kB)\nCollecting appdirs>=1.4.3\n Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\nRequirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\nRequirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\nRequirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\nRequirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\nRequirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\nRequirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\nRequirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\nRequirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\nRequirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\nRequirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\nRequirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\nRequirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\nRequirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\nRequirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\nRequirement already 
satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\nRequirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\nRequirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\nRequirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\nRequirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\nRequirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\nRequirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\nRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\nRequirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: python-dateutil>=2.8.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\nRequirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\nRequirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\nRequirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\nRequirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\nRequirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\nRequirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\nRequirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) 
(2.2.1)\nRequirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\nInstalling collected packages: appdirs, setproctitle, sentry-sdk, docker-pycreds, wandb\nSuccessfully installed appdirs-1.4.4 docker-pycreds-0.4.0 sentry-sdk-1.39.2 setproctitle-1.3.3 wandb-0.16.2\nNote: you may need to restart the kernel to use updated packages.\n"
54
+ }
55
+ ],
56
+ "execution_count": 17,
57
+ "metadata": {
58
+ "jupyter": {
59
+ "source_hidden": false,
60
+ "outputs_hidden": false
61
+ },
62
+ "nteract": {
63
+ "transient": {
64
+ "deleting": false
65
+ }
66
+ }
67
+ }
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "source": [
72
+ "import pandas as pd\n",
73
+ "import numpy as np\n",
74
+ "import torch\n",
75
+ "import os\n",
76
+ "from typing import List\n",
77
+ "from sklearn.metrics import f1_score, accuracy_score, classification_report\n",
78
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline\n",
79
+ "from datasets import load_dataset\n",
80
+ "import shap\n",
81
+ "\n",
82
+ "%load_ext watermark"
83
+ ],
84
+ "outputs": [
85
+ {
86
+ "output_type": "stream",
87
+ "name": "stderr",
88
+ "text": "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\n2024-01-28 02:27:28.730200: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\nTo enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n2024-01-28 02:27:29.708865: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n2024-01-28 02:27:29.708983: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n2024-01-28 02:27:29.708996: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
89
+ }
90
+ ],
91
+ "execution_count": 3,
92
+ "metadata": {
93
+ "datalore": {
94
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
95
+ "type": "CODE",
96
+ "hide_input_from_viewers": false,
97
+ "hide_output_from_viewers": false,
98
+ "report_properties": {
99
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
100
+ }
101
+ },
102
+ "gather": {
103
+ "logged": 1706408851775
104
+ }
105
+ }
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "source": [
110
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
111
+ "\n",
112
+ "SEED: int = 42\n",
113
+ "\n",
114
+ "BATCH_SIZE: int = 8\n",
115
+ "EPOCHS: int = 1\n",
116
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
117
+ "\n",
118
+ "CLASS_NAMES: List[str] = [\"DIED\",\n",
119
+ " \"ER_VISIT\",\n",
120
+ " \"HOSPITAL\",\n",
121
+ " \"OFC_VISIT\",\n",
122
+ " \"X_STAY\",\n",
123
+ " \"DISABLE\",\n",
124
+ " \"D_PRESENTED\"]\n",
125
+ "\n",
126
+ "# WandB configuration\n",
127
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA model training\" # name your W&B project\n",
128
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints"
129
+ ],
130
+ "outputs": [],
131
+ "execution_count": 4,
132
+ "metadata": {
133
+ "collapsed": false,
134
+ "gather": {
135
+ "logged": 1706408852045
136
+ }
137
+ }
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "source": [
142
+ "%watermark --iversion"
143
+ ],
144
+ "outputs": [
145
+ {
146
+ "output_type": "stream",
147
+ "name": "stdout",
148
+ "text": "re : 2.2.1\nnumpy : 1.23.5\nlogging: 0.5.1.2\npandas : 2.0.2\ntorch : 1.12.0\nshap : 0.44.1\n\n"
149
+ }
150
+ ],
151
+ "execution_count": 5,
152
+ "metadata": {
153
+ "collapsed": false
154
+ }
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "source": [
159
+ "!nvidia-smi"
160
+ ],
161
+ "outputs": [
162
+ {
163
+ "output_type": "stream",
164
+ "name": "stdout",
165
+ "text": "Sun Jan 28 02:27:31 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. |\r\n|=========================================+======================+======================|\r\n| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n| N/A 28C P0 37W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n| N/A 27C P0 36W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n"
166
+ }
167
+ ],
168
+ "execution_count": 6,
169
+ "metadata": {
170
+ "datalore": {
171
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
172
+ "type": "CODE",
173
+ "hide_input_from_viewers": true,
174
+ "hide_output_from_viewers": true
175
+ }
176
+ }
177
+ },
178
+ {
179
+ "attachments": {},
180
+ "cell_type": "markdown",
181
+ "source": [
182
+ "## Loading the data set"
183
+ ],
184
+ "metadata": {
185
+ "datalore": {
186
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
187
+ "type": "MD",
188
+ "hide_input_from_viewers": false,
189
+ "hide_output_from_viewers": false,
190
+ "report_properties": {
191
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
192
+ }
193
+ }
194
+ }
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "source": [
199
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
200
+ ],
201
+ "outputs": [],
202
+ "execution_count": 7,
203
+ "metadata": {
204
+ "collapsed": false,
205
+ "gather": {
206
+ "logged": 1706408853264
207
+ }
208
+ }
209
+ },
210
+ {
211
+ "cell_type": "markdown",
212
+ "source": [
213
+ "### Tokenisation and encoding"
214
+ ],
215
+ "metadata": {
216
+ "collapsed": false
217
+ }
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "source": [
222
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
223
+ ],
224
+ "outputs": [],
225
+ "execution_count": 8,
226
+ "metadata": {
227
+ "datalore": {
228
+ "node_id": "I7n646PIscsUZRoHu6m7zm",
229
+ "type": "CODE",
230
+ "hide_input_from_viewers": true,
231
+ "hide_output_from_viewers": true
232
+ },
233
+ "gather": {
234
+ "logged": 1706408853475
235
+ }
236
+ }
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "source": [
241
+ "def tokenize_and_encode(examples):\n",
242
+ " return tokenizer(examples[\"text\"], truncation=True)"
243
+ ],
244
+ "outputs": [],
245
+ "execution_count": 9,
246
+ "metadata": {
247
+ "datalore": {
248
+ "node_id": "QBLOSI0yVIslV7v7qX9ZC3",
249
+ "type": "CODE",
250
+ "hide_input_from_viewers": true,
251
+ "hide_output_from_viewers": true
252
+ },
253
+ "gather": {
254
+ "logged": 1706408853684
255
+ }
256
+ }
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "source": [
261
+ "cols = dataset[\"train\"].column_names\n",
262
+ "cols.remove(\"labels\")\n",
263
+ "ds_enc = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)"
264
+ ],
265
+ "outputs": [
266
+ {
267
+ "output_type": "stream",
268
+ "name": "stderr",
269
+ "text": "Map: 100%|██████████| 15786/15786 [00:01<00:00, 10990.82 examples/s]\n"
270
+ }
271
+ ],
272
+ "execution_count": 10,
273
+ "metadata": {
274
+ "datalore": {
275
+ "node_id": "slHeNysZOX9uWS9PB7jFDb",
276
+ "type": "CODE",
277
+ "hide_input_from_viewers": true,
278
+ "hide_output_from_viewers": true
279
+ },
280
+ "gather": {
281
+ "logged": 1706408854738
282
+ }
283
+ }
284
+ },
285
+ {
286
+ "cell_type": "markdown",
287
+ "source": [
288
+ "### Training"
289
+ ],
290
+ "metadata": {
291
+ "collapsed": false
292
+ }
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "source": [
297
+ "class MultiLabelTrainer(Trainer):\n",
298
+ " def compute_loss(self, model, inputs, return_outputs=False):\n",
299
+ " labels = inputs.pop(\"labels\")\n",
300
+ " outputs = model(**inputs)\n",
301
+ " logits = outputs.logits\n",
302
+ " loss_fct = torch.nn.BCEWithLogitsLoss()\n",
303
+ " loss = loss_fct(logits.view(-1, self.model.config.num_labels),\n",
304
+ " labels.float().view(-1, self.model.config.num_labels))\n",
305
+ " return (loss, outputs) if return_outputs else loss"
306
+ ],
307
+ "outputs": [],
308
+ "execution_count": 11,
309
+ "metadata": {
310
+ "datalore": {
311
+ "node_id": "itXWkbDw9sqbkMuDP84QoT",
312
+ "type": "CODE",
313
+ "hide_input_from_viewers": true,
314
+ "hide_output_from_viewers": true
315
+ },
316
+ "gather": {
317
+ "logged": 1706408854925
318
+ }
319
+ }
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "source": [
324
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(CLASS_NAMES)).to(\"cuda\")"
325
+ ],
326
+ "outputs": [
327
+ {
328
+ "output_type": "stream",
329
+ "name": "stderr",
330
+ "text": "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
331
+ }
332
+ ],
333
+ "execution_count": 12,
334
+ "metadata": {
335
+ "datalore": {
336
+ "node_id": "ZQU7aW6TV45VmhHOQRzcnF",
337
+ "type": "CODE",
338
+ "hide_input_from_viewers": true,
339
+ "hide_output_from_viewers": true
340
+ },
341
+ "gather": {
342
+ "logged": 1706408857008
343
+ }
344
+ }
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "source": [
349
+ "def accuracy_threshold(y_pred, y_true, threshold=.5, sigmoid=True):\n",
350
+ " y_pred = torch.from_numpy(y_pred)\n",
351
+ " y_true = torch.from_numpy(y_true)\n",
352
+ "\n",
353
+ " if sigmoid:\n",
354
+ " y_pred = y_pred.sigmoid()\n",
355
+ "\n",
356
+ " return ((y_pred > threshold) == y_true.bool()).float().mean().item()"
357
+ ],
358
+ "outputs": [],
359
+ "execution_count": 13,
360
+ "metadata": {
361
+ "datalore": {
362
+ "node_id": "swhgyyyxoGL8HjnXJtMuSW",
363
+ "type": "CODE",
364
+ "hide_input_from_viewers": true,
365
+ "hide_output_from_viewers": true
366
+ },
367
+ "gather": {
368
+ "logged": 1706408857297
369
+ }
370
+ }
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "source": [
375
+ "def compute_metrics(eval_pred):\n",
376
+ " predictions, labels = eval_pred\n",
377
+ " return {'accuracy_thresh': accuracy_threshold(predictions, labels)}"
378
+ ],
379
+ "outputs": [],
380
+ "execution_count": 14,
381
+ "metadata": {
382
+ "datalore": {
383
+ "node_id": "1Uq3HtkaBxtHNAnSwit5cI",
384
+ "type": "CODE",
385
+ "hide_input_from_viewers": true,
386
+ "hide_output_from_viewers": true
387
+ },
388
+ "gather": {
389
+ "logged": 1706408857499
390
+ }
391
+ }
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "source": [
396
+ "args = TrainingArguments(\n",
397
+ " output_dir=\"vaers\",\n",
398
+ " evaluation_strategy=\"epoch\",\n",
399
+ " learning_rate=2e-5,\n",
400
+ " per_device_train_batch_size=BATCH_SIZE,\n",
401
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
402
+ " num_train_epochs=EPOCHS,\n",
403
+ " weight_decay=.01,\n",
404
+ " report_to=[\"wandb\"]\n",
405
+ ")"
406
+ ],
407
+ "outputs": [],
408
+ "execution_count": 15,
409
+ "metadata": {
410
+ "datalore": {
411
+ "node_id": "1iPZOTKPwSkTgX5dORqT89",
412
+ "type": "CODE",
413
+ "hide_input_from_viewers": true,
414
+ "hide_output_from_viewers": true
415
+ },
416
+ "gather": {
417
+ "logged": 1706408857680
418
+ }
419
+ }
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "source": [
424
+ "multi_label_trainer = MultiLabelTrainer(\n",
425
+ " model, \n",
426
+ " args, \n",
427
+ " train_dataset=ds_enc[\"train\"], \n",
428
+ " eval_dataset=ds_enc[\"test\"], \n",
429
+ " compute_metrics=compute_metrics, \n",
430
+ " tokenizer=tokenizer\n",
431
+ ")"
432
+ ],
433
+ "outputs": [
434
+ {
435
+ "output_type": "stream",
436
+ "name": "stderr",
437
+ "text": "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
438
+ }
439
+ ],
440
+ "execution_count": 18,
441
+ "metadata": {
442
+ "datalore": {
443
+ "node_id": "bnRkNvRYltLun6gCEgL7v0",
444
+ "type": "CODE",
445
+ "hide_input_from_viewers": true,
446
+ "hide_output_from_viewers": true
447
+ },
448
+ "gather": {
449
+ "logged": 1706408895305
450
+ }
451
+ }
452
+ },
453
+ {
454
+ "cell_type": "code",
455
+ "source": [
456
+ "multi_label_trainer.evaluate()"
457
+ ],
458
+ "outputs": [
459
+ {
460
+ "output_type": "display_data",
461
+ "data": {
462
+ "text/plain": "<IPython.core.display.HTML object>",
463
+ "text/html": "\n <div>\n \n <progress value='1974' max='987' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [987/987 21:41]\n </div>\n "
464
+ },
465
+ "metadata": {}
466
+ },
467
+ {
468
+ "output_type": "stream",
469
+ "name": "stderr",
470
+ "text": "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchrisvoncsefalvay\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\nhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
471
+ },
472
+ {
473
+ "output_type": "display_data",
474
+ "data": {
475
+ "text/plain": "<IPython.core.display.HTML object>",
476
+ "text/html": "Tracking run with wandb version 0.16.2"
477
+ },
478
+ "metadata": {}
479
+ },
480
+ {
481
+ "output_type": "display_data",
482
+ "data": {
483
+ "text/plain": "<IPython.core.display.HTML object>",
484
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_022947-hh1sxw9i</code>"
485
+ },
486
+ "metadata": {}
487
+ },
488
+ {
489
+ "output_type": "display_data",
490
+ "data": {
491
+ "text/plain": "<IPython.core.display.HTML object>",
492
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/hh1sxw9i' target=\"_blank\">icy-firebrand-1</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
493
+ },
494
+ "metadata": {}
495
+ },
496
+ {
497
+ "output_type": "display_data",
498
+ "data": {
499
+ "text/plain": "<IPython.core.display.HTML object>",
500
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>"
501
+ },
502
+ "metadata": {}
503
+ },
504
+ {
505
+ "output_type": "display_data",
506
+ "data": {
507
+ "text/plain": "<IPython.core.display.HTML object>",
508
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/hh1sxw9i' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/hh1sxw9i</a>"
509
+ },
510
+ "metadata": {}
511
+ },
512
+ {
513
+ "output_type": "execute_result",
514
+ "execution_count": 19,
515
+ "data": {
516
+ "text/plain": "{'eval_loss': 0.7153111100196838,\n 'eval_accuracy_thresh': 0.2938227355480194,\n 'eval_runtime': 82.3613,\n 'eval_samples_per_second': 191.668,\n 'eval_steps_per_second': 11.984}"
517
+ },
518
+ "metadata": {}
519
+ }
520
+ ],
521
+ "execution_count": 19,
522
+ "metadata": {
523
+ "datalore": {
524
+ "node_id": "LO54PlDkWQdFrzV25FvduB",
525
+ "type": "CODE",
526
+ "hide_input_from_viewers": true,
527
+ "hide_output_from_viewers": true
528
+ },
529
+ "gather": {
530
+ "logged": 1706408991752
531
+ }
532
+ }
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "source": [
537
+ "multi_label_trainer.train()"
538
+ ],
539
+ "outputs": [
540
+ {
541
+ "output_type": "display_data",
542
+ "data": {
543
+ "text/plain": "<IPython.core.display.HTML object>",
544
+ "text/html": "\n <div>\n \n <progress value='4605' max='4605' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [4605/4605 20:25, Epoch 1/1]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n <th>Accuracy Thresh</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>1</td>\n <td>0.086700</td>\n <td>0.093388</td>\n <td>0.962897</td>\n </tr>\n </tbody>\n</table><p>"
545
+ },
546
+ "metadata": {}
547
+ },
548
+ {
549
+ "output_type": "stream",
550
+ "name": "stderr",
551
+ "text": "Checkpoint destination directory vaers/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-500)... Done. 15.9s\nCheckpoint destination directory vaers/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-1000)... Done. 12.5s\nCheckpoint destination directory vaers/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-1500)... Done. 21.9s\nCheckpoint destination directory vaers/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-2000)... Done. 13.8s\nCheckpoint destination directory vaers/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-2500)... Done. 15.7s\nCheckpoint destination directory vaers/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-3000)... Done. 21.7s\nCheckpoint destination directory vaers/checkpoint-3500 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-3500)... Done. 10.6s\nCheckpoint destination directory vaers/checkpoint-4000 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-4000)... Done. 15.0s\nCheckpoint destination directory vaers/checkpoint-4500 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n\u001b[34m\u001b[1mwandb\u001b[0m: Adding directory to artifact (./vaers/checkpoint-4500)... Done. 16.7s\n"
552
+ },
553
+ {
554
+ "output_type": "execute_result",
555
+ "execution_count": 21,
556
+ "data": {
557
+ "text/plain": "TrainOutput(global_step=4605, training_loss=0.09062977189220382, metrics={'train_runtime': 1223.2444, 'train_samples_per_second': 60.223, 'train_steps_per_second': 3.765, 'total_flos': 9346797199425174.0, 'train_loss': 0.09062977189220382, 'epoch': 1.0})"
558
+ },
559
+ "metadata": {}
560
+ }
561
+ ],
562
+ "execution_count": 21,
563
+ "metadata": {
564
+ "datalore": {
565
+ "node_id": "hf0Ei1QXEYDmBv1VNLZ4Zw",
566
+ "type": "CODE",
567
+ "hide_input_from_viewers": true,
568
+ "hide_output_from_viewers": true
569
+ },
570
+ "gather": {
571
+ "logged": 1706411445752
572
+ }
573
+ }
574
+ },
575
+ {
576
+ "cell_type": "markdown",
577
+ "source": [
578
+ "### Evaluation"
579
+ ],
580
+ "metadata": {
581
+ "collapsed": false
582
+ }
583
+ },
584
+ {
585
+ "cell_type": "markdown",
586
+ "source": [
587
+ "We instantiate a classifier `pipeline` and push it to CUDA."
588
+ ],
589
+ "metadata": {
590
+ "collapsed": false
591
+ }
592
+ },
593
+ {
594
+ "cell_type": "code",
595
+ "source": [
596
+ "classifier = pipeline(\"text-classification\", \n",
597
+ " model, \n",
598
+ " tokenizer=tokenizer, \n",
599
+ " device=\"cuda:0\")"
600
+ ],
601
+ "outputs": [],
602
+ "execution_count": 24,
603
+ "metadata": {
604
+ "datalore": {
605
+ "node_id": "kHoUdBeqcyVXDSGv54C4aE",
606
+ "type": "CODE",
607
+ "hide_input_from_viewers": true,
608
+ "hide_output_from_viewers": true
609
+ },
610
+ "gather": {
611
+ "logged": 1706411459928
612
+ }
613
+ }
614
+ },
615
+ {
616
+ "cell_type": "markdown",
617
+ "source": [
618
+ "We use the same tokenizer used for training to tokenize/encode the validation set."
619
+ ],
620
+ "metadata": {
621
+ "collapsed": false
622
+ }
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "source": [
627
+ "test_encodings = tokenizer.batch_encode_plus(dataset[\"val\"][\"text\"], \n",
628
+ " max_length=255, \n",
629
+ " pad_to_max_length=True, \n",
630
+ " return_token_type_ids=True, \n",
631
+ " truncation=True)"
632
+ ],
633
+ "outputs": [
634
+ {
635
+ "output_type": "error",
636
+ "ename": "KeyError",
637
+ "evalue": "'validate'",
638
+ "traceback": [
639
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
640
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
641
+ "Cell \u001b[0;32mIn[25], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m test_encodings \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mbatch_encode_plus(\u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mvalidate\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m], \n\u001b[1;32m 2\u001b[0m max_length\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m255\u001b[39m, \n\u001b[1;32m 3\u001b[0m pad_to_max_length\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, \n\u001b[1;32m 4\u001b[0m return_token_type_ids\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, \n\u001b[1;32m 5\u001b[0m truncation\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
642
+ "File \u001b[0;32m/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/datasets/dataset_dict.py:74\u001b[0m, in \u001b[0;36mDatasetDict.__getitem__\u001b[0;34m(self, k)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, k) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dataset:\n\u001b[1;32m 73\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(k, (\u001b[38;5;28mstr\u001b[39m, NamedSplit)) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__getitem__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 76\u001b[0m available_suggested_splits \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 77\u001b[0m split \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m (Split\u001b[38;5;241m.\u001b[39mTRAIN, Split\u001b[38;5;241m.\u001b[39mTEST, Split\u001b[38;5;241m.\u001b[39mVALIDATION) \u001b[38;5;28;01mif\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n\u001b[1;32m 78\u001b[0m ]\n",
643
+ "\u001b[0;31mKeyError\u001b[0m: 'validate'"
644
+ ]
645
+ }
646
+ ],
647
+ "execution_count": 25,
648
+ "metadata": {
649
+ "datalore": {
650
+ "node_id": "Dr5WCWA6jL51NR1fSrQu6Z",
651
+ "type": "CODE",
652
+ "hide_input_from_viewers": true,
653
+ "hide_output_from_viewers": true
654
+ },
655
+ "gather": {
656
+ "logged": 1706411465538
657
+ }
658
+ }
659
+ },
660
+ {
661
+ "cell_type": "markdown",
662
+ "source": [
663
+ "Once we've made the data loadable by putting it into a `DataLoader`, we "
664
+ ],
665
+ "metadata": {
666
+ "collapsed": false
667
+ }
668
+ },
669
+ {
670
+ "cell_type": "code",
671
+ "source": [
672
+ "test_data = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), \n",
673
+ " torch.tensor(test_encodings['attention_mask']), \n",
674
+ " torch.tensor(ds_enc[\"validate\"][\"labels\"]), \n",
675
+ " torch.tensor(test_encodings['token_type_ids']))\n",
676
+ "test_dataloader = torch.utils.data.DataLoader(test_data, \n",
677
+ " sampler=torch.utils.data.SequentialSampler(test_data), \n",
678
+ " batch_size=BATCH_SIZE)"
679
+ ],
680
+ "outputs": [],
681
+ "execution_count": null,
682
+ "metadata": {
683
+ "datalore": {
684
+ "node_id": "MWfGq2tTkJNzFiDoUPq2X7",
685
+ "type": "CODE",
686
+ "hide_input_from_viewers": true,
687
+ "hide_output_from_viewers": true
688
+ },
689
+ "gather": {
690
+ "logged": 1706411446707
691
+ }
692
+ }
693
+ },
694
+ {
695
+ "cell_type": "code",
696
+ "source": [
697
+ "model.eval()\n",
698
+ "\n",
699
+ "logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []\n",
700
+ "\n",
701
+ "for i, batch in enumerate(test_dataloader):\n",
702
+ " batch = tuple(t.to(device) for t in batch)\n",
703
+ " # Unpack the inputs from our dataloader\n",
704
+ " b_input_ids, b_input_mask, b_labels, b_token_types = batch\n",
705
+ " \n",
706
+ " with torch.no_grad():\n",
707
+ " outs = model(b_input_ids, attention_mask=b_input_mask)\n",
708
+ " b_logit_pred = outs[0]\n",
709
+ " pred_label = torch.sigmoid(b_logit_pred)\n",
710
+ "\n",
711
+ " b_logit_pred = b_logit_pred.detach().cpu().numpy()\n",
712
+ " pred_label = pred_label.to('cpu').numpy()\n",
713
+ " b_labels = b_labels.to('cpu').numpy()\n",
714
+ "\n",
715
+ " tokenized_texts.append(b_input_ids)\n",
716
+ " logit_preds.append(b_logit_pred)\n",
717
+ " true_labels.append(b_labels)\n",
718
+ " pred_labels.append(pred_label)\n",
719
+ "\n",
720
+ "# Flatten outputs\n",
721
+ "tokenized_texts = [item for sublist in tokenized_texts for item in sublist]\n",
722
+ "pred_labels = [item for sublist in pred_labels for item in sublist]\n",
723
+ "true_labels = [item for sublist in true_labels for item in sublist]\n",
724
+ "\n",
725
+ "# Converting flattened binary values to boolean values\n",
726
+ "true_bools = [tl == 1 for tl in true_labels]\n",
727
+ "pred_bools = [pl > 0.50 for pl in pred_labels] "
728
+ ],
729
+ "outputs": [],
730
+ "execution_count": null,
731
+ "metadata": {
732
+ "datalore": {
733
+ "node_id": "1SJCSrQTRCexFCNCIyRrzL",
734
+ "type": "CODE",
735
+ "hide_input_from_viewers": true,
736
+ "hide_output_from_viewers": true
737
+ },
738
+ "gather": {
739
+ "logged": 1706411446723
740
+ }
741
+ }
742
+ },
743
+ {
744
+ "cell_type": "markdown",
745
+ "source": [
746
+ "We create a classification report:"
747
+ ],
748
+ "metadata": {
749
+ "collapsed": false
750
+ }
751
+ },
752
+ {
753
+ "cell_type": "code",
754
+ "source": [
755
+ "print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools, average='micro'))\n",
756
+ "print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools), '\\n')\n",
757
+ "clf_report = classification_report(true_bools, pred_bools, target_names=CLASS_NAMES)\n",
758
+ "print(clf_report)"
759
+ ],
760
+ "outputs": [],
761
+ "execution_count": null,
762
+ "metadata": {
763
+ "datalore": {
764
+ "node_id": "eBprrgF086mznPbPVBpOLS",
765
+ "type": "CODE",
766
+ "hide_input_from_viewers": true,
767
+ "hide_output_from_viewers": true
768
+ },
769
+ "gather": {
770
+ "logged": 1706411446746
771
+ }
772
+ }
773
+ },
774
+ {
775
+ "cell_type": "markdown",
776
+ "source": [
777
+ "Finally, we render a 'head to head' comparison table that maps each text prediction to actual and predicted labels."
778
+ ],
779
+ "metadata": {
780
+ "collapsed": false
781
+ }
782
+ },
783
+ {
784
+ "cell_type": "code",
785
+ "source": [
786
+ "# Creating a map of class names from class numbers\n",
787
+ "idx2label = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))"
788
+ ],
789
+ "outputs": [],
790
+ "execution_count": null,
791
+ "metadata": {
792
+ "datalore": {
793
+ "node_id": "yELHY0IEwMlMw3x6e7hoD1",
794
+ "type": "CODE",
795
+ "hide_input_from_viewers": true,
796
+ "hide_output_from_viewers": true
797
+ },
798
+ "gather": {
799
+ "logged": 1706411446758
800
+ }
801
+ }
802
+ },
803
+ {
804
+ "cell_type": "code",
805
+ "source": [
806
+ "true_label_idxs, pred_label_idxs = [], []\n",
807
+ "\n",
808
+ "for vals in true_bools:\n",
809
+ " true_label_idxs.append(np.where(vals)[0].flatten().tolist())\n",
810
+ "for vals in pred_bools:\n",
811
+ " pred_label_idxs.append(np.where(vals)[0].flatten().tolist())"
812
+ ],
813
+ "outputs": [],
814
+ "execution_count": null,
815
+ "metadata": {
816
+ "datalore": {
817
+ "node_id": "jH0S35dDteUch01sa6me6e",
818
+ "type": "CODE",
819
+ "hide_input_from_viewers": true,
820
+ "hide_output_from_viewers": true
821
+ },
822
+ "gather": {
823
+ "logged": 1706411446771
824
+ }
825
+ }
826
+ },
827
+ {
828
+ "cell_type": "code",
829
+ "source": [
830
+ "true_label_texts, pred_label_texts = [], []\n",
831
+ "\n",
832
+ "for vals in true_label_idxs:\n",
833
+ " if vals:\n",
834
+ " true_label_texts.append([idx2label[val] for val in vals])\n",
835
+ " else:\n",
836
+ " true_label_texts.append(vals)\n",
837
+ "\n",
838
+ "for vals in pred_label_idxs:\n",
839
+ " if vals:\n",
840
+ " pred_label_texts.append([idx2label[val] for val in vals])\n",
841
+ " else:\n",
842
+ " pred_label_texts.append(vals)"
843
+ ],
844
+ "outputs": [],
845
+ "execution_count": null,
846
+ "metadata": {
847
+ "datalore": {
848
+ "node_id": "h4vHL8XdGpayZ6xLGJUF6F",
849
+ "type": "CODE",
850
+ "hide_input_from_viewers": true,
851
+ "hide_output_from_viewers": true
852
+ },
853
+ "gather": {
854
+ "logged": 1706411446785
855
+ }
856
+ }
857
+ },
858
+ {
859
+ "cell_type": "code",
860
+ "source": [
861
+ "symptom_texts = [tokenizer.decode(text,\n",
862
+ " skip_special_tokens=True,\n",
863
+ " clean_up_tokenization_spaces=False) for text in tokenized_texts]"
864
+ ],
865
+ "outputs": [],
866
+ "execution_count": null,
867
+ "metadata": {
868
+ "datalore": {
869
+ "node_id": "SxUmVHfQISEeptg1SawOmB",
870
+ "type": "CODE",
871
+ "hide_input_from_viewers": true,
872
+ "hide_output_from_viewers": true
873
+ },
874
+ "gather": {
875
+ "logged": 1706411446805
876
+ }
877
+ }
878
+ },
879
+ {
880
+ "cell_type": "code",
881
+ "source": [
882
+ "comparisons_df = pd.DataFrame({'symptom_text': symptom_texts, \n",
883
+ " 'true_labels': true_label_texts, \n",
884
+ " 'pred_labels':pred_label_texts})\n",
885
+ "comparisons_df.to_csv('comparisons.csv')\n",
886
+ "comparisons_df"
887
+ ],
888
+ "outputs": [],
889
+ "execution_count": null,
890
+ "metadata": {
891
+ "datalore": {
892
+ "node_id": "BxFNigNGRLTOqraI55BPSH",
893
+ "type": "CODE",
894
+ "hide_input_from_viewers": true,
895
+ "hide_output_from_viewers": true
896
+ },
897
+ "gather": {
898
+ "logged": 1706411446818
899
+ }
900
+ }
901
+ },
902
+ {
903
+ "cell_type": "markdown",
904
+ "source": [
905
+ "### Shapley analysis"
906
+ ],
907
+ "metadata": {
908
+ "collapsed": false
909
+ }
910
+ },
911
+ {
912
+ "cell_type": "code",
913
+ "source": [
914
+ "explainer = shap.Explainer(classifier, output_names=CLASS_NAMES)"
915
+ ],
916
+ "outputs": [],
917
+ "execution_count": null,
918
+ "metadata": {
919
+ "datalore": {
920
+ "node_id": "OpdZcoenX2HwzLdai7K5UA",
921
+ "type": "CODE",
922
+ "hide_input_from_viewers": true,
923
+ "hide_output_from_viewers": true
924
+ },
925
+ "gather": {
926
+ "logged": 1706411446829
927
+ }
928
+ }
929
+ },
930
+ {
931
+ "cell_type": "code",
932
+ "source": [
933
+ "shap_values = explainer(dataset[\"validate\"][\"text\"][1:2])"
934
+ ],
935
+ "outputs": [],
936
+ "execution_count": null,
937
+ "metadata": {
938
+ "datalore": {
939
+ "node_id": "FvbCMfIDlcf16YSvb8wNQv",
940
+ "type": "CODE",
941
+ "hide_input_from_viewers": true,
942
+ "hide_output_from_viewers": true
943
+ },
944
+ "gather": {
945
+ "logged": 1706411446839
946
+ }
947
+ }
948
+ },
949
+ {
950
+ "cell_type": "code",
951
+ "source": [
952
+ "shap.plots.text(shap_values)"
953
+ ],
954
+ "outputs": [],
955
+ "execution_count": null,
956
+ "metadata": {
957
+ "datalore": {
958
+ "node_id": "TSxvakWLPCpjVMWi9ZdEbd",
959
+ "type": "CODE",
960
+ "hide_input_from_viewers": true,
961
+ "hide_output_from_viewers": true
962
+ },
963
+ "gather": {
964
+ "logged": 1706411446848
965
+ }
966
+ }
967
+ },
968
+ {
969
+ "cell_type": "code",
970
+ "source": [],
971
+ "outputs": [],
972
+ "execution_count": null,
973
+ "metadata": {
974
+ "jupyter": {
975
+ "source_hidden": false,
976
+ "outputs_hidden": false
977
+ },
978
+ "nteract": {
979
+ "transient": {
980
+ "deleting": false
981
+ }
982
+ }
983
+ }
984
+ }
985
+ ],
986
+ "metadata": {
987
+ "kernelspec": {
988
+ "name": "python3",
989
+ "language": "python",
990
+ "display_name": "Python 3 (ipykernel)"
991
+ },
992
+ "datalore": {
993
+ "computation_mode": "JUPYTER",
994
+ "package_manager": "pip",
995
+ "base_environment": "default",
996
+ "packages": [
997
+ {
998
+ "name": "datasets",
999
+ "version": "2.16.1",
1000
+ "source": "PIP"
1001
+ },
1002
+ {
1003
+ "name": "torch",
1004
+ "version": "2.1.2",
1005
+ "source": "PIP"
1006
+ },
1007
+ {
1008
+ "name": "accelerate",
1009
+ "version": "0.26.1",
1010
+ "source": "PIP"
1011
+ }
1012
+ ],
1013
+ "report_row_ids": [
1014
+ "un8W7ez7ZwoGb5Co6nydEV",
1015
+ "40nN9Hvgi1clHNV5RAemI5",
1016
+ "TgRD90H5NSPpKS41OeXI1w",
1017
+ "ZOm5BfUs3h1EGLaUkBGeEB",
1018
+ "kOP0CZWNSk6vqE3wkPp7Vc",
1019
+ "W4PWcOu2O2pRaZyoE2W80h",
1020
+ "RolbOnQLIftk0vy9mIcz5M",
1021
+ "8OPhUgbaNJmOdiq5D3a6vK",
1022
+ "5Qrt3jSvSrpK6Ne1hS6shL",
1023
+ "hTq7nFUrovN5Ao4u6dIYWZ",
1024
+ "I8WNZLpJ1DVP2wiCW7YBIB",
1025
+ "SawhU3I9BewSE1XBPstpNJ",
1026
+ "80EtLEl2FIE4FqbWnUD3nT"
1027
+ ],
1028
+ "version": 3
1029
+ },
1030
+ "microsoft": {
1031
+ "ms_spell_check": {
1032
+ "ms_spell_check_language": "en"
1033
+ }
1034
+ },
1035
+ "language_info": {
1036
+ "name": "python",
1037
+ "version": "3.8.5",
1038
+ "mimetype": "text/x-python",
1039
+ "codemirror_mode": {
1040
+ "name": "ipython",
1041
+ "version": 3
1042
+ },
1043
+ "pygments_lexer": "ipython3",
1044
+ "nbconvert_exporter": "python",
1045
+ "file_extension": ".py"
1046
+ },
1047
+ "nteract": {
1048
+ "version": "nteract-front-end@1.0.0"
1049
+ }
1050
+ },
1051
+ "nbformat": 4,
1052
+ "nbformat_minor": 4
1053
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-28-4-13-53Z.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-14-26-30Z.ipynb ADDED
@@ -0,0 +1,739 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
7
+ "\n",
8
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
9
+ ],
10
+ "metadata": {}
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "source": [
15
+ "%pip install accelerate -U"
16
+ ],
17
+ "outputs": [
18
+ {
19
+ "output_type": "stream",
20
+ "name": "stdout",
21
+ "text": "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\nRequirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\nRequirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\nRequirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\nRequirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\nRequirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\nRequirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\nRequirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\nNote: you may need to restart the kernel to use updated packages.\n"
22
+ }
23
+ ],
24
+ "execution_count": 1,
25
+ "metadata": {
26
+ "gather": {
27
+ "logged": 1706475754655
28
+ },
29
+ "nteract": {
30
+ "transient": {
31
+ "deleting": false
32
+ }
33
+ },
34
+ "tags": []
35
+ }
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "source": [
40
+ "%pip install transformers datasets shap watermark wandb evaluate codecarbon"
41
+ ],
42
+ "outputs": [
43
+ {
44
+ "output_type": "stream",
45
+ "name": "stdout",
46
+ "text": "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\nRequirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\nRequirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\nRequirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\nRequirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\nRequirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\nRequirement already satisfied: codecarbon in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.3.3)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\nRequirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\nRequirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\nRequirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\nRequirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\nRequirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\nRequirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\nRequirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\nRequirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\nRequirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\nRequirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\nRequirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\nRequirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\nRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\nRequirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\nRequirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages 
(from shap) (0.0.7)\nRequirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\nRequirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\nRequirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\nRequirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\nRequirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\nRequirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\nRequirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\nRequirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\nRequirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\nRequirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\nRequirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\nRequirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\nRequirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\nRequirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\nRequirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\nRequirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\nRequirement already satisfied: pynvml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (11.5.0)\nRequirement already satisfied: py-cpuinfo in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (9.0.0)\nRequirement already satisfied: prometheus-client in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (0.19.0)\nRequirement already satisfied: arrow in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (1.3.0)\nRequirement already satisfied: rapidfuzz in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (3.6.1)\nRequirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\nRequirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\nRequirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\nRequirement already satisfied: multidict<7.0,>=4.5 in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\nRequirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\nRequirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\nRequirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\nRequirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\nRequirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\nRequirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\nRequirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\nRequirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\nRequirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\nRequirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\nRequirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\nRequirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\nRequirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\nRequirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\nRequirement already satisfied: python-dateutil>=2.7.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.2)\nRequirement already satisfied: types-python-dateutil>=2.8.10 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.19.20240106)\nRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\nRequirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: tzdata>=2022.1 in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\nRequirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\nRequirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\nRequirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\nRequirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\nRequirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\nRequirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\nRequirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\nRequirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\nNote: you may need to restart the kernel to use updated packages.\n"
47
+ }
48
+ ],
49
+ "execution_count": 2,
50
+ "metadata": {
51
+ "nteract": {
52
+ "transient": {
53
+ "deleting": false
54
+ }
55
+ }
56
+ }
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "source": [
61
+ "import pandas as pd\n",
62
+ "import numpy as np\n",
63
+ "import torch\n",
64
+ "import os\n",
65
+ "from typing import List, Union\n",
66
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline\n",
67
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
68
+ "import shap\n",
69
+ "import wandb\n",
70
+ "import evaluate\n",
71
+ "from codecarbon import EmissionsTracker\n",
72
+ "import logging\n",
73
+ "\n",
74
+ "wandb.finish()\n",
75
+ "\n",
76
+ "logging.getLogger('codecarbon').propagate = False\n",
77
+ "\n",
78
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
79
+ "tracker = EmissionsTracker()\n",
80
+ "\n",
81
+ "%load_ext watermark"
82
+ ],
83
+ "outputs": [
84
+ {
85
+ "output_type": "stream",
86
+ "name": "stderr",
87
+ "text": "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\n2024-01-29 04:43:58.191236: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\nTo enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n2024-01-29 04:43:59.182154: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n2024-01-29 04:43:59.182291: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n2024-01-29 04:43:59.182304: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n[codecarbon INFO @ 04:44:02] [setup] RAM Tracking...\n[codecarbon INFO @ 04:44:02] [setup] GPU Tracking...\n[codecarbon INFO @ 04:44:02] Tracking Nvidia GPU via pynvml\n[codecarbon INFO @ 04:44:02] [setup] CPU Tracking...\n[codecarbon WARNING @ 04:44:02] No CPU tracking mode found. Falling back on CPU constant mode.\n[codecarbon WARNING @ 04:44:03] We saw that you have a Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz but we don't know it. Please contact us.\n[codecarbon INFO @ 04:44:03] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\n[codecarbon INFO @ 04:44:03] >>> Tracker's metadata:\n[codecarbon INFO @ 04:44:03] Platform system: Linux-5.15.0-1040-azure-x86_64-with-glibc2.10\n[codecarbon INFO @ 04:44:03] Python version: 3.8.5\n[codecarbon INFO @ 04:44:03] CodeCarbon version: 2.3.3\n[codecarbon INFO @ 04:44:03] Available RAM : 440.883 GB\n[codecarbon INFO @ 04:44:03] CPU count: 24\n[codecarbon INFO @ 04:44:03] CPU model: Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\n[codecarbon INFO @ 04:44:03] GPU count: 4\n[codecarbon INFO @ 04:44:03] GPU model: 4 x Tesla V100-PCIE-16GB\n[codecarbon WARNING @ 04:44:03] Cloud provider 'azure' do not publish electricity carbon intensity. Using country value instead.\n"
88
+ }
89
+ ],
90
+ "execution_count": 3,
91
+ "metadata": {
92
+ "datalore": {
93
+ "hide_input_from_viewers": false,
94
+ "hide_output_from_viewers": false,
95
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
96
+ "report_properties": {
97
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
98
+ },
99
+ "type": "CODE"
100
+ },
101
+ "gather": {
102
+ "logged": 1706503443742
103
+ },
104
+ "tags": []
105
+ }
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "source": [
110
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
111
+ "\n",
112
+ "SEED: int = 42\n",
113
+ "\n",
114
+ "BATCH_SIZE: int = 32\n",
115
+ "EPOCHS: int = 5\n",
116
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
117
+ "\n",
118
+ "# WandB configuration\n",
119
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
120
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
121
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
122
+ ],
123
+ "outputs": [],
124
+ "execution_count": 4,
125
+ "metadata": {
126
+ "collapsed": false,
127
+ "gather": {
128
+ "logged": 1706503443899
129
+ },
130
+ "jupyter": {
131
+ "outputs_hidden": false
132
+ }
133
+ }
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "source": [
138
+ "%watermark --iversion"
139
+ ],
140
+ "outputs": [
141
+ {
142
+ "output_type": "stream",
143
+ "name": "stdout",
144
+ "text": "shap : 0.44.1\nnumpy : 1.23.5\npandas : 2.0.2\nlogging : 0.5.1.2\ntorch : 1.12.0\nevaluate: 0.4.1\nwandb : 0.16.2\nre : 2.2.1\n\n"
145
+ }
146
+ ],
147
+ "execution_count": 5,
148
+ "metadata": {
149
+ "collapsed": false,
150
+ "jupyter": {
151
+ "outputs_hidden": false
152
+ }
153
+ }
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "source": [
158
+ "!nvidia-smi"
159
+ ],
160
+ "outputs": [
161
+ {
162
+ "output_type": "stream",
163
+ "name": "stdout",
164
+ "text": "Mon Jan 29 04:44:03 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. |\r\n|=========================================+======================+======================|\r\n| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n| N/A 25C P0 23W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 2 Tesla V100-PCIE-16GB Off | 00000003:00:00.0 Off | Off |\r\n| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 3 Tesla V100-PCIE-16GB Off | 00000004:00:00.0 Off | Off |\r\n| N/A 27C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n"
165
+ }
166
+ ],
167
+ "execution_count": 6,
168
+ "metadata": {
169
+ "datalore": {
170
+ "hide_input_from_viewers": true,
171
+ "hide_output_from_viewers": true,
172
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
173
+ "type": "CODE"
174
+ }
175
+ }
176
+ },
177
+ {
178
+ "cell_type": "markdown",
179
+ "source": [
180
+ "## Loading the data set"
181
+ ],
182
+ "metadata": {
183
+ "datalore": {
184
+ "hide_input_from_viewers": false,
185
+ "hide_output_from_viewers": false,
186
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
187
+ "report_properties": {
188
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
189
+ },
190
+ "type": "MD"
191
+ }
192
+ }
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "source": [
197
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
198
+ ],
199
+ "outputs": [],
200
+ "execution_count": 7,
201
+ "metadata": {
202
+ "collapsed": false,
203
+ "gather": {
204
+ "logged": 1706503446033
205
+ },
206
+ "jupyter": {
207
+ "outputs_hidden": false
208
+ }
209
+ }
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "source": [
214
+ "dataset"
215
+ ],
216
+ "outputs": [
217
+ {
218
+ "output_type": "execute_result",
219
+ "execution_count": 8,
220
+ "data": {
221
+ "text/plain": "DatasetDict({\n train: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 1270444\n })\n test: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n val: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n})"
222
+ },
223
+ "metadata": {}
224
+ }
225
+ ],
226
+ "execution_count": 8,
227
+ "metadata": {
228
+ "collapsed": false,
229
+ "gather": {
230
+ "logged": 1706503446252
231
+ },
232
+ "jupyter": {
233
+ "outputs_hidden": false,
234
+ "source_hidden": false
235
+ },
236
+ "nteract": {
237
+ "transient": {
238
+ "deleting": false
239
+ }
240
+ }
241
+ }
242
+ },
243
+ {
244
+ "cell_type": "code",
245
+ "source": [
246
+ "SUBSAMPLING = 1.0\n",
247
+ "\n",
248
+ "if SUBSAMPLING < 1:\n",
249
+ " _ = DatasetDict()\n",
250
+ " for each in dataset.keys():\n",
251
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
252
+ "\n",
253
+ " dataset = _"
254
+ ],
255
+ "outputs": [],
256
+ "execution_count": 9,
257
+ "metadata": {
258
+ "gather": {
259
+ "logged": 1706503446498
260
+ }
261
+ }
262
+ },
263
+ {
264
+ "cell_type": "markdown",
265
+ "source": [
266
+ "## Tokenisation and encoding"
267
+ ],
268
+ "metadata": {}
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "source": [
273
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
274
+ " return ds_enc"
275
+ ],
276
+ "outputs": [],
277
+ "execution_count": 10,
278
+ "metadata": {
279
+ "gather": {
280
+ "logged": 1706503446633
281
+ }
282
+ }
283
+ },
284
+ {
285
+ "cell_type": "markdown",
286
+ "source": [
287
+ "## Evaluation metrics"
288
+ ],
289
+ "metadata": {}
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "source": [
294
+ "accuracy = evaluate.load(\"accuracy\")\n",
295
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
296
+ "f1 = evaluate.load(\"f1\")"
297
+ ],
298
+ "outputs": [],
299
+ "execution_count": 11,
300
+ "metadata": {
301
+ "gather": {
302
+ "logged": 1706503446863
303
+ }
304
+ }
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "source": [
309
+ "def compute_metrics(eval_pred):\n",
310
+ " predictions, labels = eval_pred\n",
311
+ " predictions = np.argmax(predictions, axis=1)\n",
312
+ " return {\n",
313
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
314
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
315
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
316
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
317
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
318
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
319
+ " }"
320
+ ],
321
+ "outputs": [],
322
+ "execution_count": 12,
323
+ "metadata": {
324
+ "gather": {
325
+ "logged": 1706503447004
326
+ }
327
+ }
328
+ },
329
+ {
330
+ "cell_type": "markdown",
331
+ "source": [
332
+ "## Training"
333
+ ],
334
+ "metadata": {}
335
+ },
336
+ {
337
+ "cell_type": "markdown",
338
+ "source": [
339
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
340
+ ],
341
+ "metadata": {}
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "source": [
346
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
347
+ ],
348
+ "outputs": [],
349
+ "execution_count": 13,
350
+ "metadata": {
351
+ "gather": {
352
+ "logged": 1706503447186
353
+ }
354
+ }
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "source": [
359
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
360
+ "\n",
361
+ "cols = dataset[\"train\"].column_names\n",
362
+ "cols.remove(\"label\")\n",
363
+ "ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True), batched=True, remove_columns=cols)\n"
364
+ ],
365
+ "outputs": [
366
+ {
367
+ "output_type": "stream",
368
+ "name": "stderr",
369
+ "text": "Map: 100%|██████████| 272238/272238 [01:45<00:00, 2592.04 examples/s]\n"
370
+ }
371
+ ],
372
+ "execution_count": 14,
373
+ "metadata": {
374
+ "gather": {
375
+ "logged": 1706503552083
376
+ }
377
+ }
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "source": [
382
+ "\n",
383
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
384
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
385
+ " id2label=label_map, \n",
386
+ " label2id={v:k for k,v in label_map.items()})\n",
387
+ "\n",
388
+ "args = TrainingArguments(\n",
389
+ " output_dir=\"vaers\",\n",
390
+ " evaluation_strategy=\"epoch\",\n",
391
+ " save_strategy=\"epoch\",\n",
392
+ " learning_rate=2e-5,\n",
393
+ " per_device_train_batch_size=BATCH_SIZE,\n",
394
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
395
+ " num_train_epochs=EPOCHS,\n",
396
+ " weight_decay=.01,\n",
397
+ " logging_steps=1,\n",
398
+ " load_best_model_at_end=True,\n",
399
+ " run_name=f\"daedra-training\",\n",
400
+ " report_to=[\"wandb\"])\n",
401
+ "\n",
402
+ "trainer = Trainer(\n",
403
+ " model=model,\n",
404
+ " args=args,\n",
405
+ " train_dataset=ds_enc[\"train\"],\n",
406
+ " eval_dataset=ds_enc[\"test\"],\n",
407
+ " tokenizer=tokenizer,\n",
408
+ " compute_metrics=compute_metrics)"
409
+ ],
410
+ "outputs": [
411
+ {
412
+ "output_type": "stream",
413
+ "name": "stderr",
414
+ "text": "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\nYou should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
415
+ }
416
+ ],
417
+ "execution_count": 15,
418
+ "metadata": {
419
+ "gather": {
420
+ "logged": 1706503554669
421
+ }
422
+ }
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "source": [
427
+ "if SUBSAMPLING != 1.0:\n",
428
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
429
+ "else:\n",
430
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
431
+ "\n",
432
+ "wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
433
+ "wandb_tag.append(f\"base:{model_ckpt}\")\n",
434
+ " \n",
435
+ "wandb.init(name=\"daedra_training_run\", tags=wandb_tag, magic=True)"
436
+ ],
437
+ "outputs": [
438
+ {
439
+ "output_type": "stream",
440
+ "name": "stderr",
441
+ "text": "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchrisvoncsefalvay\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m wandb.init() arguments ignored because wandb magic has already been initialized\n"
442
+ },
443
+ {
444
+ "output_type": "display_data",
445
+ "data": {
446
+ "text/plain": "<IPython.core.display.HTML object>",
447
+ "text/html": "Tracking run with wandb version 0.16.2"
448
+ },
449
+ "metadata": {}
450
+ },
451
+ {
452
+ "output_type": "display_data",
453
+ "data": {
454
+ "text/plain": "<IPython.core.display.HTML object>",
455
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240129_044555-kjhyoltp</code>"
456
+ },
457
+ "metadata": {}
458
+ },
459
+ {
460
+ "output_type": "display_data",
461
+ "data": {
462
+ "text/plain": "<IPython.core.display.HTML object>",
463
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kjhyoltp' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
464
+ },
465
+ "metadata": {}
466
+ },
467
+ {
468
+ "output_type": "display_data",
469
+ "data": {
470
+ "text/plain": "<IPython.core.display.HTML object>",
471
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
472
+ },
473
+ "metadata": {}
474
+ },
475
+ {
476
+ "output_type": "display_data",
477
+ "data": {
478
+ "text/plain": "<IPython.core.display.HTML object>",
479
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kjhyoltp' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kjhyoltp</a>"
480
+ },
481
+ "metadata": {}
482
+ },
483
+ {
484
+ "output_type": "display_data",
485
+ "data": {
486
+ "text/plain": "<IPython.core.display.HTML object>",
487
+ "text/html": "Finishing last run (ID:kjhyoltp) before initializing another..."
488
+ },
489
+ "metadata": {}
490
+ },
491
+ {
492
+ "output_type": "display_data",
493
+ "data": {
494
+ "text/plain": "<IPython.core.display.HTML object>",
495
+ "text/html": "W&B sync reduced upload amount by 26.5% "
496
+ },
497
+ "metadata": {}
498
+ },
499
+ {
500
+ "output_type": "display_data",
501
+ "data": {
502
+ "text/plain": "<IPython.core.display.HTML object>",
503
+ "text/html": " View run <strong style=\"color:#cdcd00\">daedra_training_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kjhyoltp' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kjhyoltp</a><br/> View job at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v1' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v1</a><br/>Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)"
504
+ },
505
+ "metadata": {}
506
+ },
507
+ {
508
+ "output_type": "display_data",
509
+ "data": {
510
+ "text/plain": "<IPython.core.display.HTML object>",
511
+ "text/html": "Find logs at: <code>./wandb/run-20240129_044555-kjhyoltp/logs</code>"
512
+ },
513
+ "metadata": {}
514
+ },
515
+ {
516
+ "output_type": "display_data",
517
+ "data": {
518
+ "text/plain": "<IPython.core.display.HTML object>",
519
+ "text/html": "Successfully finished last run (ID:kjhyoltp). Initializing new run:<br/>"
520
+ },
521
+ "metadata": {}
522
+ },
523
+ {
524
+ "output_type": "display_data",
525
+ "data": {
526
+ "text/plain": "<IPython.core.display.HTML object>",
527
+ "text/html": "Tracking run with wandb version 0.16.2"
528
+ },
529
+ "metadata": {}
530
+ },
531
+ {
532
+ "output_type": "display_data",
533
+ "data": {
534
+ "text/plain": "<IPython.core.display.HTML object>",
535
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240129_044558-ed51hqn6</code>"
536
+ },
537
+ "metadata": {}
538
+ },
539
+ {
540
+ "output_type": "display_data",
541
+ "data": {
542
+ "text/plain": "<IPython.core.display.HTML object>",
543
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/ed51hqn6' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
544
+ },
545
+ "metadata": {}
546
+ },
547
+ {
548
+ "output_type": "display_data",
549
+ "data": {
550
+ "text/plain": "<IPython.core.display.HTML object>",
551
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
552
+ },
553
+ "metadata": {}
554
+ },
555
+ {
556
+ "output_type": "display_data",
557
+ "data": {
558
+ "text/plain": "<IPython.core.display.HTML object>",
559
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/ed51hqn6' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/ed51hqn6</a>"
560
+ },
561
+ "metadata": {}
562
+ },
563
+ {
564
+ "output_type": "execute_result",
565
+ "execution_count": 16,
566
+ "data": {
567
+ "text/html": "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/ed51hqn6?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>",
568
+ "text/plain": "<wandb.sdk.wandb_run.Run at 0x7f19d09fdbe0>"
569
+ },
570
+ "metadata": {}
571
+ }
572
+ ],
573
+ "execution_count": 16,
574
+ "metadata": {
575
+ "gather": {
576
+ "logged": 1706503566090
577
+ }
578
+ }
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "source": [
583
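+ "# Wrap training in the emissions tracker (a codecarbon EmissionsTracker, assumed to be instantiated earlier in the notebook; its logs appear in the output below)\n",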
+ "tracker.start()\n",
584
+ "trainer.train()\n",
585
+ "tracker.stop()\n"
586
+ ],
587
+ "outputs": [
588
+ {
589
+ "output_type": "stream",
590
+ "name": "stderr",
591
+ "text": "Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"
592
+ },
593
+ {
594
+ "output_type": "display_data",
595
+ "data": {
596
+ "text/plain": "<IPython.core.display.HTML object>",
597
+ "text/html": "\n <div>\n \n <progress value='183' max='49630' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [ 183/49630 01:56 < 8:51:06, 1.55 it/s, Epoch 0.02/5]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
598
+ },
599
+ "metadata": {}
600
+ },
601
+ {
602
+ "output_type": "stream",
603
+ "name": "stderr",
604
+ "text": "[codecarbon INFO @ 04:46:20] Energy consumed for RAM : 0.000690 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 04:46:20] Energy consumed for all GPUs : 0.001499 kWh. Total GPU Power : 359.1829830586385 W\n[codecarbon INFO @ 04:46:20] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 04:46:20] 0.002366 kWh of electricity used since the beginning.\n[codecarbon INFO @ 04:46:35] Energy consumed for RAM : 0.001378 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 04:46:35] Energy consumed for all GPUs : 0.004078 kWh. Total GPU Power : 619.6193403526773 W\n[codecarbon INFO @ 04:46:35] Energy consumed for all CPUs : 0.000355 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 04:46:35] 0.005811 kWh of electricity used since the beginning.\n[codecarbon INFO @ 04:46:50] Energy consumed for RAM : 0.002066 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 04:46:50] Energy consumed for all GPUs : 0.006632 kWh. Total GPU Power : 613.6554096062922 W\n[codecarbon INFO @ 04:46:50] Energy consumed for all CPUs : 0.000532 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 04:46:50] 0.009230 kWh of electricity used since the beginning.\n[codecarbon INFO @ 04:47:05] Energy consumed for RAM : 0.002754 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 04:47:05] Energy consumed for all GPUs : 0.009249 kWh. Total GPU Power : 628.5574609453653 W\n[codecarbon INFO @ 04:47:05] Energy consumed for all CPUs : 0.000709 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 04:47:05] 0.012712 kWh of electricity used since the beginning.\n[codecarbon INFO @ 04:47:20] Energy consumed for RAM : 0.003442 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 04:47:20] Energy consumed for all GPUs : 0.011850 kWh. Total GPU Power : 624.8454173521444 W\n[codecarbon INFO @ 04:47:20] Energy consumed for all CPUs : 0.000886 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 04:47:20] 0.016178 kWh of electricity used since the beginning.\n[codecarbon INFO @ 04:47:35] Energy consumed for RAM : 0.004130 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 04:47:35] Energy consumed for all GPUs : 0.014490 kWh. Total GPU Power : 634.7378588005432 W\n[codecarbon INFO @ 04:47:35] Energy consumed for all CPUs : 0.001063 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 04:47:35] 0.019683 kWh of electricity used since the beginning.\n[codecarbon INFO @ 04:47:50] Energy consumed for RAM : 0.004818 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 04:47:50] Energy consumed for all GPUs : 0.017140 kWh. Total GPU Power : 636.6500188212152 W\n[codecarbon INFO @ 04:47:50] Energy consumed for all CPUs : 0.001240 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 04:47:50] 0.023197 kWh of electricity used since the beginning.\n[codecarbon INFO @ 04:48:05] Energy consumed for RAM : 0.005506 kWh. RAM Power : 165.33123922348022 W\n[codecarbon INFO @ 04:48:05] Energy consumed for all GPUs : 0.019771 kWh. Total GPU Power : 631.881788173399 W\n[codecarbon INFO @ 04:48:05] Energy consumed for all CPUs : 0.001417 kWh. Total CPU Power : 42.5 W\n[codecarbon INFO @ 04:48:05] 0.026694 kWh of electricity used since the beginning.\n"
605
+ }
606
+ ],
607
+ "execution_count": 17,
608
+ "metadata": {
609
+ "gather": {
610
+ "logged": 1706486541798
611
+ }
612
+ }
613
+ },
614
+ {
615
+ "cell_type": "code",
616
+ "source": [
617
+ "wandb.finish()"
618
+ ],
619
+ "outputs": [],
620
+ "execution_count": null,
621
+ "metadata": {
622
+ "gather": {
623
+ "logged": 1706486541918
624
+ }
625
+ }
626
+ },
627
+ {
628
+ "cell_type": "code",
629
+ "source": [
630
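+ "# Name the Hub upload after the subsampling fraction so different training variants stay distinguishable\n",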
+ "variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
631
+ "tokenizer._tokenizer.save(\"tokenizer.json\")\n",
632
+ "tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
633
+ "sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
634
+ "\n",
635
+ "model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
636
+ " variant=variant,\n",
637
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,})\")"
638
+ ],
639
+ "outputs": [],
640
+ "execution_count": null,
641
+ "metadata": {
642
+ "gather": {
643
+ "logged": 1706486541928
644
+ }
645
+ }
646
+ },
647
+ {
648
+ "cell_type": "code",
649
+ "source": [
650
+ "variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
651
+ "tokenizer._tokenizer.save(\"tokenizer.json\")\n",
652
+ "tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
653
+ "sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
654
+ "\n",
655
+ "model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
656
+ " variant=variant,\n",
657
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,})\")"
658
+ ],
659
+ "outputs": [],
660
+ "execution_count": null,
661
+ "metadata": {}
662
+ }
663
+ ],
664
+ "metadata": {
665
+ "datalore": {
666
+ "base_environment": "default",
667
+ "computation_mode": "JUPYTER",
668
+ "package_manager": "pip",
669
+ "packages": [
670
+ {
671
+ "name": "datasets",
672
+ "source": "PIP",
673
+ "version": "2.16.1"
674
+ },
675
+ {
676
+ "name": "torch",
677
+ "source": "PIP",
678
+ "version": "2.1.2"
679
+ },
680
+ {
681
+ "name": "accelerate",
682
+ "source": "PIP",
683
+ "version": "0.26.1"
684
+ }
685
+ ],
686
+ "report_row_ids": [
687
+ "un8W7ez7ZwoGb5Co6nydEV",
688
+ "40nN9Hvgi1clHNV5RAemI5",
689
+ "TgRD90H5NSPpKS41OeXI1w",
690
+ "ZOm5BfUs3h1EGLaUkBGeEB",
691
+ "kOP0CZWNSk6vqE3wkPp7Vc",
692
+ "W4PWcOu2O2pRaZyoE2W80h",
693
+ "RolbOnQLIftk0vy9mIcz5M",
694
+ "8OPhUgbaNJmOdiq5D3a6vK",
695
+ "5Qrt3jSvSrpK6Ne1hS6shL",
696
+ "hTq7nFUrovN5Ao4u6dIYWZ",
697
+ "I8WNZLpJ1DVP2wiCW7YBIB",
698
+ "SawhU3I9BewSE1XBPstpNJ",
699
+ "80EtLEl2FIE4FqbWnUD3nT"
700
+ ],
701
+ "version": 3
702
+ },
703
+ "kernel_info": {
704
+ "name": "python38-azureml-pt-tf"
705
+ },
706
+ "kernelspec": {
707
+ "display_name": "azureml_py38_PT_TF",
708
+ "language": "python",
709
+ "name": "python3"
710
+ },
711
+ "language_info": {
712
+ "name": "python",
713
+ "version": "3.8.5",
714
+ "mimetype": "text/x-python",
715
+ "codemirror_mode": {
716
+ "name": "ipython",
717
+ "version": 3
718
+ },
719
+ "pygments_lexer": "ipython3",
720
+ "nbconvert_exporter": "python",
721
+ "file_extension": ".py"
722
+ },
723
+ "microsoft": {
724
+ "host": {
725
+ "AzureML": {
726
+ "notebookHasBeenCompleted": true
727
+ }
728
+ },
729
+ "ms_spell_check": {
730
+ "ms_spell_check_language": "en"
731
+ }
732
+ },
733
+ "nteract": {
734
+ "version": "nteract-front-end@1.0.0"
735
+ }
736
+ },
737
+ "nbformat": 4,
738
+ "nbformat_minor": 4
739
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-16-5-15Z.ipynb ADDED
@@ -0,0 +1,729 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
8
+ "\n",
9
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "metadata": {
16
+ "gather": {
17
+ "logged": 1706475754655
18
+ },
19
+ "nteract": {
20
+ "transient": {
21
+ "deleting": false
22
+ }
23
+ },
24
+ "tags": []
25
+ },
26
+ "outputs": [
27
+ {
28
+ "name": "stdout",
29
+ "output_type": "stream",
30
+ "text": [
31
+ "/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
32
+ "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\n",
33
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\n",
34
+ "Requirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\n",
35
+ "Requirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\n",
36
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\n",
37
+ "Requirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\n",
38
+ "Requirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\n",
39
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\n",
40
+ "Requirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\n",
41
+ "Requirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\n",
42
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\n",
43
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\n",
44
+ "Requirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\n",
45
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\n",
46
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\n",
47
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\n",
48
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\n",
49
+ "Note: you may need to restart the kernel to use updated packages.\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "%pip install accelerate -U"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 2,
60
+ "metadata": {
61
+ "nteract": {
62
+ "transient": {
63
+ "deleting": false
64
+ }
65
+ }
66
+ },
67
+ "outputs": [
68
+ {
69
+ "name": "stdout",
70
+ "output_type": "stream",
71
+ "text": [
72
+ "/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
73
+ "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\n",
74
+ "Requirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\n",
75
+ "Requirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\n",
76
+ "Requirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\n",
77
+ "Requirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\n",
78
+ "Requirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\n",
79
+ "Requirement already satisfied: codecarbon in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.3.3)\n",
80
+ "Requirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\n",
81
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\n",
82
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\n",
83
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\n",
84
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\n",
85
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\n",
86
+ "Requirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\n",
87
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\n",
88
+ "Requirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\n",
89
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\n",
90
+ "Requirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\n",
91
+ "Requirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\n",
92
+ "Requirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\n",
93
+ "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\n",
94
+ "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\n",
95
+ "Requirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\n",
96
+ "Requirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\n",
97
+ "Requirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\n",
98
+ "Requirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\n",
99
+ "Requirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\n",
100
+ "Requirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\n",
101
+ "Requirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\n",
102
+ "Requirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\n",
103
+ "Requirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\n",
104
+ "Requirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\n",
105
+ "Requirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\n",
106
+ "Requirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\n",
107
+ "Requirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\n",
108
+ "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\n",
109
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\n",
110
+ "Requirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\n",
111
+ "Requirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\n",
112
+ "Requirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\n",
113
+ "Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\n",
114
+ "Requirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\n",
115
+ "Requirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\n",
116
+ "Requirement already satisfied: rapidfuzz in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (3.6.1)\n",
117
+ "Requirement already satisfied: pynvml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (11.5.0)\n",
118
+ "Requirement already satisfied: prometheus-client in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (0.19.0)\n",
119
+ "Requirement already satisfied: py-cpuinfo in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (9.0.0)\n",
120
+ "Requirement already satisfied: arrow in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (1.3.0)\n",
121
+ "Requirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
122
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\n",
123
+ "Requirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\n",
124
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\n",
125
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\n",
126
+ "Requirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\n",
127
+ "Requirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\n",
128
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\n",
129
+ "Requirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\n",
130
+ "Requirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\n",
131
+ "Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\n",
132
+ "Requirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\n",
133
+ "Requirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\n",
134
+ "Requirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\n",
135
+ "Requirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\n",
136
+ "Requirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\n",
137
+ "Requirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\n",
138
+ "Requirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\n",
139
+ "Requirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\n",
140
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\n",
141
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\n",
142
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\n",
143
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\n",
144
+ "Requirement already satisfied: python-dateutil>=2.7.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.2)\n",
145
+ "Requirement already satisfied: types-python-dateutil>=2.8.10 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.19.20240106)\n",
146
+ "Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\n",
147
+ "Requirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
148
+ "Requirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
149
+ "Requirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\n",
150
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\n",
151
+ "Requirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\n",
152
+ "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\n",
153
+ "Requirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\n",
154
+ "Requirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\n",
155
+ "Requirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\n",
156
+ "Requirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\n",
157
+ "Requirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\n",
158
+ "Note: you may need to restart the kernel to use updated packages.\n"
159
+ ]
160
+ }
161
+ ],
162
+ "source": [
163
+ "%pip install transformers datasets shap watermark wandb evaluate codecarbon"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 28,
169
+ "metadata": {
170
+ "datalore": {
171
+ "hide_input_from_viewers": false,
172
+ "hide_output_from_viewers": false,
173
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
174
+ "report_properties": {
175
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
176
+ },
177
+ "type": "CODE"
178
+ },
179
+ "gather": {
180
+ "logged": 1706503443742
181
+ },
182
+ "tags": []
183
+ },
184
+ "outputs": [
185
+ {
186
+ "data": {
187
+ "text/html": [
188
+ " View run <strong style=\"color:#cdcd00\">daedra_0.05-distilbert-base-uncased</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/cwkdl3x7' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/cwkdl3x7</a><br/> View job at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v3' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v3</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
189
+ ],
190
+ "text/plain": [
191
+ "<IPython.core.display.HTML object>"
192
+ ]
193
+ },
194
+ "metadata": {},
195
+ "output_type": "display_data"
196
+ },
197
+ {
198
+ "data": {
199
+ "text/html": [
200
+ "Find logs at: <code>./wandb/run-20240129_152136-cwkdl3x7/logs</code>"
201
+ ],
202
+ "text/plain": [
203
+ "<IPython.core.display.HTML object>"
204
+ ]
205
+ },
206
+ "metadata": {},
207
+ "output_type": "display_data"
208
+ },
209
+ {
210
+ "name": "stdout",
211
+ "output_type": "stream",
212
+ "text": [
213
+ "The watermark extension is already loaded. To reload it, use:\n",
214
+ " %reload_ext watermark\n"
215
+ ]
216
+ }
217
+ ],
218
+ "source": [
219
+ "import pandas as pd\n",
220
+ "import numpy as np\n",
221
+ "import torch\n",
222
+ "import os\n",
223
+ "from typing import List, Union\n",
224
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline, AutoModel\n",
225
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
226
+ "import shap\n",
227
+ "import wandb\n",
228
+ "import evaluate\n",
229
+ "import logging\n",
230
+ "\n",
231
+ "wandb.finish()\n",
232
+ "\n",
233
+ "\n",
234
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
235
+ "\n",
236
+ "%load_ext watermark"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 4,
242
+ "metadata": {
243
+ "collapsed": false,
244
+ "gather": {
245
+ "logged": 1706503443899
246
+ },
247
+ "jupyter": {
248
+ "outputs_hidden": false
249
+ }
250
+ },
251
+ "outputs": [],
252
+ "source": [
253
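+ "# Training constants and W&B configuration; device falls back to CPU when CUDA is unavailable\n",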
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
254
+ "\n",
255
+ "SEED: int = 42\n",
256
+ "\n",
257
+ "BATCH_SIZE: int = 32\n",
258
+ "EPOCHS: int = 5\n",
259
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
260
+ "\n",
261
+ "# WandB configuration\n",
262
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
263
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
264
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": 5,
270
+ "metadata": {
271
+ "collapsed": false,
272
+ "jupyter": {
273
+ "outputs_hidden": false
274
+ }
275
+ },
276
+ "outputs": [
277
+ {
278
+ "name": "stdout",
279
+ "output_type": "stream",
280
+ "text": [
281
+ "re : 2.2.1\n",
282
+ "torch : 1.12.0\n",
283
+ "wandb : 0.16.2\n",
284
+ "logging : 0.5.1.2\n",
285
+ "numpy : 1.23.5\n",
286
+ "pandas : 2.0.2\n",
287
+ "evaluate: 0.4.1\n",
288
+ "shap : 0.44.1\n",
289
+ "\n"
290
+ ]
291
+ }
292
+ ],
293
+ "source": [
294
+ "%watermark --iversion"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 6,
300
+ "metadata": {
301
+ "datalore": {
302
+ "hide_input_from_viewers": true,
303
+ "hide_output_from_viewers": true,
304
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
305
+ "type": "CODE"
306
+ }
307
+ },
308
+ "outputs": [
309
+ {
310
+ "name": "stdout",
311
+ "output_type": "stream",
312
+ "text": [
313
+ "/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
314
+ "Mon Jan 29 15:20:22 2024 \n",
315
+ "+---------------------------------------------------------------------------------------+\n",
316
+ "| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\n",
317
+ "|-----------------------------------------+----------------------+----------------------+\n",
318
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
319
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
320
+ "| | | MIG M. |\n",
321
+ "|=========================================+======================+======================|\n",
322
+ "| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\n",
323
+ "| N/A 27C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
324
+ "| | | N/A |\n",
325
+ "+-----------------------------------------+----------------------+----------------------+\n",
326
+ "| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\n",
327
+ "| N/A 26C P0 23W / 250W | 4MiB / 16384MiB | 0% Default |\n",
328
+ "| | | N/A |\n",
329
+ "+-----------------------------------------+----------------------+----------------------+\n",
330
+ "| 2 Tesla V100-PCIE-16GB Off | 00000003:00:00.0 Off | Off |\n",
331
+ "| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
332
+ "| | | N/A |\n",
333
+ "+-----------------------------------------+----------------------+----------------------+\n",
334
+ "| 3 Tesla V100-PCIE-16GB Off | 00000004:00:00.0 Off | Off |\n",
335
+ "| N/A 28C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
336
+ "| | | N/A |\n",
337
+ "+-----------------------------------------+----------------------+----------------------+\n",
338
+ " \n",
339
+ "+---------------------------------------------------------------------------------------+\n",
340
+ "| Processes: |\n",
341
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
342
+ "| ID ID Usage |\n",
343
+ "|=======================================================================================|\n",
344
+ "| No running processes found |\n",
345
+ "+---------------------------------------------------------------------------------------+\n"
346
+ ]
347
+ }
348
+ ],
349
+ "source": [
350
+ "!nvidia-smi"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "markdown",
355
+ "metadata": {
356
+ "datalore": {
357
+ "hide_input_from_viewers": false,
358
+ "hide_output_from_viewers": false,
359
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
360
+ "report_properties": {
361
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
362
+ },
363
+ "type": "MD"
364
+ }
365
+ },
366
+ "source": [
367
+ "## Loading the data set"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": 7,
373
+ "metadata": {
374
+ "collapsed": false,
375
+ "gather": {
376
+ "logged": 1706503446033
377
+ },
378
+ "jupyter": {
379
+ "outputs_hidden": false
380
+ }
381
+ },
382
+ "outputs": [],
383
+ "source": [
384
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": 8,
390
+ "metadata": {
391
+ "collapsed": false,
392
+ "gather": {
393
+ "logged": 1706503446252
394
+ },
395
+ "jupyter": {
396
+ "outputs_hidden": false,
397
+ "source_hidden": false
398
+ },
399
+ "nteract": {
400
+ "transient": {
401
+ "deleting": false
402
+ }
403
+ }
404
+ },
405
+ "outputs": [
406
+ {
407
+ "data": {
408
+ "text/plain": [
409
+ "DatasetDict({\n",
410
+ " train: Dataset({\n",
411
+ " features: ['id', 'text', 'label'],\n",
412
+ " num_rows: 1270444\n",
413
+ " })\n",
414
+ " test: Dataset({\n",
415
+ " features: ['id', 'text', 'label'],\n",
416
+ " num_rows: 272238\n",
417
+ " })\n",
418
+ " val: Dataset({\n",
419
+ " features: ['id', 'text', 'label'],\n",
420
+ " num_rows: 272238\n",
421
+ " })\n",
422
+ "})"
423
+ ]
424
+ },
425
+ "execution_count": 8,
426
+ "metadata": {},
427
+ "output_type": "execute_result"
428
+ }
429
+ ],
430
+ "source": [
431
+ "dataset"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "execution_count": 8,
437
+ "metadata": {
438
+ "gather": {
439
+ "logged": 1706503446498
440
+ }
441
+ },
442
+ "outputs": [],
443
+ "source": [
444
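+ "# Subsample each split for faster experimentation: seeded shuffle, then keep the first SUBSAMPLING fraction of rows\n",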
+ "SUBSAMPLING = 0.01\n",
445
+ "\n",
446
+ "if SUBSAMPLING < 1:\n",
447
+ " _ = DatasetDict()\n",
448
+ " for each in dataset.keys():\n",
449
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
450
+ "\n",
451
+ " dataset = _"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "markdown",
456
+ "metadata": {},
457
+ "source": [
458
+ "## Tokenisation and encoding"
459
+ ]
460
+ },
461
+ {
462
+ "cell_type": "code",
463
+ "execution_count": 10,
464
+ "metadata": {
465
+ "gather": {
466
+ "logged": 1706503446633
467
+ }
468
+ },
469
+ "outputs": [],
470
+ "source": [
471
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
472
+ " return ds_enc"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "markdown",
477
+ "metadata": {},
478
+ "source": [
479
+ "## Evaluation metrics"
480
+ ]
481
+ },
482
+ {
483
+ "cell_type": "code",
484
+ "execution_count": 9,
485
+ "metadata": {
486
+ "gather": {
487
+ "logged": 1706503446863
488
+ }
489
+ },
490
+ "outputs": [],
491
+ "source": [
492
+ "accuracy = evaluate.load(\"accuracy\")\n",
493
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
494
+ "f1 = evaluate.load(\"f1\")"
495
+ ]
496
+ },
497
+ {
498
+ "cell_type": "code",
499
+ "execution_count": 10,
500
+ "metadata": {
501
+ "gather": {
502
+ "logged": 1706503447004
503
+ }
504
+ },
505
+ "outputs": [],
506
+ "source": [
507
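+ "# Turn logits into class predictions, then report accuracy, macro- and micro-averaged precision and recall, and micro-averaged F1\n",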
+ "def compute_metrics(eval_pred):\n",
508
+ " predictions, labels = eval_pred\n",
509
+ " predictions = np.argmax(predictions, axis=1)\n",
510
+ " return {\n",
511
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
512
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
513
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
514
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
515
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
516
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
517
+ " }"
518
+ ]
519
+ },
520
+ {
521
+ "cell_type": "markdown",
522
+ "metadata": {},
523
+ "source": [
524
+ "## Training"
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "markdown",
529
+ "metadata": {},
530
+ "source": [
531
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
532
+ ]
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "execution_count": 11,
537
+ "metadata": {
538
+ "gather": {
539
+ "logged": 1706503447186
540
+ }
541
+ },
542
+ "outputs": [],
543
+ "source": [
544
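+ "# Build the id2label mapping from the dataset's ClassLabel names (inverted later for label2id)\n",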
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": 12,
550
+ "metadata": {
551
+ "jupyter": {
552
+ "outputs_hidden": false,
553
+ "source_hidden": false
554
+ },
555
+ "nteract": {
556
+ "transient": {
557
+ "deleting": false
558
+ }
559
+ }
560
+ },
561
+ "outputs": [],
562
+ "source": [
563
+ "def train_from_model(model_ckpt: str, push: bool = False):\n",
564
+ " print(f\"Initialising training based on {model_ckpt}...\")\n",
565
+ "\n",
566
+ " print(\"Tokenising...\")\n",
567
+ " tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
568
+ "\n",
569
+ " cols = dataset[\"train\"].column_names\n",
570
+ " cols.remove(\"label\")\n",
571
+ " ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True, max_length=512), batched=True, remove_columns=cols)\n",
572
+ "\n",
573
+ " print(\"Loading model...\")\n",
574
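+ "    # Fall back to TensorFlow weights (from_tf=True) if the checkpoint ships no PyTorch weights\n",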
+ " try:\n",
575
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
576
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
577
+ " id2label=label_map, \n",
578
+ " label2id={v:k for k,v in label_map.items()})\n",
579
+ " except OSError:\n",
580
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
581
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
582
+ " id2label=label_map, \n",
583
+ " label2id={v:k for k,v in label_map.items()},\n",
584
+ " from_tf=True)\n",
585
+ "\n",
586
+ "\n",
587
+ " args = TrainingArguments(\n",
588
+ " output_dir=\"vaers\",\n",
589
+ " evaluation_strategy=\"epoch\",\n",
590
+ " save_strategy=\"epoch\",\n",
591
+ " learning_rate=2e-5,\n",
592
+ " per_device_train_batch_size=BATCH_SIZE,\n",
593
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
594
+ " num_train_epochs=EPOCHS,\n",
595
+ " weight_decay=.01,\n",
596
+ " logging_steps=1,\n",
597
+ " load_best_model_at_end=True,\n",
598
+ " run_name=f\"daedra-training\",\n",
599
+ " report_to=[\"wandb\"])\n",
600
+ "\n",
601
+ " trainer = Trainer(\n",
602
+ " model=model,\n",
603
+ " args=args,\n",
604
+ " train_dataset=ds_enc[\"train\"],\n",
605
+ " eval_dataset=ds_enc[\"test\"],\n",
606
+ " tokenizer=tokenizer,\n",
607
+ " compute_metrics=compute_metrics)\n",
608
+ " \n",
609
+ " if SUBSAMPLING != 1.0:\n",
610
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
611
+ " else:\n",
612
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
613
+ "\n",
614
+ " wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
615
+ " wandb_tag.append(f\"base:{model_ckpt}\")\n",
616
+ " \n",
617
+ " wandb.init(name=f\"daedra_{SUBSAMPLING}-{model_ckpt}\", tags=wandb_tag, magic=True)\n",
618
+ "\n",
619
+ " print(\"Starting training...\")\n",
620
+ "\n",
621
+ " trainer.train()\n",
622
+ "\n",
623
+ " print(\"Training finished.\")\n",
624
+ "\n",
625
+ " if push:\n",
626
+ " variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
627
+ " tokenizer._tokenizer.save(\"tokenizer.json\")\n",
628
+ " tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
629
+ " sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
630
+ "\n",
631
+ " model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
632
+ " variant=variant,\n",
633
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,}), based on {model_ckpt}\")"
634
+ ]
635
+ },
636
+ {
637
+ "cell_type": "code",
638
+ "execution_count": 13,
639
+ "metadata": {
640
+ "gather": {
641
+ "logged": 1706503552083
642
+ }
643
+ },
644
+ "outputs": [],
645
+ "source": [
646
+ "\n",
647
+ "base_models = [\n",
648
+ " \"bert-base-uncased\",\n",
649
+ " \"distilbert-base-uncased\",\n",
650
+ "]"
651
+ ]
652
+ }
653
+ ],
654
+ "metadata": {
655
+ "datalore": {
656
+ "base_environment": "default",
657
+ "computation_mode": "JUPYTER",
658
+ "package_manager": "pip",
659
+ "packages": [
660
+ {
661
+ "name": "datasets",
662
+ "source": "PIP",
663
+ "version": "2.16.1"
664
+ },
665
+ {
666
+ "name": "torch",
667
+ "source": "PIP",
668
+ "version": "2.1.2"
669
+ },
670
+ {
671
+ "name": "accelerate",
672
+ "source": "PIP",
673
+ "version": "0.26.1"
674
+ }
675
+ ],
676
+ "report_row_ids": [
677
+ "un8W7ez7ZwoGb5Co6nydEV",
678
+ "40nN9Hvgi1clHNV5RAemI5",
679
+ "TgRD90H5NSPpKS41OeXI1w",
680
+ "ZOm5BfUs3h1EGLaUkBGeEB",
681
+ "kOP0CZWNSk6vqE3wkPp7Vc",
682
+ "W4PWcOu2O2pRaZyoE2W80h",
683
+ "RolbOnQLIftk0vy9mIcz5M",
684
+ "8OPhUgbaNJmOdiq5D3a6vK",
685
+ "5Qrt3jSvSrpK6Ne1hS6shL",
686
+ "hTq7nFUrovN5Ao4u6dIYWZ",
687
+ "I8WNZLpJ1DVP2wiCW7YBIB",
688
+ "SawhU3I9BewSE1XBPstpNJ",
689
+ "80EtLEl2FIE4FqbWnUD3nT"
690
+ ],
691
+ "version": 3
692
+ },
693
+ "kernel_info": {
694
+ "name": "python38-azureml-pt-tf"
695
+ },
696
+ "kernelspec": {
697
+ "display_name": "azureml_py38_PT_TF",
698
+ "language": "python",
699
+ "name": "python3"
700
+ },
701
+ "language_info": {
702
+ "codemirror_mode": {
703
+ "name": "ipython",
704
+ "version": 3
705
+ },
706
+ "file_extension": ".py",
707
+ "mimetype": "text/x-python",
708
+ "name": "python",
709
+ "nbconvert_exporter": "python",
710
+ "pygments_lexer": "ipython3",
711
+ "version": "3.8.5"
712
+ },
713
+ "microsoft": {
714
+ "host": {
715
+ "AzureML": {
716
+ "notebookHasBeenCompleted": true
717
+ }
718
+ },
719
+ "ms_spell_check": {
720
+ "ms_spell_check_language": "en"
721
+ }
722
+ },
723
+ "nteract": {
724
+ "version": "nteract-front-end@1.0.0"
725
+ }
726
+ },
727
+ "nbformat": 4,
728
+ "nbformat_minor": 4
729
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-17-44-52Z.ipynb ADDED
@@ -0,0 +1,739 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
8
+ "\n",
9
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "metadata": {
16
+ "gather": {
17
+ "logged": 1706475754655
18
+ },
19
+ "nteract": {
20
+ "transient": {
21
+ "deleting": false
22
+ }
23
+ },
24
+ "tags": []
25
+ },
26
+ "outputs": [
27
+ {
28
+ "name": "stdout",
29
+ "output_type": "stream",
30
+ "text": [
31
+ "/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
32
+ "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\n",
33
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\n",
34
+ "Requirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\n",
35
+ "Requirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\n",
36
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\n",
37
+ "Requirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\n",
38
+ "Requirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\n",
39
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\n",
40
+ "Requirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\n",
41
+ "Requirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\n",
42
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\n",
43
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\n",
44
+ "Requirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\n",
45
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\n",
46
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\n",
47
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\n",
48
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\n",
49
+ "Note: you may need to restart the kernel to use updated packages.\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "%pip install accelerate -U"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 2,
60
+ "metadata": {
61
+ "nteract": {
62
+ "transient": {
63
+ "deleting": false
64
+ }
65
+ }
66
+ },
67
+ "outputs": [
68
+ {
69
+ "name": "stdout",
70
+ "output_type": "stream",
71
+ "text": [
72
+ "/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
73
+ "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\n",
74
+ "Requirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\n",
75
+ "Requirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\n",
76
+ "Requirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\n",
77
+ "Requirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\n",
78
+ "Requirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\n",
79
+ "Requirement already satisfied: codecarbon in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.3.3)\n",
80
+ "Requirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\n",
81
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\n",
82
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\n",
83
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\n",
84
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\n",
85
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\n",
86
+ "Requirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\n",
87
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\n",
88
+ "Requirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\n",
89
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\n",
90
+ "Requirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\n",
91
+ "Requirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\n",
92
+ "Requirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\n",
93
+ "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\n",
94
+ "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\n",
95
+ "Requirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\n",
96
+ "Requirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\n",
97
+ "Requirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\n",
98
+ "Requirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\n",
99
+ "Requirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\n",
100
+ "Requirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\n",
101
+ "Requirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\n",
102
+ "Requirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\n",
103
+ "Requirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\n",
104
+ "Requirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\n",
105
+ "Requirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\n",
106
+ "Requirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\n",
107
+ "Requirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\n",
108
+ "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\n",
109
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\n",
110
+ "Requirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\n",
111
+ "Requirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\n",
112
+ "Requirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\n",
113
+ "Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\n",
114
+ "Requirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\n",
115
+ "Requirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\n",
116
+ "Requirement already satisfied: rapidfuzz in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (3.6.1)\n",
117
+ "Requirement already satisfied: pynvml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (11.5.0)\n",
118
+ "Requirement already satisfied: prometheus-client in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (0.19.0)\n",
119
+ "Requirement already satisfied: py-cpuinfo in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (9.0.0)\n",
120
+ "Requirement already satisfied: arrow in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (1.3.0)\n",
121
+ "Requirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
122
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\n",
123
+ "Requirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\n",
124
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\n",
125
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\n",
126
+ "Requirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\n",
127
+ "Requirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\n",
128
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\n",
129
+ "Requirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\n",
130
+ "Requirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\n",
131
+ "Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\n",
132
+ "Requirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\n",
133
+ "Requirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\n",
134
+ "Requirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\n",
135
+ "Requirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\n",
136
+ "Requirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\n",
137
+ "Requirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\n",
138
+ "Requirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\n",
139
+ "Requirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\n",
140
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\n",
141
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\n",
142
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\n",
143
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\n",
144
+ "Requirement already satisfied: python-dateutil>=2.7.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.2)\n",
145
+ "Requirement already satisfied: types-python-dateutil>=2.8.10 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.19.20240106)\n",
146
+ "Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\n",
147
+ "Requirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
148
+ "Requirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
149
+ "Requirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\n",
150
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\n",
151
+ "Requirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\n",
152
+ "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\n",
153
+ "Requirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\n",
154
+ "Requirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\n",
155
+ "Requirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\n",
156
+ "Requirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\n",
157
+ "Requirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\n",
158
+ "Note: you may need to restart the kernel to use updated packages.\n"
159
+ ]
160
+ }
161
+ ],
162
+ "source": [
163
+ "%pip install transformers datasets shap watermark wandb evaluate codecarbon"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 28,
169
+ "metadata": {
170
+ "datalore": {
171
+ "hide_input_from_viewers": false,
172
+ "hide_output_from_viewers": false,
173
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
174
+ "report_properties": {
175
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
176
+ },
177
+ "type": "CODE"
178
+ },
179
+ "gather": {
180
+ "logged": 1706503443742
181
+ },
182
+ "tags": []
183
+ },
184
+ "outputs": [
185
+ {
186
+ "data": {
187
+ "text/html": [
188
+ " View run <strong style=\"color:#cdcd00\">daedra_0.05-distilbert-base-uncased</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/cwkdl3x7' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/cwkdl3x7</a><br/> View job at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v3' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v3</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
189
+ ],
190
+ "text/plain": [
191
+ "<IPython.core.display.HTML object>"
192
+ ]
193
+ },
194
+ "metadata": {},
195
+ "output_type": "display_data"
196
+ },
197
+ {
198
+ "data": {
199
+ "text/html": [
200
+ "Find logs at: <code>./wandb/run-20240129_152136-cwkdl3x7/logs</code>"
201
+ ],
202
+ "text/plain": [
203
+ "<IPython.core.display.HTML object>"
204
+ ]
205
+ },
206
+ "metadata": {},
207
+ "output_type": "display_data"
208
+ },
209
+ {
210
+ "name": "stdout",
211
+ "output_type": "stream",
212
+ "text": [
213
+ "The watermark extension is already loaded. To reload it, use:\n",
214
+ " %reload_ext watermark\n"
215
+ ]
216
+ }
217
+ ],
218
+ "source": [
219
+ "import pandas as pd\n",
220
+ "import numpy as np\n",
221
+ "import torch\n",
222
+ "import os\n",
223
+ "from typing import List, Union\n",
224
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline, AutoModel\n",
225
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
226
+ "import shap\n",
227
+ "import wandb\n",
228
+ "import evaluate\n",
229
+ "import logging\n",
230
+ "\n",
231
+ "wandb.finish()\n",
232
+ "\n",
233
+ "\n",
234
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
235
+ "\n",
236
+ "%load_ext watermark"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 4,
242
+ "metadata": {
243
+ "collapsed": false,
244
+ "gather": {
245
+ "logged": 1706503443899
246
+ },
247
+ "jupyter": {
248
+ "outputs_hidden": false
249
+ }
250
+ },
251
+ "outputs": [],
252
+ "source": [
253
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
254
+ "\n",
255
+ "SEED: int = 42\n",
256
+ "\n",
257
+ "BATCH_SIZE: int = 32\n",
258
+ "EPOCHS: int = 5\n",
259
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
260
+ "\n",
261
+ "# WandB configuration\n",
262
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
263
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
264
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": 5,
270
+ "metadata": {
271
+ "collapsed": false,
272
+ "jupyter": {
273
+ "outputs_hidden": false
274
+ }
275
+ },
276
+ "outputs": [
277
+ {
278
+ "name": "stdout",
279
+ "output_type": "stream",
280
+ "text": [
281
+ "re : 2.2.1\n",
282
+ "torch : 1.12.0\n",
283
+ "wandb : 0.16.2\n",
284
+ "logging : 0.5.1.2\n",
285
+ "numpy : 1.23.5\n",
286
+ "pandas : 2.0.2\n",
287
+ "evaluate: 0.4.1\n",
288
+ "shap : 0.44.1\n",
289
+ "\n"
290
+ ]
291
+ }
292
+ ],
293
+ "source": [
294
+ "%watermark --iversion"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 6,
300
+ "metadata": {
301
+ "datalore": {
302
+ "hide_input_from_viewers": true,
303
+ "hide_output_from_viewers": true,
304
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
305
+ "type": "CODE"
306
+ }
307
+ },
308
+ "outputs": [
309
+ {
310
+ "name": "stdout",
311
+ "output_type": "stream",
312
+ "text": [
313
+ "/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
314
+ "Mon Jan 29 15:20:22 2024 \n",
315
+ "+---------------------------------------------------------------------------------------+\n",
316
+ "| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\n",
317
+ "|-----------------------------------------+----------------------+----------------------+\n",
318
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
319
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
320
+ "| | | MIG M. |\n",
321
+ "|=========================================+======================+======================|\n",
322
+ "| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\n",
323
+ "| N/A 27C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
324
+ "| | | N/A |\n",
325
+ "+-----------------------------------------+----------------------+----------------------+\n",
326
+ "| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\n",
327
+ "| N/A 26C P0 23W / 250W | 4MiB / 16384MiB | 0% Default |\n",
328
+ "| | | N/A |\n",
329
+ "+-----------------------------------------+----------------------+----------------------+\n",
330
+ "| 2 Tesla V100-PCIE-16GB Off | 00000003:00:00.0 Off | Off |\n",
331
+ "| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
332
+ "| | | N/A |\n",
333
+ "+-----------------------------------------+----------------------+----------------------+\n",
334
+ "| 3 Tesla V100-PCIE-16GB Off | 00000004:00:00.0 Off | Off |\n",
335
+ "| N/A 28C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
336
+ "| | | N/A |\n",
337
+ "+-----------------------------------------+----------------------+----------------------+\n",
338
+ " \n",
339
+ "+---------------------------------------------------------------------------------------+\n",
340
+ "| Processes: |\n",
341
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
342
+ "| ID ID Usage |\n",
343
+ "|=======================================================================================|\n",
344
+ "| No running processes found |\n",
345
+ "+---------------------------------------------------------------------------------------+\n"
346
+ ]
347
+ }
348
+ ],
349
+ "source": [
350
+ "!nvidia-smi"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "markdown",
355
+ "metadata": {
356
+ "datalore": {
357
+ "hide_input_from_viewers": false,
358
+ "hide_output_from_viewers": false,
359
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
360
+ "report_properties": {
361
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
362
+ },
363
+ "type": "MD"
364
+ }
365
+ },
366
+ "source": [
367
+ "## Loading the data set"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": 7,
373
+ "metadata": {
374
+ "collapsed": false,
375
+ "gather": {
376
+ "logged": 1706503446033
377
+ },
378
+ "jupyter": {
379
+ "outputs_hidden": false
380
+ }
381
+ },
382
+ "outputs": [],
383
+ "source": [
384
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": 8,
390
+ "metadata": {
391
+ "collapsed": false,
392
+ "gather": {
393
+ "logged": 1706503446252
394
+ },
395
+ "jupyter": {
396
+ "outputs_hidden": false,
397
+ "source_hidden": false
398
+ },
399
+ "nteract": {
400
+ "transient": {
401
+ "deleting": false
402
+ }
403
+ }
404
+ },
405
+ "outputs": [
406
+ {
407
+ "data": {
408
+ "text/plain": [
409
+ "DatasetDict({\n",
410
+ " train: Dataset({\n",
411
+ " features: ['id', 'text', 'label'],\n",
412
+ " num_rows: 1270444\n",
413
+ " })\n",
414
+ " test: Dataset({\n",
415
+ " features: ['id', 'text', 'label'],\n",
416
+ " num_rows: 272238\n",
417
+ " })\n",
418
+ " val: Dataset({\n",
419
+ " features: ['id', 'text', 'label'],\n",
420
+ " num_rows: 272238\n",
421
+ " })\n",
422
+ "})"
423
+ ]
424
+ },
425
+ "execution_count": 8,
426
+ "metadata": {},
427
+ "output_type": "execute_result"
428
+ }
429
+ ],
430
+ "source": [
431
+ "dataset"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "execution_count": 8,
437
+ "metadata": {
438
+ "gather": {
439
+ "logged": 1706503446498
440
+ }
441
+ },
442
+ "outputs": [],
443
+ "source": [
444
+ "SUBSAMPLING = 0.01\n",
445
+ "\n",
446
+ "if SUBSAMPLING < 1:\n",
447
+ " _ = DatasetDict()\n",
448
+ " for each in dataset.keys():\n",
449
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
450
+ "\n",
451
+ " dataset = _"
452
+ ]
453
+ },
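Editor's note on the cell above: at SUBSAMPLING = 0.01 the selection keeps int(1_270_444 * 0.01) = 12,704 training rows and int(272_238 * 0.01) = 2,722 rows each for test and val, and because every split is shuffled with the fixed SEED = 42, the subsample is reproducible across runs. A minimal sketch of the same pattern on a single split:

    subsample = dataset["train"].shuffle(seed=SEED).select(range(int(len(dataset["train"]) * SUBSAMPLING)))  # deterministic 1% slice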
454
+ {
455
+ "cell_type": "markdown",
456
+ "metadata": {},
457
+ "source": [
458
+ "## Tokenisation and encoding"
459
+ ]
460
+ },
461
+ {
462
+ "cell_type": "code",
463
+ "execution_count": 10,
464
+ "metadata": {
465
+ "gather": {
466
+ "logged": 1706503446633
467
+ }
468
+ },
469
+ "outputs": [],
470
+ "source": [
471
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
472
+ " return ds_enc"
473
+ ]
474
+ },
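A minimal usage sketch for the helper above (assuming the body filled in in the cell, which tokenises every split of a DatasetDict):

    ds_enc = encode_ds(dataset)  # DatasetDict with input_ids and attention_mask alongside label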
475
+ {
476
+ "cell_type": "markdown",
477
+ "metadata": {},
478
+ "source": [
479
+ "## Evaluation metrics"
480
+ ]
481
+ },
482
+ {
483
+ "cell_type": "code",
484
+ "execution_count": 9,
485
+ "metadata": {
486
+ "gather": {
487
+ "logged": 1706503446863
488
+ }
489
+ },
490
+ "outputs": [],
491
+ "source": [
492
+ "accuracy = evaluate.load(\"accuracy\")\n",
493
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
494
+ "f1 = evaluate.load(\"f1\")"
495
+ ]
496
+ },
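Each `evaluate` metric loaded above exposes the same compute() interface, which returns a dict keyed by the metric name; for example:

    accuracy.compute(predictions=[0, 1, 1], references=[0, 1, 0])  # {'accuracy': 0.6666666666666666}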
497
+ {
498
+ "cell_type": "code",
499
+ "execution_count": 10,
500
+ "metadata": {
501
+ "gather": {
502
+ "logged": 1706503447004
503
+ }
504
+ },
505
+ "outputs": [],
506
+ "source": [
507
+ "def compute_metrics(eval_pred):\n",
508
+ " predictions, labels = eval_pred\n",
509
+ " predictions = np.argmax(predictions, axis=1)\n",
510
+ " return {\n",
511
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
512
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
513
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
514
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
515
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
516
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
517
+ " }"
518
+ ]
519
+ },
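A worked example of compute_metrics on hypothetical three-class logits (the label count here is illustrative, not the actual VAERS label set):

    import numpy as np
    logits = np.array([[2.0, 0.1, 0.3], [0.2, 1.5, 0.1], [0.1, 0.2, 3.0]])
    labels = np.array([0, 1, 2])
    compute_metrics((logits, labels))  # every metric is 1.0, since argmax recovers each label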
520
+ {
521
+ "cell_type": "markdown",
522
+ "metadata": {},
523
+ "source": [
524
+ "## Training"
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "markdown",
529
+ "metadata": {},
530
+ "source": [
531
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
532
+ ]
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "execution_count": 11,
537
+ "metadata": {
538
+ "gather": {
539
+ "logged": 1706503447186
540
+ }
541
+ },
542
+ "outputs": [],
543
+ "source": [
544
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
545
+ ]
546
+ },
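To illustrate what this cell produces (the label names below are hypothetical; the real ones come from the dataset's label feature):

    label_map = {i: label for i, label in enumerate(["DIED", "ER_VISIT", "HOSPITALISED"])}
    # {0: 'DIED', 1: 'ER_VISIT', 2: 'HOSPITALISED'}; inverted to label2id in the training code below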
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": 12,
550
+ "metadata": {
551
+ "jupyter": {
552
+ "outputs_hidden": false,
553
+ "source_hidden": false
554
+ },
555
+ "nteract": {
556
+ "transient": {
557
+ "deleting": false
558
+ }
559
+ }
560
+ },
561
+ "outputs": [],
562
+ "source": [
563
+ "def train_from_model(model_ckpt: str, push: bool = False):\n",
564
+ " print(f\"Initialising training based on {model_ckpt}...\")\n",
565
+ "\n",
566
+ " print(\"Tokenising...\")\n",
567
+ " tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
568
+ "\n",
569
+ " cols = dataset[\"train\"].column_names\n",
570
+ " cols.remove(\"label\")\n",
571
+ " ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True, max_length=512), batched=True, remove_columns=cols)\n",
572
+ "\n",
573
+ " print(\"Loading model...\")\n",
574
+ " try:\n",
575
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
576
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
577
+ " id2label=label_map, \n",
578
+ " label2id={v:k for k,v in label_map.items()})\n",
579
+ " except OSError:\n",
580
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
581
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
582
+ " id2label=label_map, \n",
583
+ " label2id={v:k for k,v in label_map.items()},\n",
584
+ " from_tf=True)\n",
585
+ "\n",
586
+ "\n",
587
+ " args = TrainingArguments(\n",
588
+ " output_dir=\"vaers\",\n",
589
+ " evaluation_strategy=\"epoch\",\n",
590
+ " save_strategy=\"epoch\",\n",
591
+ " learning_rate=2e-5,\n",
592
+ " per_device_train_batch_size=BATCH_SIZE,\n",
593
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
594
+ " num_train_epochs=EPOCHS,\n",
595
+ " weight_decay=.01,\n",
596
+ " logging_steps=1,\n",
597
+ " load_best_model_at_end=True,\n",
598
+ " run_name=f\"daedra-training\",\n",
599
+ " report_to=[\"wandb\"])\n",
600
+ "\n",
601
+ " trainer = Trainer(\n",
602
+ " model=model,\n",
603
+ " args=args,\n",
604
+ " train_dataset=ds_enc[\"train\"],\n",
605
+ " eval_dataset=ds_enc[\"test\"],\n",
606
+ " tokenizer=tokenizer,\n",
607
+ " compute_metrics=compute_metrics)\n",
608
+ " \n",
609
+ " if SUBSAMPLING != 1.0:\n",
610
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
611
+ " else:\n",
612
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
613
+ "\n",
614
+ " wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
615
+ " wandb_tag.append(f\"base:{model_ckpt}\")\n",
616
+ " \n",
617
+ " wandb.init(name=f\"daedra_{SUBSAMPLING}-{model_ckpt}\", tags=wandb_tag, magic=True)\n",
618
+ "\n",
619
+ " print(\"Starting training...\")\n",
620
+ "\n",
621
+ " trainer.train()\n",
622
+ "\n",
623
+ " print(\"Training finished.\")\n",
624
+ "\n",
625
+ " if push:\n",
626
+ " variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
627
+ " tokenizer._tokenizer.save(\"tokenizer.json\")\n",
628
+ " tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
629
+ " sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
630
+ "\n",
631
+ " model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
632
+ " variant=variant,\n",
633
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,}), based on {model_ckpt}\")"
634
+ ]
635
+ },
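A note on throughput for the function above: `Trainer` replicates the model across every visible GPU, so with the four V100s reported by nvidia-smi earlier, per_device_train_batch_size = 32 gives an effective batch of 32 × 4 = 128 sequences per optimisation step.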
636
+ {
637
+ "cell_type": "code",
638
+ "execution_count": 13,
639
+ "metadata": {
640
+ "gather": {
641
+ "logged": 1706503552083
642
+ }
643
+ },
644
+ "outputs": [],
645
+ "source": [
646
+ "\n",
647
+ "base_models = [\n",
648
+ " \"bert-base-uncased\",\n",
649
+ " \"distilbert-base-uncased\",\n",
650
+ "]"
651
+ ]
652
+ },
653
+ {
654
+ "cell_type": "code",
655
+ "execution_count": null,
656
+ "metadata": {},
657
+ "outputs": [],
658
+ "source": [
659
+ "for md in base_models:\n",
660
+ " train_from_model(md)"
661
+ ]
662
+ }
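To publish the best checkpoint to the Hub after training, the same helper can be called with push=True; a sketch that assumes Hugging Face credentials are already configured on the machine:

    train_from_model("distilbert-base-uncased", push=True)  # pushes tokenizer and model to chrisvoncsefalvay/daedra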
663
+ ],
664
+ "metadata": {
665
+ "datalore": {
666
+ "base_environment": "default",
667
+ "computation_mode": "JUPYTER",
668
+ "package_manager": "pip",
669
+ "packages": [
670
+ {
671
+ "name": "datasets",
672
+ "source": "PIP",
673
+ "version": "2.16.1"
674
+ },
675
+ {
676
+ "name": "torch",
677
+ "source": "PIP",
678
+ "version": "2.1.2"
679
+ },
680
+ {
681
+ "name": "accelerate",
682
+ "source": "PIP",
683
+ "version": "0.26.1"
684
+ }
685
+ ],
686
+ "report_row_ids": [
687
+ "un8W7ez7ZwoGb5Co6nydEV",
688
+ "40nN9Hvgi1clHNV5RAemI5",
689
+ "TgRD90H5NSPpKS41OeXI1w",
690
+ "ZOm5BfUs3h1EGLaUkBGeEB",
691
+ "kOP0CZWNSk6vqE3wkPp7Vc",
692
+ "W4PWcOu2O2pRaZyoE2W80h",
693
+ "RolbOnQLIftk0vy9mIcz5M",
694
+ "8OPhUgbaNJmOdiq5D3a6vK",
695
+ "5Qrt3jSvSrpK6Ne1hS6shL",
696
+ "hTq7nFUrovN5Ao4u6dIYWZ",
697
+ "I8WNZLpJ1DVP2wiCW7YBIB",
698
+ "SawhU3I9BewSE1XBPstpNJ",
699
+ "80EtLEl2FIE4FqbWnUD3nT"
700
+ ],
701
+ "version": 3
702
+ },
703
+ "kernel_info": {
704
+ "name": "python38-azureml-pt-tf"
705
+ },
706
+ "kernelspec": {
707
+ "display_name": "azureml_py38_PT_TF",
708
+ "language": "python",
709
+ "name": "python3"
710
+ },
711
+ "language_info": {
712
+ "codemirror_mode": {
713
+ "name": "ipython",
714
+ "version": 3
715
+ },
716
+ "file_extension": ".py",
717
+ "mimetype": "text/x-python",
718
+ "name": "python",
719
+ "nbconvert_exporter": "python",
720
+ "pygments_lexer": "ipython3",
721
+ "version": "3.8.5"
722
+ },
723
+ "microsoft": {
724
+ "host": {
725
+ "AzureML": {
726
+ "notebookHasBeenCompleted": true
727
+ }
728
+ },
729
+ "ms_spell_check": {
730
+ "ms_spell_check_language": "en"
731
+ }
732
+ },
733
+ "nteract": {
734
+ "version": "nteract-front-end@1.0.0"
735
+ }
736
+ },
737
+ "nbformat": 4,
738
+ "nbformat_minor": 4
739
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-3-40-27Z.ipynb ADDED
@@ -0,0 +1,1001 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
8
+ "\n",
9
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
10
+ ]
11
+ },
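Once a trained checkpoint has been pushed (see the training code further down), predictions can be obtained with the transformers pipeline API; a sketch assuming the chrisvoncsefalvay/daedra target used later in this notebook:

    from transformers import pipeline
    classifier = pipeline("text-classification", model="chrisvoncsefalvay/daedra")
    classifier("Patient developed a mild fever and myalgia the day after vaccination.")  # predicted outcome label and score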
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "metadata": {
16
+ "gather": {
17
+ "logged": 1706475754655
18
+ },
19
+ "nteract": {
20
+ "transient": {
21
+ "deleting": false
22
+ }
23
+ },
24
+ "tags": []
25
+ },
26
+ "outputs": [
27
+ {
28
+ "name": "stdout",
29
+ "output_type": "stream",
30
+ "text": [
31
+ "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\n",
32
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\n",
33
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\n",
34
+ "Requirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\n",
35
+ "Requirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\n",
36
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\n",
37
+ "Requirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\n",
38
+ "Requirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\n",
39
+ "Requirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\n",
40
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\n",
41
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\n",
42
+ "Requirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\n",
43
+ "Requirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\n",
44
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\n",
45
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\n",
46
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\n",
47
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\n",
48
+ "Note: you may need to restart the kernel to use updated packages.\n"
49
+ ]
50
+ }
51
+ ],
52
+ "source": [
53
+ "%pip install accelerate -U"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 2,
59
+ "metadata": {
60
+ "nteract": {
61
+ "transient": {
62
+ "deleting": false
63
+ }
64
+ }
65
+ },
66
+ "outputs": [
67
+ {
68
+ "name": "stdout",
69
+ "output_type": "stream",
70
+ "text": [
71
+ "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\n",
72
+ "Requirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\n",
73
+ "Requirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\n",
74
+ "Requirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\n",
75
+ "Requirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\n",
76
+ "Requirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\n",
77
+ "Requirement already satisfied: codecarbon in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.3.3)\n",
78
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\n",
79
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\n",
80
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\n",
81
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\n",
82
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\n",
83
+ "Requirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\n",
84
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\n",
85
+ "Requirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\n",
86
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\n",
87
+ "Requirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\n",
88
+ "Requirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\n",
89
+ "Requirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\n",
90
+ "Requirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\n",
91
+ "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\n",
92
+ "Requirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\n",
93
+ "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\n",
94
+ "Requirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\n",
95
+ "Requirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\n",
96
+ "Requirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\n",
97
+ "Requirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\n",
98
+ "Requirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\n",
99
+ "Requirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\n",
100
+ "Requirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\n",
101
+ "Requirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\n",
102
+ "Requirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\n",
103
+ "Requirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\n",
104
+ "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\n",
105
+ "Requirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\n",
106
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\n",
107
+ "Requirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\n",
108
+ "Requirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\n",
109
+ "Requirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\n",
110
+ "Requirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\n",
111
+ "Requirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\n",
112
+ "Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\n",
113
+ "Requirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\n",
114
+ "Requirement already satisfied: prometheus-client in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (0.19.0)\n",
115
+ "Requirement already satisfied: pynvml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (11.5.0)\n",
116
+ "Requirement already satisfied: arrow in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (1.3.0)\n",
117
+ "Requirement already satisfied: rapidfuzz in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (3.6.1)\n",
118
+ "Requirement already satisfied: py-cpuinfo in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (9.0.0)\n",
119
+ "Requirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
120
+ "Requirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\n",
121
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\n",
122
+ "Requirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\n",
123
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\n",
124
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\n",
125
+ "Requirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\n",
126
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\n",
127
+ "Requirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\n",
128
+ "Requirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\n",
129
+ "Requirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\n",
130
+ "Requirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\n",
131
+ "Requirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\n",
132
+ "Requirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\n",
133
+ "Requirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\n",
134
+ "Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\n",
135
+ "Requirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\n",
136
+ "Requirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\n",
137
+ "Requirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\n",
138
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\n",
139
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\n",
140
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\n",
141
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\n",
142
+ "Requirement already satisfied: types-python-dateutil>=2.8.10 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.19.20240106)\n",
143
+ "Requirement already satisfied: python-dateutil>=2.7.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.2)\n",
144
+ "Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\n",
145
+ "Requirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
146
+ "Requirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
147
+ "Requirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\n",
148
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\n",
149
+ "Requirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\n",
150
+ "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\n",
151
+ "Requirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\n",
152
+ "Requirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\n",
153
+ "Requirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\n",
154
+ "Requirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\n",
155
+ "Requirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\n",
156
+ "Note: you may need to restart the kernel to use updated packages.\n"
157
+ ]
158
+ }
159
+ ],
160
+ "source": [
161
+ "%pip install transformers datasets shap watermark wandb evaluate codecarbon"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 3,
167
+ "metadata": {
168
+ "datalore": {
169
+ "hide_input_from_viewers": false,
170
+ "hide_output_from_viewers": false,
171
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
172
+ "report_properties": {
173
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
174
+ },
175
+ "type": "CODE"
176
+ },
177
+ "gather": {
178
+ "logged": 1706486372154
179
+ },
180
+ "tags": []
181
+ },
182
+ "outputs": [
183
+ {
184
+ "name": "stderr",
185
+ "output_type": "stream",
186
+ "text": [
187
+ "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
188
+ " from .autonotebook import tqdm as notebook_tqdm\n",
189
+ "2024-01-28 23:59:27.034680: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
190
+ "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
191
+ "2024-01-28 23:59:27.996419: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
192
+ "2024-01-28 23:59:27.999143: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
193
+ "2024-01-28 23:59:27.999161: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n",
194
+ "[codecarbon INFO @ 23:59:30] [setup] RAM Tracking...\n",
195
+ "[codecarbon INFO @ 23:59:30] [setup] GPU Tracking...\n",
196
+ "[codecarbon INFO @ 23:59:30] Tracking Nvidia GPU via pynvml\n",
197
+ "[codecarbon INFO @ 23:59:30] [setup] CPU Tracking...\n",
198
+ "[codecarbon WARNING @ 23:59:30] No CPU tracking mode found. Falling back on CPU constant mode.\n",
199
+ "[codecarbon WARNING @ 23:59:31] We saw that you have a Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz but we don't know it. Please contact us.\n",
200
+ "[codecarbon INFO @ 23:59:31] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\n",
201
+ "[codecarbon INFO @ 23:59:31] >>> Tracker's metadata:\n",
202
+ "[codecarbon INFO @ 23:59:31] Platform system: Linux-5.15.0-1040-azure-x86_64-with-glibc2.10\n",
203
+ "[codecarbon INFO @ 23:59:31] Python version: 3.8.5\n",
204
+ "[codecarbon INFO @ 23:59:31] CodeCarbon version: 2.3.3\n",
205
+ "[codecarbon INFO @ 23:59:31] Available RAM : 440.883 GB\n",
206
+ "[codecarbon INFO @ 23:59:31] CPU count: 24\n",
207
+ "[codecarbon INFO @ 23:59:31] CPU model: Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\n",
208
+ "[codecarbon INFO @ 23:59:31] GPU count: 4\n",
209
+ "[codecarbon INFO @ 23:59:31] GPU model: 4 x Tesla V100-PCIE-16GB\n",
210
+ "[codecarbon WARNING @ 23:59:32] Cloud provider 'azure' do not publish electricity carbon intensity. Using country value instead.\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "import pandas as pd\n",
216
+ "import numpy as np\n",
217
+ "import torch\n",
218
+ "import os\n",
219
+ "from typing import List, Union\n",
220
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline\n",
221
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
222
+ "import shap\n",
223
+ "import wandb\n",
224
+ "import evaluate\n",
225
+ "from codecarbon import EmissionsTracker\n",
226
+ "import logging\n",
227
+ "\n",
228
+ "logging.getLogger('codecarbon').propagate = False\n",
229
+ "\n",
230
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
231
+ "tracker = EmissionsTracker()\n",
232
+ "\n",
233
+ "%load_ext watermark"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": 4,
239
+ "metadata": {
240
+ "collapsed": false,
241
+ "gather": {
242
+ "logged": 1706486372304
243
+ },
244
+ "jupyter": {
245
+ "outputs_hidden": false
246
+ }
247
+ },
248
+ "outputs": [],
249
+ "source": [
250
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
251
+ "\n",
252
+ "SEED: int = 42\n",
253
+ "\n",
254
+ "BATCH_SIZE: int = 32\n",
255
+ "EPOCHS: int = 3\n",
256
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
257
+ "\n",
258
+ "# WandB configuration\n",
259
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
260
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
261
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": 5,
267
+ "metadata": {
268
+ "collapsed": false,
269
+ "jupyter": {
270
+ "outputs_hidden": false
271
+ }
272
+ },
273
+ "outputs": [
274
+ {
275
+ "name": "stdout",
276
+ "output_type": "stream",
277
+ "text": [
278
+ "re : 2.2.1\n",
279
+ "evaluate: 0.4.1\n",
280
+ "pandas : 2.0.2\n",
281
+ "wandb : 0.16.2\n",
282
+ "numpy : 1.23.5\n",
283
+ "torch : 1.12.0\n",
284
+ "logging : 0.5.1.2\n",
285
+ "shap : 0.44.1\n",
286
+ "\n"
287
+ ]
288
+ }
289
+ ],
290
+ "source": [
291
+ "%watermark --iversion"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 6,
297
+ "metadata": {
298
+ "datalore": {
299
+ "hide_input_from_viewers": true,
300
+ "hide_output_from_viewers": true,
301
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
302
+ "type": "CODE"
303
+ }
304
+ },
305
+ "outputs": [
306
+ {
307
+ "name": "stdout",
308
+ "output_type": "stream",
309
+ "text": [
310
+ "Sun Jan 28 23:59:32 2024 \r\n",
311
+ "+---------------------------------------------------------------------------------------+\r\n",
312
+ "| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n",
313
+ "|-----------------------------------------+----------------------+----------------------+\r\n",
314
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n",
315
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n",
316
+ "| | | MIG M. |\r\n",
317
+ "|=========================================+======================+======================|\r\n",
318
+ "| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n",
319
+ "| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n",
320
+ "| | | N/A |\r\n",
321
+ "+-----------------------------------------+----------------------+----------------------+\r\n",
322
+ "| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n",
323
+ "| N/A 25C P0 23W / 250W | 4MiB / 16384MiB | 0% Default |\r\n",
324
+ "| | | N/A |\r\n",
325
+ "+-----------------------------------------+----------------------+----------------------+\r\n",
326
+ "| 2 Tesla V100-PCIE-16GB Off | 00000003:00:00.0 Off | Off |\r\n",
327
+ "| N/A 25C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n",
328
+ "| | | N/A |\r\n",
329
+ "+-----------------------------------------+----------------------+----------------------+\r\n",
330
+ "| 3 Tesla V100-PCIE-16GB Off | 00000004:00:00.0 Off | Off |\r\n",
331
+ "| N/A 27C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n",
332
+ "| | | N/A |\r\n",
333
+ "+-----------------------------------------+----------------------+----------------------+\r\n",
334
+ " \r\n",
335
+ "+---------------------------------------------------------------------------------------+\r\n",
336
+ "| Processes: |\r\n",
337
+ "| GPU GI CI PID Type Process name GPU Memory |\r\n",
338
+ "| ID ID Usage |\r\n",
339
+ "|=======================================================================================|\r\n",
340
+ "| No running processes found |\r\n",
341
+ "+---------------------------------------------------------------------------------------+\r\n"
342
+ ]
343
+ }
344
+ ],
345
+ "source": [
346
+ "!nvidia-smi"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "markdown",
351
+ "metadata": {
352
+ "datalore": {
353
+ "hide_input_from_viewers": false,
354
+ "hide_output_from_viewers": false,
355
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
356
+ "report_properties": {
357
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
358
+ },
359
+ "type": "MD"
360
+ }
361
+ },
362
+ "source": [
363
+ "## Loading the data set"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": 7,
369
+ "metadata": {
370
+ "collapsed": false,
371
+ "gather": {
372
+ "logged": 1706486373931
373
+ },
374
+ "jupyter": {
375
+ "outputs_hidden": false
376
+ }
377
+ },
378
+ "outputs": [],
379
+ "source": [
380
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": 8,
386
+ "metadata": {
387
+ "collapsed": false,
388
+ "gather": {
389
+ "logged": 1706486374218
390
+ },
391
+ "jupyter": {
392
+ "outputs_hidden": false,
393
+ "source_hidden": false
394
+ },
395
+ "nteract": {
396
+ "transient": {
397
+ "deleting": false
398
+ }
399
+ }
400
+ },
401
+ "outputs": [
402
+ {
403
+ "data": {
404
+ "text/plain": [
405
+ "DatasetDict({\n",
406
+ " train: Dataset({\n",
407
+ " features: ['id', 'text', 'label'],\n",
408
+ " num_rows: 1270444\n",
409
+ " })\n",
410
+ " test: Dataset({\n",
411
+ " features: ['id', 'text', 'label'],\n",
412
+ " num_rows: 272238\n",
413
+ " })\n",
414
+ " val: Dataset({\n",
415
+ " features: ['id', 'text', 'label'],\n",
416
+ " num_rows: 272238\n",
417
+ " })\n",
418
+ "})"
419
+ ]
420
+ },
421
+ "execution_count": 8,
422
+ "metadata": {},
423
+ "output_type": "execute_result"
424
+ }
425
+ ],
426
+ "source": [
427
+ "dataset"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": 9,
433
+ "metadata": {
434
+ "gather": {
435
+ "logged": 1706486374480
436
+ }
437
+ },
438
+ "outputs": [],
439
+ "source": [
440
+ "SUBSAMPLING = 0.5\n",
441
+ "\n",
442
+ "if SUBSAMPLING < 1:\n",
443
+ " _ = DatasetDict()\n",
444
+ " for each in dataset.keys():\n",
445
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
446
+ "\n",
447
+ " dataset = _"
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "markdown",
452
+ "metadata": {},
453
+ "source": [
454
+ "## Tokenisation and encoding"
455
+ ]
456
+ },
457
+ {
458
+ "cell_type": "code",
459
+ "execution_count": 10,
460
+ "metadata": {
461
+ "gather": {
462
+ "logged": 1706486375030
463
+ }
464
+ },
465
+ "outputs": [],
466
+ "source": [
467
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
468
+ " return ds_enc"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "markdown",
473
+ "metadata": {},
474
+ "source": [
475
+ "## Evaluation metrics"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "code",
480
+ "execution_count": 11,
481
+ "metadata": {
482
+ "gather": {
483
+ "logged": 1706486375197
484
+ }
485
+ },
486
+ "outputs": [],
487
+ "source": [
488
+ "accuracy = evaluate.load(\"accuracy\")\n",
489
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
490
+ "f1 = evaluate.load(\"f1\")"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": 12,
496
+ "metadata": {
497
+ "gather": {
498
+ "logged": 1706486375361
499
+ }
500
+ },
501
+ "outputs": [],
502
+ "source": [
503
+ "def compute_metrics(eval_pred):\n",
504
+ " predictions, labels = eval_pred\n",
505
+ " predictions = np.argmax(predictions, axis=1)\n",
506
+ " return {\n",
507
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
508
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
509
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
510
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
511
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
512
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
513
+ " }"
514
+ ]
515
+ },
516
+ {
517
+ "cell_type": "markdown",
518
+ "metadata": {},
519
+ "source": [
520
+ "## Training"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "markdown",
525
+ "metadata": {},
526
+ "source": [
527
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
528
+ ]
529
+ },
530
+ {
531
+ "cell_type": "code",
532
+ "execution_count": 13,
533
+ "metadata": {
534
+ "gather": {
535
+ "logged": 1706486375569
536
+ }
537
+ },
538
+ "outputs": [],
539
+ "source": [
540
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
541
+ ]
542
+ },
543
+ {
544
+ "cell_type": "code",
545
+ "execution_count": 14,
546
+ "metadata": {
547
+ "gather": {
548
+ "logged": 1706486433708
549
+ }
550
+ },
551
+ "outputs": [
552
+ {
553
+ "name": "stderr",
554
+ "output_type": "stream",
555
+ "text": [
556
+ "Map: 100%|██████████| 136119/136119 [00:56<00:00, 2412.00 examples/s]\n",
557
+ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
558
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
559
+ ]
560
+ }
561
+ ],
562
+ "source": [
563
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
564
+ "\n",
565
+ "cols = dataset[\"train\"].column_names\n",
566
+ "cols.remove(\"label\")\n",
567
+ "ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True), batched=True, remove_columns=cols)\n",
568
+ "\n",
569
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
570
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
571
+ " id2label=label_map, \n",
572
+ " label2id={v:k for k,v in label_map.items()})\n",
573
+ "\n",
574
+ "args = TrainingArguments(\n",
575
+ " output_dir=\"vaers\",\n",
576
+ " evaluation_strategy=\"epoch\",\n",
577
+ " save_strategy=\"epoch\",\n",
578
+ " learning_rate=2e-5,\n",
579
+ " per_device_train_batch_size=BATCH_SIZE,\n",
580
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
581
+ " num_train_epochs=EPOCHS,\n",
582
+ " weight_decay=.01,\n",
583
+ " logging_steps=1,\n",
584
+ " load_best_model_at_end=True,\n",
585
+ " run_name=f\"daedra-training\",\n",
586
+ " report_to=[\"wandb\"])\n",
587
+ "\n",
588
+ "trainer = Trainer(\n",
589
+ " model=model,\n",
590
+ " args=args,\n",
591
+ " train_dataset=ds_enc[\"train\"],\n",
592
+ " eval_dataset=ds_enc[\"test\"],\n",
593
+ " tokenizer=tokenizer,\n",
594
+ " compute_metrics=compute_metrics)"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": 15,
600
+ "metadata": {
601
+ "gather": {
602
+ "logged": 1706486444806
603
+ }
604
+ },
605
+ "outputs": [
606
+ {
607
+ "name": "stderr",
608
+ "output_type": "stream",
609
+ "text": [
610
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchrisvoncsefalvay\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
611
+ "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m wandb.init() arguments ignored because wandb magic has already been initialized\n"
612
+ ]
613
+ },
614
+ {
615
+ "data": {
616
+ "text/html": [
617
+ "Tracking run with wandb version 0.16.2"
618
+ ],
619
+ "text/plain": [
620
+ "<IPython.core.display.HTML object>"
621
+ ]
622
+ },
623
+ "metadata": {},
624
+ "output_type": "display_data"
625
+ },
626
+ {
627
+ "data": {
628
+ "text/html": [
629
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240129_000035-xm7aguww</code>"
630
+ ],
631
+ "text/plain": [
632
+ "<IPython.core.display.HTML object>"
633
+ ]
634
+ },
635
+ "metadata": {},
636
+ "output_type": "display_data"
637
+ },
638
+ {
639
+ "data": {
640
+ "text/html": [
641
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/xm7aguww' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
642
+ ],
643
+ "text/plain": [
644
+ "<IPython.core.display.HTML object>"
645
+ ]
646
+ },
647
+ "metadata": {},
648
+ "output_type": "display_data"
649
+ },
650
+ {
651
+ "data": {
652
+ "text/html": [
653
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
654
+ ],
655
+ "text/plain": [
656
+ "<IPython.core.display.HTML object>"
657
+ ]
658
+ },
659
+ "metadata": {},
660
+ "output_type": "display_data"
661
+ },
662
+ {
663
+ "data": {
664
+ "text/html": [
665
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/xm7aguww' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/xm7aguww</a>"
666
+ ],
667
+ "text/plain": [
668
+ "<IPython.core.display.HTML object>"
669
+ ]
670
+ },
671
+ "metadata": {},
672
+ "output_type": "display_data"
673
+ },
674
+ {
675
+ "data": {
676
+ "text/html": [
677
+ "Finishing last run (ID:xm7aguww) before initializing another..."
678
+ ],
679
+ "text/plain": [
680
+ "<IPython.core.display.HTML object>"
681
+ ]
682
+ },
683
+ "metadata": {},
684
+ "output_type": "display_data"
685
+ },
686
+ {
687
+ "data": {
688
+ "text/html": [
689
+ " View run <strong style=\"color:#cdcd00\">daedra_training_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/xm7aguww' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/xm7aguww</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
690
+ ],
691
+ "text/plain": [
692
+ "<IPython.core.display.HTML object>"
693
+ ]
694
+ },
695
+ "metadata": {},
696
+ "output_type": "display_data"
697
+ },
698
+ {
699
+ "data": {
700
+ "text/html": [
701
+ "Find logs at: <code>./wandb/run-20240129_000035-xm7aguww/logs</code>"
702
+ ],
703
+ "text/plain": [
704
+ "<IPython.core.display.HTML object>"
705
+ ]
706
+ },
707
+ "metadata": {},
708
+ "output_type": "display_data"
709
+ },
710
+ {
711
+ "data": {
712
+ "text/html": [
713
+ "Successfully finished last run (ID:xm7aguww). Initializing new run:<br/>"
714
+ ],
715
+ "text/plain": [
716
+ "<IPython.core.display.HTML object>"
717
+ ]
718
+ },
719
+ "metadata": {},
720
+ "output_type": "display_data"
721
+ },
722
+ {
723
+ "data": {
724
+ "text/html": [
725
+ "Tracking run with wandb version 0.16.2"
726
+ ],
727
+ "text/plain": [
728
+ "<IPython.core.display.HTML object>"
729
+ ]
730
+ },
731
+ "metadata": {},
732
+ "output_type": "display_data"
733
+ },
734
+ {
735
+ "data": {
736
+ "text/html": [
737
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240129_000037-qfvjuxwm</code>"
738
+ ],
739
+ "text/plain": [
740
+ "<IPython.core.display.HTML object>"
741
+ ]
742
+ },
743
+ "metadata": {},
744
+ "output_type": "display_data"
745
+ },
746
+ {
747
+ "data": {
748
+ "text/html": [
749
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/qfvjuxwm' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
750
+ ],
751
+ "text/plain": [
752
+ "<IPython.core.display.HTML object>"
753
+ ]
754
+ },
755
+ "metadata": {},
756
+ "output_type": "display_data"
757
+ },
758
+ {
759
+ "data": {
760
+ "text/html": [
761
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
762
+ ],
763
+ "text/plain": [
764
+ "<IPython.core.display.HTML object>"
765
+ ]
766
+ },
767
+ "metadata": {},
768
+ "output_type": "display_data"
769
+ },
770
+ {
771
+ "data": {
772
+ "text/html": [
773
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/qfvjuxwm' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/qfvjuxwm</a>"
774
+ ],
775
+ "text/plain": [
776
+ "<IPython.core.display.HTML object>"
777
+ ]
778
+ },
779
+ "metadata": {},
780
+ "output_type": "display_data"
781
+ },
782
+ {
783
+ "data": {
784
+ "text/html": [
785
+ "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/qfvjuxwm?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
786
+ ],
787
+ "text/plain": [
788
+ "<wandb.sdk.wandb_run.Run at 0x7f6e1c1e64c0>"
789
+ ]
790
+ },
791
+ "execution_count": 15,
792
+ "metadata": {},
793
+ "output_type": "execute_result"
794
+ }
795
+ ],
796
+ "source": [
797
+ "if SUBSAMPLING != 1.0:\n",
798
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
799
+ "else:\n",
800
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
801
+ "\n",
802
+ "wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
803
+ "wandb_tag.append(f\"base:{model_ckpt}\")\n",
804
+ " \n",
805
+ "wandb.init(name=\"daedra_training_run\", tags=wandb_tag, magic=True)"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": 16,
811
+ "metadata": {
812
+ "gather": {
813
+ "logged": 1706486541798
814
+ }
815
+ },
816
+ "outputs": [
817
+ {
818
+ "name": "stderr",
819
+ "output_type": "stream",
820
+ "text": [
821
+ "Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"
822
+ ]
823
+ },
824
+ {
825
+ "data": {
826
+ "text/html": [
827
+ "\n",
828
+ " <div>\n",
829
+ " \n",
830
+ " <progress value='138' max='14889' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
831
+ " [ 138/14889 01:27 < 2:38:40, 1.55 it/s, Epoch 0.03/3]\n",
832
+ " </div>\n",
833
+ " <table border=\"1\" class=\"dataframe\">\n",
834
+ " <thead>\n",
835
+ " <tr style=\"text-align: left;\">\n",
836
+ " <th>Epoch</th>\n",
837
+ " <th>Training Loss</th>\n",
838
+ " <th>Validation Loss</th>\n",
839
+ " </tr>\n",
840
+ " </thead>\n",
841
+ " <tbody>\n",
842
+ " </tbody>\n",
843
+ "</table><p>"
844
+ ],
845
+ "text/plain": [
846
+ "<IPython.core.display.HTML object>"
847
+ ]
848
+ },
849
+ "metadata": {},
850
+ "output_type": "display_data"
851
+ },
852
+ {
853
+ "name": "stderr",
854
+ "output_type": "stream",
855
+ "text": [
856
+ "[codecarbon INFO @ 00:00:59] Energy consumed for RAM : 0.000690 kWh. RAM Power : 165.33123922348022 W\n",
857
+ "[codecarbon INFO @ 00:00:59] Energy consumed for all GPUs : 0.001468 kWh. Total GPU Power : 351.7461416352095 W\n",
858
+ "[codecarbon INFO @ 00:00:59] Energy consumed for all CPUs : 0.000178 kWh. Total CPU Power : 42.5 W\n",
859
+ "[codecarbon INFO @ 00:00:59] 0.002336 kWh of electricity used since the beginning.\n",
860
+ "[codecarbon INFO @ 00:01:14] Energy consumed for RAM : 0.001378 kWh. RAM Power : 165.33123922348022 W\n",
861
+ "[codecarbon INFO @ 00:01:14] Energy consumed for all GPUs : 0.004025 kWh. Total GPU Power : 614.3289592286081 W\n",
862
+ "[codecarbon INFO @ 00:01:14] Energy consumed for all CPUs : 0.000355 kWh. Total CPU Power : 42.5 W\n",
863
+ "[codecarbon INFO @ 00:01:14] 0.005757 kWh of electricity used since the beginning.\n",
864
+ "[codecarbon INFO @ 00:01:29] Energy consumed for RAM : 0.002066 kWh. RAM Power : 165.33123922348022 W\n",
865
+ "[codecarbon INFO @ 00:01:29] Energy consumed for all GPUs : 0.006586 kWh. Total GPU Power : 615.1209943099732 W\n",
866
+ "[codecarbon INFO @ 00:01:29] Energy consumed for all CPUs : 0.000532 kWh. Total CPU Power : 42.5 W\n",
867
+ "[codecarbon INFO @ 00:01:29] 0.009184 kWh of electricity used since the beginning.\n",
868
+ "[codecarbon INFO @ 00:01:44] Energy consumed for RAM : 0.002754 kWh. RAM Power : 165.33123922348022 W\n",
869
+ "[codecarbon INFO @ 00:01:44] Energy consumed for all GPUs : 0.009201 kWh. Total GPU Power : 628.2177623002755 W\n",
870
+ "[codecarbon INFO @ 00:01:44] Energy consumed for all CPUs : 0.000709 kWh. Total CPU Power : 42.5 W\n",
871
+ "[codecarbon INFO @ 00:01:44] 0.012664 kWh of electricity used since the beginning.\n",
872
+ "[codecarbon INFO @ 00:01:59] Energy consumed for RAM : 0.003442 kWh. RAM Power : 165.33123922348022 W\n",
873
+ "[codecarbon INFO @ 00:01:59] Energy consumed for all GPUs : 0.011831 kWh. Total GPU Power : 631.8056507544826 W\n",
874
+ "[codecarbon INFO @ 00:01:59] Energy consumed for all CPUs : 0.000886 kWh. Total CPU Power : 42.5 W\n",
875
+ "[codecarbon INFO @ 00:01:59] 0.016159 kWh of electricity used since the beginning.\n",
876
+ "[codecarbon INFO @ 00:02:14] Energy consumed for RAM : 0.004130 kWh. RAM Power : 165.33123922348022 W\n",
877
+ "[codecarbon INFO @ 00:02:14] Energy consumed for all GPUs : 0.014450 kWh. Total GPU Power : 629.2086149888297 W\n",
878
+ "[codecarbon INFO @ 00:02:14] Energy consumed for all CPUs : 0.001063 kWh. Total CPU Power : 42.5 W\n",
879
+ "[codecarbon INFO @ 00:02:14] 0.019643 kWh of electricity used since the beginning.\n",
880
+ "\n",
881
+ "KeyboardInterrupt\n",
882
+ "\n"
883
+ ]
884
+ }
885
+ ],
886
+ "source": [
887
+ "tracker.start()\n",
888
+ "trainer.train()\n",
889
+ "tracker.stop()\n"
890
+ ]
891
+ },
892
+ {
893
+ "cell_type": "code",
894
+ "execution_count": null,
895
+ "metadata": {
896
+ "gather": {
897
+ "logged": 1706486541918
898
+ }
899
+ },
900
+ "outputs": [],
901
+ "source": [
902
+ "wandb.finish()"
903
+ ]
904
+ },
905
+ {
906
+ "cell_type": "code",
907
+ "execution_count": null,
908
+ "metadata": {
909
+ "gather": {
910
+ "logged": 1706486541928
911
+ }
912
+ },
913
+ "outputs": [],
914
+ "source": [
915
+ "variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
916
+ "tokenizer._tokenizer.save(\"tokenizer.json\")\n",
917
+ "tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
918
+ "sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
919
+ "\n",
920
+ "model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
921
+ " variant=variant,\n",
922
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,})\")"
923
+ ]
924
+ }
925
+ ],
926
+ "metadata": {
927
+ "datalore": {
928
+ "base_environment": "default",
929
+ "computation_mode": "JUPYTER",
930
+ "package_manager": "pip",
931
+ "packages": [
932
+ {
933
+ "name": "datasets",
934
+ "source": "PIP",
935
+ "version": "2.16.1"
936
+ },
937
+ {
938
+ "name": "torch",
939
+ "source": "PIP",
940
+ "version": "2.1.2"
941
+ },
942
+ {
943
+ "name": "accelerate",
944
+ "source": "PIP",
945
+ "version": "0.26.1"
946
+ }
947
+ ],
948
+ "report_row_ids": [
949
+ "un8W7ez7ZwoGb5Co6nydEV",
950
+ "40nN9Hvgi1clHNV5RAemI5",
951
+ "TgRD90H5NSPpKS41OeXI1w",
952
+ "ZOm5BfUs3h1EGLaUkBGeEB",
953
+ "kOP0CZWNSk6vqE3wkPp7Vc",
954
+ "W4PWcOu2O2pRaZyoE2W80h",
955
+ "RolbOnQLIftk0vy9mIcz5M",
956
+ "8OPhUgbaNJmOdiq5D3a6vK",
957
+ "5Qrt3jSvSrpK6Ne1hS6shL",
958
+ "hTq7nFUrovN5Ao4u6dIYWZ",
959
+ "I8WNZLpJ1DVP2wiCW7YBIB",
960
+ "SawhU3I9BewSE1XBPstpNJ",
961
+ "80EtLEl2FIE4FqbWnUD3nT"
962
+ ],
963
+ "version": 3
964
+ },
965
+ "kernel_info": {
966
+ "name": "python38-azureml-pt-tf"
967
+ },
968
+ "kernelspec": {
969
+ "display_name": "Python 3.8 - Pytorch and Tensorflow",
970
+ "language": "python",
971
+ "name": "python38-azureml-pt-tf"
972
+ },
973
+ "language_info": {
974
+ "codemirror_mode": {
975
+ "name": "ipython",
976
+ "version": 3
977
+ },
978
+ "file_extension": ".py",
979
+ "mimetype": "text/x-python",
980
+ "name": "python",
981
+ "nbconvert_exporter": "python",
982
+ "pygments_lexer": "ipython3",
983
+ "version": "3.8.5"
984
+ },
985
+ "microsoft": {
986
+ "host": {
987
+ "AzureML": {
988
+ "notebookHasBeenCompleted": true
989
+ }
990
+ },
991
+ "ms_spell_check": {
992
+ "ms_spell_check_language": "en"
993
+ }
994
+ },
995
+ "nteract": {
996
+ "version": "nteract-front-end@1.0.0"
997
+ }
998
+ },
999
+ "nbformat": 4,
1000
+ "nbformat_minor": 4
1001
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-29-4-40-54Z.ipynb ADDED
@@ -0,0 +1,1073 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
8
+ "\n",
9
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "metadata": {
16
+ "gather": {
17
+ "logged": 1706475754655
18
+ },
19
+ "nteract": {
20
+ "transient": {
21
+ "deleting": false
22
+ }
23
+ },
24
+ "tags": []
25
+ },
26
+ "outputs": [
27
+ {
28
+ "name": "stdout",
29
+ "output_type": "stream",
30
+ "text": [
31
+ "/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
32
+ "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\n",
33
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\n",
34
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\n",
35
+ "Requirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\n",
36
+ "Requirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\n",
37
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\n",
38
+ "Requirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\n",
39
+ "Requirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\n",
40
+ "Requirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\n",
41
+ "Requirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\n",
42
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\n",
43
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\n",
44
+ "Requirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\n",
45
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\n",
46
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\n",
47
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\n",
48
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\n",
49
+ "Note: you may need to restart the kernel to use updated packages.\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "%pip install accelerate -U"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 2,
60
+ "metadata": {
61
+ "nteract": {
62
+ "transient": {
63
+ "deleting": false
64
+ }
65
+ }
66
+ },
67
+ "outputs": [
68
+ {
69
+ "name": "stdout",
70
+ "output_type": "stream",
71
+ "text": [
72
+ "/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
73
+ "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\n",
74
+ "Requirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\n",
75
+ "Requirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\n",
76
+ "Requirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\n",
77
+ "Requirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\n",
78
+ "Requirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\n",
79
+ "Requirement already satisfied: codecarbon in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.3.3)\n",
80
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\n",
81
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\n",
82
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\n",
83
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\n",
84
+ "Requirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\n",
85
+ "Requirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\n",
86
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\n",
87
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\n",
88
+ "Requirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\n",
89
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\n",
90
+ "Requirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\n",
91
+ "Requirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\n",
92
+ "Requirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\n",
93
+ "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\n",
94
+ "Requirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\n",
95
+ "Requirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\n",
96
+ "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\n",
97
+ "Requirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\n",
98
+ "Requirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\n",
99
+ "Requirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\n",
100
+ "Requirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\n",
101
+ "Requirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\n",
102
+ "Requirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\n",
103
+ "Requirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\n",
104
+ "Requirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\n",
105
+ "Requirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\n",
106
+ "Requirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\n",
107
+ "Requirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\n",
108
+ "Requirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\n",
109
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\n",
110
+ "Requirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\n",
111
+ "Requirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\n",
112
+ "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\n",
113
+ "Requirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\n",
114
+ "Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\n",
115
+ "Requirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\n",
116
+ "Requirement already satisfied: rapidfuzz in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (3.6.1)\n",
117
+ "Requirement already satisfied: prometheus-client in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (0.19.0)\n",
118
+ "Requirement already satisfied: py-cpuinfo in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (9.0.0)\n",
119
+ "Requirement already satisfied: arrow in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (1.3.0)\n",
120
+ "Requirement already satisfied: pynvml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (11.5.0)\n",
121
+ "Requirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
122
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\n",
123
+ "Requirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\n",
124
+ "Requirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\n",
125
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\n",
126
+ "Requirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\n",
127
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\n",
128
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\n",
129
+ "Requirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\n",
130
+ "Requirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\n",
131
+ "Requirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\n",
132
+ "Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\n",
133
+ "Requirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\n",
134
+ "Requirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\n",
135
+ "Requirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\n",
136
+ "Requirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\n",
137
+ "Requirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\n",
138
+ "Requirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\n",
139
+ "Requirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\n",
140
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\n",
141
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\n",
142
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\n",
143
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\n",
144
+ "Requirement already satisfied: python-dateutil>=2.7.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.2)\n",
145
+ "Requirement already satisfied: types-python-dateutil>=2.8.10 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.19.20240106)\n",
146
+ "Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\n",
147
+ "Requirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
148
+ "Requirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
149
+ "Requirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\n",
150
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\n",
151
+ "Requirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\n",
152
+ "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\n",
153
+ "Requirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\n",
154
+ "Requirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\n",
155
+ "Requirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\n",
156
+ "Requirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\n",
157
+ "Requirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\n",
158
+ "Note: you may need to restart the kernel to use updated packages.\n"
159
+ ]
160
+ }
161
+ ],
162
+ "source": [
163
+ "%pip install transformers datasets shap watermark wandb evaluate codecarbon"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 4,
169
+ "metadata": {
170
+ "datalore": {
171
+ "hide_input_from_viewers": false,
172
+ "hide_output_from_viewers": false,
173
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
174
+ "report_properties": {
175
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
176
+ },
177
+ "type": "CODE"
178
+ },
179
+ "gather": {
180
+ "logged": 1706486372154
181
+ },
182
+ "tags": []
183
+ },
184
+ "outputs": [
185
+ {
186
+ "name": "stderr",
187
+ "output_type": "stream",
188
+ "text": [
189
+ "[codecarbon INFO @ 04:20:20] [setup] RAM Tracking...\n",
190
+ "[codecarbon INFO @ 04:20:20] [setup] GPU Tracking...\n",
191
+ "[codecarbon INFO @ 04:20:20] Tracking Nvidia GPU via pynvml\n",
192
+ "[codecarbon INFO @ 04:20:20] [setup] CPU Tracking...\n",
193
+ "[codecarbon WARNING @ 04:20:20] No CPU tracking mode found. Falling back on CPU constant mode.\n",
194
+ "[codecarbon WARNING @ 04:20:21] We saw that you have a Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz but we don't know it. Please contact us.\n",
195
+ "[codecarbon INFO @ 04:20:21] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\n",
196
+ "[codecarbon INFO @ 04:20:21] >>> Tracker's metadata:\n",
197
+ "[codecarbon INFO @ 04:20:21] Platform system: Linux-5.15.0-1040-azure-x86_64-with-glibc2.10\n",
198
+ "[codecarbon INFO @ 04:20:21] Python version: 3.8.5\n",
199
+ "[codecarbon INFO @ 04:20:21] CodeCarbon version: 2.3.3\n",
200
+ "[codecarbon INFO @ 04:20:21] Available RAM : 440.883 GB\n",
201
+ "[codecarbon INFO @ 04:20:21] CPU count: 24\n",
202
+ "[codecarbon INFO @ 04:20:21] CPU model: Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\n",
203
+ "[codecarbon INFO @ 04:20:21] GPU count: 4\n",
204
+ "[codecarbon INFO @ 04:20:21] GPU model: 4 x Tesla V100-PCIE-16GB\n",
205
+ "[codecarbon WARNING @ 04:20:21] Cloud provider 'azure' do not publish electricity carbon intensity. Using country value instead.\n"
206
+ ]
207
+ }
208
+ ],
209
+ "source": [
210
+ "import pandas as pd\n",
211
+ "import numpy as np\n",
212
+ "import torch\n",
213
+ "import os\n",
214
+ "from typing import List, Union\n",
215
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline\n",
216
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
217
+ "import shap\n",
218
+ "import wandb\n",
219
+ "import evaluate\n",
220
+ "from codecarbon import EmissionsTracker\n",
221
+ "import logging\n",
222
+ "\n",
223
+ "wandb.finish()\n",
224
+ "\n",
225
+ "logging.getLogger('codecarbon').propagate = False\n",
226
+ "\n",
227
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
228
+ "tracker = EmissionsTracker()\n",
229
+ "\n",
230
+ "%load_ext watermark"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 5,
236
+ "metadata": {
237
+ "collapsed": false,
238
+ "gather": {
239
+ "logged": 1706486372304
240
+ },
241
+ "jupyter": {
242
+ "outputs_hidden": false
243
+ }
244
+ },
245
+ "outputs": [],
246
+ "source": [
247
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
248
+ "\n",
249
+ "SEED: int = 42\n",
250
+ "\n",
251
+ "BATCH_SIZE: int = 32\n",
252
+ "EPOCHS: int = 5\n",
253
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
254
+ "\n",
255
+ "# WandB configuration\n",
256
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
257
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
258
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 6,
264
+ "metadata": {
265
+ "collapsed": false,
266
+ "jupyter": {
267
+ "outputs_hidden": false
268
+ }
269
+ },
270
+ "outputs": [
271
+ {
272
+ "name": "stdout",
273
+ "output_type": "stream",
274
+ "text": [
275
+ "re : 2.2.1\n",
276
+ "pandas : 2.0.2\n",
277
+ "evaluate: 0.4.1\n",
278
+ "logging : 0.5.1.2\n",
279
+ "torch : 1.12.0\n",
280
+ "shap : 0.44.1\n",
281
+ "wandb : 0.16.2\n",
282
+ "numpy : 1.23.5\n",
283
+ "\n"
284
+ ]
285
+ }
286
+ ],
287
+ "source": [
288
+ "%watermark --iversion"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": 7,
294
+ "metadata": {
295
+ "datalore": {
296
+ "hide_input_from_viewers": true,
297
+ "hide_output_from_viewers": true,
298
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
299
+ "type": "CODE"
300
+ }
301
+ },
302
+ "outputs": [
303
+ {
304
+ "name": "stdout",
305
+ "output_type": "stream",
306
+ "text": [
307
+ "/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
308
+ "Mon Jan 29 04:20:46 2024 \n",
309
+ "+---------------------------------------------------------------------------------------+\n",
310
+ "| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\n",
311
+ "|-----------------------------------------+----------------------+----------------------+\n",
312
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
313
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
314
+ "| | | MIG M. |\n",
315
+ "|=========================================+======================+======================|\n",
316
+ "| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\n",
317
+ "| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
318
+ "| | | N/A |\n",
319
+ "+-----------------------------------------+----------------------+----------------------+\n",
320
+ "| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\n",
321
+ "| N/A 25C P0 23W / 250W | 4MiB / 16384MiB | 0% Default |\n",
322
+ "| | | N/A |\n",
323
+ "+-----------------------------------------+----------------------+----------------------+\n",
324
+ "| 2 Tesla V100-PCIE-16GB Off | 00000003:00:00.0 Off | Off |\n",
325
+ "| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
326
+ "| | | N/A |\n",
327
+ "+-----------------------------------------+----------------------+----------------------+\n",
328
+ "| 3 Tesla V100-PCIE-16GB Off | 00000004:00:00.0 Off | Off |\n",
329
+ "| N/A 27C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\n",
330
+ "| | | N/A |\n",
331
+ "+-----------------------------------------+----------------------+----------------------+\n",
332
+ " \n",
333
+ "+---------------------------------------------------------------------------------------+\n",
334
+ "| Processes: |\n",
335
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
336
+ "| ID ID Usage |\n",
337
+ "|=======================================================================================|\n",
338
+ "| No running processes found |\n",
339
+ "+---------------------------------------------------------------------------------------+\n"
340
+ ]
341
+ }
342
+ ],
343
+ "source": [
344
+ "!nvidia-smi"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "markdown",
349
+ "metadata": {
350
+ "datalore": {
351
+ "hide_input_from_viewers": false,
352
+ "hide_output_from_viewers": false,
353
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
354
+ "report_properties": {
355
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
356
+ },
357
+ "type": "MD"
358
+ }
359
+ },
360
+ "source": [
361
+ "## Loading the data set"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": 8,
367
+ "metadata": {
368
+ "collapsed": false,
369
+ "gather": {
370
+ "logged": 1706486373931
371
+ },
372
+ "jupyter": {
373
+ "outputs_hidden": false
374
+ }
375
+ },
376
+ "outputs": [],
377
+ "source": [
378
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": 9,
384
+ "metadata": {
385
+ "collapsed": false,
386
+ "gather": {
387
+ "logged": 1706486374218
388
+ },
389
+ "jupyter": {
390
+ "outputs_hidden": false,
391
+ "source_hidden": false
392
+ },
393
+ "nteract": {
394
+ "transient": {
395
+ "deleting": false
396
+ }
397
+ }
398
+ },
399
+ "outputs": [
400
+ {
401
+ "data": {
402
+ "text/plain": [
403
+ "DatasetDict({\n",
404
+ " train: Dataset({\n",
405
+ " features: ['id', 'text', 'label'],\n",
406
+ " num_rows: 1270444\n",
407
+ " })\n",
408
+ " test: Dataset({\n",
409
+ " features: ['id', 'text', 'label'],\n",
410
+ " num_rows: 272238\n",
411
+ " })\n",
412
+ " val: Dataset({\n",
413
+ " features: ['id', 'text', 'label'],\n",
414
+ " num_rows: 272238\n",
415
+ " })\n",
416
+ "})"
417
+ ]
418
+ },
419
+ "execution_count": 9,
420
+ "metadata": {},
421
+ "output_type": "execute_result"
422
+ }
423
+ ],
424
+ "source": [
425
+ "dataset"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": 10,
431
+ "metadata": {
432
+ "gather": {
433
+ "logged": 1706486374480
434
+ }
435
+ },
436
+ "outputs": [],
437
+ "source": [
438
+ "SUBSAMPLING = 1.0\n",
439
+ "\n",
440
+ "if SUBSAMPLING < 1:\n",
441
+ " _ = DatasetDict()\n",
442
+ " for each in dataset.keys():\n",
443
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
444
+ "\n",
445
+ " dataset = _"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "markdown",
450
+ "metadata": {},
451
+ "source": [
452
+ "## Tokenisation and encoding"
453
+ ]
454
+ },
455
+ {
456
+ "cell_type": "code",
457
+ "execution_count": 11,
458
+ "metadata": {
459
+ "gather": {
460
+ "logged": 1706486375030
461
+ }
462
+ },
463
+ "outputs": [],
464
+ "source": [
465
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
466
+ " return ds_enc"
467
+ ]
468
+ },
469
+ {
470
+ "cell_type": "markdown",
471
+ "metadata": {},
472
+ "source": [
473
+ "## Evaluation metrics"
474
+ ]
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "execution_count": 12,
479
+ "metadata": {
480
+ "gather": {
481
+ "logged": 1706486375197
482
+ }
483
+ },
484
+ "outputs": [],
485
+ "source": [
486
+ "accuracy = evaluate.load(\"accuracy\")\n",
487
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
488
+ "f1 = evaluate.load(\"f1\")"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": 13,
494
+ "metadata": {
495
+ "gather": {
496
+ "logged": 1706486375361
497
+ }
498
+ },
499
+ "outputs": [],
500
+ "source": [
501
+ "def compute_metrics(eval_pred):\n",
502
+ " predictions, labels = eval_pred\n",
503
+ " predictions = np.argmax(predictions, axis=1)\n",
504
+ " return {\n",
505
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
506
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
507
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
508
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
509
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
510
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
511
+ " }"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "markdown",
516
+ "metadata": {},
517
+ "source": [
518
+ "## Training"
519
+ ]
520
+ },
521
+ {
522
+ "cell_type": "markdown",
523
+ "metadata": {},
524
+ "source": [
525
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "code",
530
+ "execution_count": 14,
531
+ "metadata": {
532
+ "gather": {
533
+ "logged": 1706486375569
534
+ }
535
+ },
536
+ "outputs": [],
537
+ "source": [
538
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
539
+ ]
540
+ },
541
+ {
542
+ "cell_type": "code",
543
+ "execution_count": 15,
544
+ "metadata": {
545
+ "gather": {
546
+ "logged": 1706486433708
547
+ }
548
+ },
549
+ "outputs": [
550
+ {
551
+ "name": "stderr",
552
+ "output_type": "stream",
553
+ "text": [
554
+ "Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1270444/1270444 [08:09<00:00, 2595.90 examples/s]\n",
555
+ "Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 272238/272238 [01:45<00:00, 2585.25 examples/s]\n",
556
+ "Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████���█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 272238/272238 [01:44<00:00, 2605.66 examples/s]\n"
557
+ ]
558
+ }
559
+ ],
560
+ "source": [
561
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
562
+ "\n",
563
+ "cols = dataset[\"train\"].column_names\n",
564
+ "cols.remove(\"label\")\n",
565
+ "ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True), batched=True, remove_columns=cols)\n"
566
+ ]
567
+ },
568
+ {
569
+ "cell_type": "code",
570
+ "execution_count": 16,
571
+ "metadata": {},
572
+ "outputs": [
573
+ {
574
+ "name": "stderr",
575
+ "output_type": "stream",
576
+ "text": [
577
+ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
578
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
579
+ ]
580
+ }
581
+ ],
582
+ "source": [
583
+ "\n",
584
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
585
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
586
+ " id2label=label_map, \n",
587
+ " label2id={v:k for k,v in label_map.items()})\n",
588
+ "\n",
589
+ "args = TrainingArguments(\n",
590
+ " output_dir=\"vaers\",\n",
591
+ " evaluation_strategy=\"epoch\",\n",
592
+ " save_strategy=\"epoch\",\n",
593
+ " learning_rate=2e-5,\n",
594
+ " per_device_train_batch_size=BATCH_SIZE,\n",
595
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
596
+ " num_train_epochs=EPOCHS,\n",
597
+ " weight_decay=.01,\n",
598
+ " logging_steps=1,\n",
599
+ " load_best_model_at_end=True,\n",
600
+ " run_name=f\"daedra-training\",\n",
601
+ " report_to=[\"wandb\"])\n",
602
+ "\n",
603
+ "trainer = Trainer(\n",
604
+ " model=model,\n",
605
+ " args=args,\n",
606
+ " train_dataset=ds_enc[\"train\"],\n",
607
+ " eval_dataset=ds_enc[\"test\"],\n",
608
+ " tokenizer=tokenizer,\n",
609
+ " compute_metrics=compute_metrics)"
610
+ ]
611
+ },
612
+ {
613
+ "cell_type": "code",
614
+ "execution_count": 17,
615
+ "metadata": {
616
+ "gather": {
617
+ "logged": 1706486444806
618
+ }
619
+ },
620
+ "outputs": [
621
+ {
622
+ "name": "stderr",
623
+ "output_type": "stream",
624
+ "text": [
625
+ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchrisvoncsefalvay\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
626
+ "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m wandb.init() arguments ignored because wandb magic has already been initialized\n"
627
+ ]
628
+ },
629
+ {
630
+ "data": {
631
+ "text/html": [
632
+ "Tracking run with wandb version 0.16.2"
633
+ ],
634
+ "text/plain": [
635
+ "<IPython.core.display.HTML object>"
636
+ ]
637
+ },
638
+ "metadata": {},
639
+ "output_type": "display_data"
640
+ },
641
+ {
642
+ "data": {
643
+ "text/html": [
644
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240129_043232-tl59png2</code>"
645
+ ],
646
+ "text/plain": [
647
+ "<IPython.core.display.HTML object>"
648
+ ]
649
+ },
650
+ "metadata": {},
651
+ "output_type": "display_data"
652
+ },
653
+ {
654
+ "data": {
655
+ "text/html": [
656
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/tl59png2' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
657
+ ],
658
+ "text/plain": [
659
+ "<IPython.core.display.HTML object>"
660
+ ]
661
+ },
662
+ "metadata": {},
663
+ "output_type": "display_data"
664
+ },
665
+ {
666
+ "data": {
667
+ "text/html": [
668
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
669
+ ],
670
+ "text/plain": [
671
+ "<IPython.core.display.HTML object>"
672
+ ]
673
+ },
674
+ "metadata": {},
675
+ "output_type": "display_data"
676
+ },
677
+ {
678
+ "data": {
679
+ "text/html": [
680
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/tl59png2' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/tl59png2</a>"
681
+ ],
682
+ "text/plain": [
683
+ "<IPython.core.display.HTML object>"
684
+ ]
685
+ },
686
+ "metadata": {},
687
+ "output_type": "display_data"
688
+ },
689
+ {
690
+ "data": {
691
+ "text/html": [
692
+ "Finishing last run (ID:tl59png2) before initializing another..."
693
+ ],
694
+ "text/plain": [
695
+ "<IPython.core.display.HTML object>"
696
+ ]
697
+ },
698
+ "metadata": {},
699
+ "output_type": "display_data"
700
+ },
701
+ {
702
+ "data": {
703
+ "text/html": [
704
+ " View run <strong style=\"color:#cdcd00\">daedra_training_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/tl59png2' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/tl59png2</a><br/> View job at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v0' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v0</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
705
+ ],
706
+ "text/plain": [
707
+ "<IPython.core.display.HTML object>"
708
+ ]
709
+ },
710
+ "metadata": {},
711
+ "output_type": "display_data"
712
+ },
713
+ {
714
+ "data": {
715
+ "text/html": [
716
+ "Find logs at: <code>./wandb/run-20240129_043232-tl59png2/logs</code>"
717
+ ],
718
+ "text/plain": [
719
+ "<IPython.core.display.HTML object>"
720
+ ]
721
+ },
722
+ "metadata": {},
723
+ "output_type": "display_data"
724
+ },
725
+ {
726
+ "data": {
727
+ "text/html": [
728
+ "Successfully finished last run (ID:tl59png2). Initializing new run:<br/>"
729
+ ],
730
+ "text/plain": [
731
+ "<IPython.core.display.HTML object>"
732
+ ]
733
+ },
734
+ "metadata": {},
735
+ "output_type": "display_data"
736
+ },
737
+ {
738
+ "data": {
739
+ "text/html": [
740
+ "Tracking run with wandb version 0.16.2"
741
+ ],
742
+ "text/plain": [
743
+ "<IPython.core.display.HTML object>"
744
+ ]
745
+ },
746
+ "metadata": {},
747
+ "output_type": "display_data"
748
+ },
749
+ {
750
+ "data": {
751
+ "text/html": [
752
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240129_043243-x8j2xw0x</code>"
753
+ ],
754
+ "text/plain": [
755
+ "<IPython.core.display.HTML object>"
756
+ ]
757
+ },
758
+ "metadata": {},
759
+ "output_type": "display_data"
760
+ },
761
+ {
762
+ "data": {
763
+ "text/html": [
764
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/x8j2xw0x' target=\"_blank\">daedra_training_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
765
+ ],
766
+ "text/plain": [
767
+ "<IPython.core.display.HTML object>"
768
+ ]
769
+ },
770
+ "metadata": {},
771
+ "output_type": "display_data"
772
+ },
773
+ {
774
+ "data": {
775
+ "text/html": [
776
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
777
+ ],
778
+ "text/plain": [
779
+ "<IPython.core.display.HTML object>"
780
+ ]
781
+ },
782
+ "metadata": {},
783
+ "output_type": "display_data"
784
+ },
785
+ {
786
+ "data": {
787
+ "text/html": [
788
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/x8j2xw0x' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/x8j2xw0x</a>"
789
+ ],
790
+ "text/plain": [
791
+ "<IPython.core.display.HTML object>"
792
+ ]
793
+ },
794
+ "metadata": {},
795
+ "output_type": "display_data"
796
+ },
797
+ {
798
+ "data": {
799
+ "text/html": [
800
+ "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/x8j2xw0x?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
801
+ ],
802
+ "text/plain": [
803
+ "<wandb.sdk.wandb_run.Run at 0x7ffa3d0e9bb0>"
804
+ ]
805
+ },
806
+ "execution_count": 17,
807
+ "metadata": {},
808
+ "output_type": "execute_result"
809
+ }
810
+ ],
811
+ "source": [
812
+ "if SUBSAMPLING != 1.0:\n",
813
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
814
+ "else:\n",
815
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
816
+ "\n",
817
+ "wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
818
+ "wandb_tag.append(f\"base:{model_ckpt}\")\n",
819
+ " \n",
820
+ "wandb.init(name=\"daedra_training_run\", tags=wandb_tag, magic=True)"
821
+ ]
822
+ },
823
+ {
824
+ "cell_type": "code",
825
+ "execution_count": 18,
826
+ "metadata": {
827
+ "gather": {
828
+ "logged": 1706486541798
829
+ }
830
+ },
831
+ "outputs": [
832
+ {
833
+ "name": "stderr",
834
+ "output_type": "stream",
835
+ "text": [
836
+ "Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"
837
+ ]
838
+ },
839
+ {
840
+ "data": {
841
+ "text/html": [
842
+ "\n",
843
+ " <div>\n",
844
+ " \n",
845
+ " <progress value='394' max='49630' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
846
+ " [ 394/49630 04:13 < 8:50:52, 1.55 it/s, Epoch 0.04/5]\n",
847
+ " </div>\n",
848
+ " <table border=\"1\" class=\"dataframe\">\n",
849
+ " <thead>\n",
850
+ " <tr style=\"text-align: left;\">\n",
851
+ " <th>Epoch</th>\n",
852
+ " <th>Training Loss</th>\n",
853
+ " <th>Validation Loss</th>\n",
854
+ " </tr>\n",
855
+ " </thead>\n",
856
+ " <tbody>\n",
857
+ " </tbody>\n",
858
+ "</table><p>"
859
+ ],
860
+ "text/plain": [
861
+ "<IPython.core.display.HTML object>"
862
+ ]
863
+ },
864
+ "metadata": {},
865
+ "output_type": "display_data"
866
+ },
867
+ {
868
+ "name": "stderr",
869
+ "output_type": "stream",
870
+ "text": [
871
+ "[codecarbon INFO @ 04:33:12] Energy consumed for RAM : 0.000689 kWh. RAM Power : 165.33123922348022 W\n",
872
+ "[codecarbon INFO @ 04:33:12] Energy consumed for all GPUs : 0.001450 kWh. Total GPU Power : 347.66451200921796 W\n",
873
+ "[codecarbon INFO @ 04:33:12] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W\n",
874
+ "[codecarbon INFO @ 04:33:12] 0.002317 kWh of electricity used since the beginning.\n",
875
+ "[codecarbon INFO @ 04:33:27] Energy consumed for RAM : 0.001378 kWh. RAM Power : 165.33123922348022 W\n",
876
+ "[codecarbon INFO @ 04:33:27] Energy consumed for all GPUs : 0.004012 kWh. Total GPU Power : 615.4556826768763 W\n",
877
+ "[codecarbon INFO @ 04:33:27] Energy consumed for all CPUs : 0.000355 kWh. Total CPU Power : 42.5 W\n",
878
+ "[codecarbon INFO @ 04:33:27] 0.005745 kWh of electricity used since the beginning.\n",
879
+ "[codecarbon INFO @ 04:33:42] Energy consumed for RAM : 0.002066 kWh. RAM Power : 165.33123922348022 W\n",
880
+ "[codecarbon INFO @ 04:33:42] Energy consumed for all GPUs : 0.006596 kWh. Total GPU Power : 620.9110211178034 W\n",
881
+ "[codecarbon INFO @ 04:33:42] Energy consumed for all CPUs : 0.000532 kWh. Total CPU Power : 42.5 W\n",
882
+ "[codecarbon INFO @ 04:33:42] 0.009194 kWh of electricity used since the beginning.\n",
883
+ "[codecarbon INFO @ 04:33:57] Energy consumed for RAM : 0.002754 kWh. RAM Power : 165.33123922348022 W\n",
884
+ "[codecarbon INFO @ 04:33:57] Energy consumed for all GPUs : 0.009183 kWh. Total GPU Power : 621.1270289526989 W\n",
885
+ "[codecarbon INFO @ 04:33:57] Energy consumed for all CPUs : 0.000709 kWh. Total CPU Power : 42.5 W\n",
886
+ "[codecarbon INFO @ 04:33:57] 0.012645 kWh of electricity used since the beginning.\n",
887
+ "[codecarbon INFO @ 04:34:12] Energy consumed for RAM : 0.003442 kWh. RAM Power : 165.33123922348022 W\n",
888
+ "[codecarbon INFO @ 04:34:12] Energy consumed for all GPUs : 0.011798 kWh. Total GPU Power : 628.3875606622404 W\n",
889
+ "[codecarbon INFO @ 04:34:12] Energy consumed for all CPUs : 0.000886 kWh. Total CPU Power : 42.5 W\n",
890
+ "[codecarbon INFO @ 04:34:12] 0.016125 kWh of electricity used since the beginning.\n",
891
+ "[codecarbon INFO @ 04:34:27] Energy consumed for RAM : 0.004130 kWh. RAM Power : 165.33123922348022 W\n",
892
+ "[codecarbon INFO @ 04:34:27] Energy consumed for all GPUs : 0.014431 kWh. Total GPU Power : 632.4054645127197 W\n",
893
+ "[codecarbon INFO @ 04:34:27] Energy consumed for all CPUs : 0.001063 kWh. Total CPU Power : 42.5 W\n",
894
+ "[codecarbon INFO @ 04:34:27] 0.019623 kWh of electricity used since the beginning.\n",
895
+ "[codecarbon INFO @ 04:34:42] Energy consumed for RAM : 0.004818 kWh. RAM Power : 165.33123922348022 W\n",
896
+ "[codecarbon INFO @ 04:34:42] Energy consumed for all GPUs : 0.017064 kWh. Total GPU Power : 632.6571124342939 W\n",
897
+ "[codecarbon INFO @ 04:34:42] Energy consumed for all CPUs : 0.001240 kWh. Total CPU Power : 42.5 W\n",
898
+ "[codecarbon INFO @ 04:34:42] 0.023122 kWh of electricity used since the beginning.\n",
899
+ "[codecarbon INFO @ 04:34:57] Energy consumed for RAM : 0.005506 kWh. RAM Power : 165.33123922348022 W\n",
900
+ "[codecarbon INFO @ 04:34:57] Energy consumed for all GPUs : 0.019707 kWh. Total GPU Power : 634.7921879339333 W\n",
901
+ "[codecarbon INFO @ 04:34:57] Energy consumed for all CPUs : 0.001417 kWh. Total CPU Power : 42.5 W\n",
902
+ "[codecarbon INFO @ 04:34:57] 0.026631 kWh of electricity used since the beginning.\n",
903
+ "[codecarbon INFO @ 04:35:12] Energy consumed for RAM : 0.006194 kWh. RAM Power : 165.33123922348022 W\n",
904
+ "[codecarbon INFO @ 04:35:12] Energy consumed for all GPUs : 0.022334 kWh. Total GPU Power : 630.3609394863598 W\n",
905
+ "[codecarbon INFO @ 04:35:12] Energy consumed for all CPUs : 0.001594 kWh. Total CPU Power : 42.5 W\n",
906
+ "[codecarbon INFO @ 04:35:12] 0.030123 kWh of electricity used since the beginning.\n",
907
+ "[codecarbon INFO @ 04:35:27] Energy consumed for RAM : 0.006882 kWh. RAM Power : 165.33123922348022 W\n",
908
+ "[codecarbon INFO @ 04:35:27] Energy consumed for all GPUs : 0.024956 kWh. Total GPU Power : 630.704729336156 W\n",
909
+ "[codecarbon INFO @ 04:35:27] Energy consumed for all CPUs : 0.001771 kWh. Total CPU Power : 42.5 W\n",
910
+ "[codecarbon INFO @ 04:35:27] 0.033609 kWh of electricity used since the beginning.\n",
911
+ "[codecarbon INFO @ 04:35:42] Energy consumed for RAM : 0.007570 kWh. RAM Power : 165.33123922348022 W\n",
912
+ "[codecarbon INFO @ 04:35:42] Energy consumed for all GPUs : 0.027604 kWh. Total GPU Power : 636.1545465788125 W\n",
913
+ "[codecarbon INFO @ 04:35:42] Energy consumed for all CPUs : 0.001948 kWh. Total CPU Power : 42.5 W\n",
914
+ "[codecarbon INFO @ 04:35:42] 0.037121 kWh of electricity used since the beginning.\n",
915
+ "[codecarbon INFO @ 04:35:57] Energy consumed for RAM : 0.008258 kWh. RAM Power : 165.33123922348022 W\n",
916
+ "[codecarbon INFO @ 04:35:57] Energy consumed for all GPUs : 0.030255 kWh. Total GPU Power : 636.9769106141198 W\n",
917
+ "[codecarbon INFO @ 04:35:57] Energy consumed for all CPUs : 0.002125 kWh. Total CPU Power : 42.5 W\n",
918
+ "[codecarbon INFO @ 04:35:57] 0.040638 kWh of electricity used since the beginning.\n",
919
+ "[codecarbon INFO @ 04:36:12] Energy consumed for RAM : 0.008946 kWh. RAM Power : 165.33123922348022 W\n",
920
+ "[codecarbon INFO @ 04:36:12] Energy consumed for all GPUs : 0.032913 kWh. Total GPU Power : 638.3412890613937 W\n",
921
+ "[codecarbon INFO @ 04:36:12] Energy consumed for all CPUs : 0.002302 kWh. Total CPU Power : 42.5 W\n",
922
+ "[codecarbon INFO @ 04:36:12] 0.044161 kWh of electricity used since the beginning.\n",
923
+ "[codecarbon INFO @ 04:36:27] Energy consumed for RAM : 0.009634 kWh. RAM Power : 165.33123922348022 W\n",
924
+ "[codecarbon INFO @ 04:36:27] Energy consumed for all GPUs : 0.035515 kWh. Total GPU Power : 625.0502398771333 W\n",
925
+ "[codecarbon INFO @ 04:36:27] Energy consumed for all CPUs : 0.002479 kWh. Total CPU Power : 42.5 W\n",
926
+ "[codecarbon INFO @ 04:36:27] 0.047628 kWh of electricity used since the beginning.\n",
927
+ "[codecarbon INFO @ 04:36:42] Energy consumed for RAM : 0.010322 kWh. RAM Power : 165.33123922348022 W\n",
928
+ "[codecarbon INFO @ 04:36:42] Energy consumed for all GPUs : 0.038183 kWh. Total GPU Power : 641.00719087638 W\n",
929
+ "[codecarbon INFO @ 04:36:42] Energy consumed for all CPUs : 0.002656 kWh. Total CPU Power : 42.5 W\n",
930
+ "[codecarbon INFO @ 04:36:42] 0.051162 kWh of electricity used since the beginning.\n",
931
+ "[codecarbon INFO @ 04:36:57] Energy consumed for RAM : 0.011010 kWh. RAM Power : 165.33123922348022 W\n",
932
+ "[codecarbon INFO @ 04:36:57] Energy consumed for all GPUs : 0.040821 kWh. Total GPU Power : 633.4817689949092 W\n",
933
+ "[codecarbon INFO @ 04:36:57] Energy consumed for all CPUs : 0.002834 kWh. Total CPU Power : 42.5 W\n",
934
+ "[codecarbon INFO @ 04:36:57] 0.054665 kWh of electricity used since the beginning.\n",
935
+ "[codecarbon INFO @ 04:37:12] Energy consumed for RAM : 0.011698 kWh. RAM Power : 165.33123922348022 W\n",
936
+ "[codecarbon INFO @ 04:37:12] Energy consumed for all GPUs : 0.043484 kWh. Total GPU Power : 639.8452880027475 W\n",
937
+ "[codecarbon INFO @ 04:37:12] Energy consumed for all CPUs : 0.003011 kWh. Total CPU Power : 42.5 W\n",
938
+ "[codecarbon INFO @ 04:37:12] 0.058193 kWh of electricity used since the beginning.\n"
939
+ ]
940
+ }
941
+ ],
942
+ "source": [
+ "tracker.start()\n",
+ "trainer.train()\n",
+ "tracker.stop()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1706486541918
+ }
+ },
+ "outputs": [],
+ "source": [
+ "wandb.finish()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "gather": {
+ "logged": 1706486541928
+ }
+ },
+ "outputs": [],
+ "source": [
+ "variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
+ "tokenizer._tokenizer.save(\"tokenizer.json\")\n",
+ "tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
+ "sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
+ "\n",
+ "model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
+ " variant=variant,\n",
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,})\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
+ "tokenizer._tokenizer.save(\"tokenizer.json\")\n",
+ "tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
+ "sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
+ "\n",
+ "model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
+ " variant=variant,\n",
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,})\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "datalore": {
+ "base_environment": "default",
+ "computation_mode": "JUPYTER",
+ "package_manager": "pip",
+ "packages": [
+ {
+ "name": "datasets",
+ "source": "PIP",
+ "version": "2.16.1"
+ },
+ {
+ "name": "torch",
+ "source": "PIP",
+ "version": "2.1.2"
+ },
+ {
+ "name": "accelerate",
+ "source": "PIP",
+ "version": "0.26.1"
+ }
+ ],
+ "report_row_ids": [
+ "un8W7ez7ZwoGb5Co6nydEV",
+ "40nN9Hvgi1clHNV5RAemI5",
+ "TgRD90H5NSPpKS41OeXI1w",
+ "ZOm5BfUs3h1EGLaUkBGeEB",
+ "kOP0CZWNSk6vqE3wkPp7Vc",
+ "W4PWcOu2O2pRaZyoE2W80h",
+ "RolbOnQLIftk0vy9mIcz5M",
+ "8OPhUgbaNJmOdiq5D3a6vK",
+ "5Qrt3jSvSrpK6Ne1hS6shL",
+ "hTq7nFUrovN5Ao4u6dIYWZ",
+ "I8WNZLpJ1DVP2wiCW7YBIB",
+ "SawhU3I9BewSE1XBPstpNJ",
+ "80EtLEl2FIE4FqbWnUD3nT"
+ ],
+ "version": 3
+ },
+ "kernel_info": {
+ "name": "python38-azureml-pt-tf"
+ },
+ "kernelspec": {
+ "display_name": "azureml_py38_PT_TF",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ },
+ "microsoft": {
+ "host": {
+ "AzureML": {
+ "notebookHasBeenCompleted": true
+ }
+ },
+ "ms_spell_check": {
+ "ms_spell_check_language": "en"
+ }
+ },
+ "nteract": {
+ "version": "nteract-front-end@1.0.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
notebooks/.ipynb_aml_checkpoints/DAEDRA-checkpoint2024-0-30-21-44-8Z.ipynb ADDED
@@ -0,0 +1,671 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
+ "\n",
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%pip install accelerate -U"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\nRequirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\nRequirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\nRequirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\nRequirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\nRequirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\nRequirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\nRequirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\nNote: you may need to restart the kernel to use updated packages.\n"
+ }
+ ],
+ "execution_count": 1,
+ "metadata": {
+ "gather": {
+ "logged": 1706475754655
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ },
+ "tags": []
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%pip install transformers datasets shap watermark wandb evaluate codecarbon"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\nRequirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\nRequirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\nRequirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\nRequirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\nRequirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\nRequirement already satisfied: codecarbon in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.3.3)\nRequirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\nRequirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\nRequirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\nRequirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\nRequirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\nRequirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\nRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\nRequirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\nRequirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\nRequirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\nRequirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\nRequirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\nRequirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\nRequirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\nRequirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\nRequirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\nRequirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\nRequirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\nRequirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\nRequirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\nRequirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\nRequirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\nRequirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\nRequirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\nRequirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\nRequirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\nRequirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\nRequirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\nRequirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\nRequirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\nRequirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\nRequirement already satisfied: prometheus-client in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (0.19.0)\nRequirement already satisfied: py-cpuinfo in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (9.0.0)\nRequirement already satisfied: rapidfuzz in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (3.6.1)\nRequirement already satisfied: pynvml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (11.5.0)\nRequirement already satisfied: arrow in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (1.3.0)\nRequirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\nRequirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\nRequirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\nRequirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\nRequirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\nRequirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\nRequirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\nRequirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\nRequirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\nRequirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\nRequirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\nRequirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\nRequirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\nRequirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\nRequirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\nRequirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\nRequirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\nRequirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\nRequirement already satisfied: types-python-dateutil>=2.8.10 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.19.20240106)\nRequirement already satisfied: python-dateutil>=2.7.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.2)\nRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\nRequirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\nRequirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\nRequirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\nRequirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\nRequirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\nRequirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\nRequirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\nRequirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\nRequirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\nNote: you may need to restart the kernel to use updated packages.\n"
+ }
+ ],
+ "execution_count": 2,
+ "metadata": {
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "import os\n",
+ "from typing import List, Union\n",
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline, AutoModel\n",
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
+ "import shap\n",
+ "import wandb\n",
+ "import evaluate\n",
+ "import logging\n",
+ "\n",
+ "wandb.finish()\n",
+ "\n",
+ "\n",
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+ "\n",
+ "%load_ext watermark"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\n2024-01-29 17:46:15.020290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\nTo enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n2024-01-29 17:46:16.031641: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n2024-01-29 17:46:16.031779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n2024-01-29 17:46:16.031793: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
+ }
+ ],
+ "execution_count": 3,
+ "metadata": {
+ "datalore": {
+ "hide_input_from_viewers": false,
+ "hide_output_from_viewers": false,
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
+ "report_properties": {
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
+ },
+ "type": "CODE"
+ },
+ "gather": {
+ "logged": 1706550378660
+ },
+ "tags": []
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
+ "\n",
+ "SEED: int = 42\n",
+ "\n",
+ "BATCH_SIZE: int = 32\n",
+ "EPOCHS: int = 5\n",
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
+ "\n",
+ "# WandB configuration\n",
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
+ ],
+ "outputs": [],
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false,
+ "gather": {
+ "logged": 1706550378812
+ },
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%watermark --iversion"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "shap : 0.44.1\npandas : 2.0.2\nwandb : 0.16.2\nre : 2.2.1\nevaluate: 0.4.1\ntorch : 1.12.0\nnumpy : 1.23.5\nlogging : 0.5.1.2\n\n"
+ }
+ ],
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!nvidia-smi"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "Mon Jan 29 17:46:18 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. |\r\n|=========================================+======================+======================|\r\n| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n| N/A 25C P0 23W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 2 Tesla V100-PCIE-16GB Off | 00000003:00:00.0 Off | Off |\r\n| N/A 25C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 3 Tesla V100-PCIE-16GB Off | 00000004:00:00.0 Off | Off |\r\n| N/A 27C P0 24W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n"
+ }
+ ],
+ "execution_count": 6,
+ "metadata": {
+ "datalore": {
+ "hide_input_from_viewers": true,
+ "hide_output_from_viewers": true,
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
+ "type": "CODE"
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Loading the data set"
+ ],
+ "metadata": {
+ "datalore": {
+ "hide_input_from_viewers": false,
+ "hide_output_from_viewers": false,
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
+ "report_properties": {
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
+ },
+ "type": "MD"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
+ ],
+ "outputs": [],
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false,
+ "gather": {
+ "logged": 1706550381141
+ },
+ "jupyter": {
+ "outputs_hidden": false
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "dataset"
+ ],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "execution_count": 8,
+ "data": {
+ "text/plain": "DatasetDict({\n train: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 1270444\n })\n test: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n val: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n})"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false,
+ "gather": {
+ "logged": 1706550381303
+ },
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "SUBSAMPLING = 0.01\n",
+ "\n",
+ "if SUBSAMPLING < 1:\n",
+ " _ = DatasetDict()\n",
+ " for each in dataset.keys():\n",
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
+ "\n",
+ " dataset = _"
+ ],
+ "outputs": [],
+ "execution_count": 9,
+ "metadata": {
+ "gather": {
+ "logged": 1706550381472
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Tokenisation and encoding"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
271
+ " return ds_enc"
272
+ ],
+ "outputs": [],
+ "execution_count": 10,
+ "metadata": {
+ "gather": {
+ "logged": 1706550381637
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Evaluation metrics"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "accuracy = evaluate.load(\"accuracy\")\n",
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
+ "f1 = evaluate.load(\"f1\")"
+ ],
+ "outputs": [],
+ "execution_count": 11,
+ "metadata": {
+ "gather": {
+ "logged": 1706550381778
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def compute_metrics(eval_pred):\n",
+ " predictions, labels = eval_pred\n",
+ " predictions = np.argmax(predictions, axis=1)\n",
+ " return {\n",
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
+ " }"
+ ],
+ "outputs": [],
+ "execution_count": 12,
+ "metadata": {
+ "gather": {
+ "logged": 1706550381891
+ }
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Training"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
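+ "# Map integer class ids to their string labels from the ClassLabel feature\n",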
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
344
+ ],
345
+ "outputs": [],
346
+ "execution_count": 13,
347
+ "metadata": {
348
+ "gather": {
349
+ "logged": 1706550382032
350
+ }
351
+ }
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "source": [
356
+ "def train_from_model(model_ckpt: str, push: bool = False):\n",
357
+ " print(f\"Initialising training based on {model_ckpt}...\")\n",
358
+ "\n",
359
+ " print(\"Tokenising...\")\n",
360
+ " tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
361
+ "\n",
362
+ " cols = dataset[\"train\"].column_names\n",
363
+ " cols.remove(\"label\")\n",
364
+ " ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True, max_length=512), batched=True, remove_columns=cols)\n",
365
+ "\n",
366
+ " print(\"Loading model...\")\n",
+ " try:\n",
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
+ " id2label=label_map, \n",
+ " label2id={v:k for k,v in label_map.items()})\n",
+ " except OSError:\n",
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
+ " id2label=label_map, \n",
+ " label2id={v:k for k,v in label_map.items()},\n",
+ " from_tf=True)\n",
+ "\n",
+ "\n",
+ " args = TrainingArguments(\n",
+ " output_dir=\"vaers\",\n",
+ " evaluation_strategy=\"epoch\",\n",
+ " save_strategy=\"epoch\",\n",
+ " learning_rate=2e-5,\n",
+ " per_device_train_batch_size=BATCH_SIZE,\n",
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
+ " num_train_epochs=EPOCHS,\n",
+ " weight_decay=.01,\n",
+ " logging_steps=1,\n",
+ " load_best_model_at_end=True,\n",
+ " run_name=f\"daedra-training\",\n",
+ " report_to=[\"wandb\"])\n",
+ "\n",
+ " trainer = Trainer(\n",
+ " model=model,\n",
+ " args=args,\n",
+ " train_dataset=ds_enc[\"train\"],\n",
+ " eval_dataset=ds_enc[\"test\"],\n",
+ " tokenizer=tokenizer,\n",
+ " compute_metrics=compute_metrics)\n",
+ " \n",
+ " if SUBSAMPLING != 1.0:\n",
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
+ " else:\n",
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
+ "\n",
+ " wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
+ " wandb_tag.append(f\"base:{model_ckpt}\")\n",
+ " \n",
+ " wandb.init(name=f\"daedra_{SUBSAMPLING}-{model_ckpt}\", tags=wandb_tag, magic=True)\n",
+ "\n",
+ " print(\"Starting training...\")\n",
+ "\n",
+ " trainer.train()\n",
+ "\n",
+ " print(\"Training finished.\")\n",
+ "\n",
+ " if push:\n",
+ " variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
+ " tokenizer._tokenizer.save(\"tokenizer.json\")\n",
+ " tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
+ " sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
+ "\n",
+ " model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
+ " variant=variant,\n",
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,}), based on {model_ckpt}\")"
+ ],
+ "outputs": [],
+ "execution_count": 14,
+ "metadata": {
+ "jupyter": {
+ "outputs_hidden": false,
+ "source_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ },
+ "gather": {
+ "logged": 1706550382160
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "\n",
+ "base_models = [\n",
+ " \"bert-base-uncased\",\n",
+ " \"distilbert-base-uncased\",\n",
+ "]"
+ ],
+ "outputs": [],
+ "execution_count": 15,
+ "metadata": {
+ "gather": {
+ "logged": 1706550382318
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "BATCH_SIZE=1\n",
+ "\n",
+ "train_from_model(\"biobert/Bio_ClinicalBERT/\")"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "Initialising training based on biobert/Bio_ClinicalBERT/...\nTokenising...\nLoading model...\n"
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": "Map: 100%|██████████| 2722/2722 [00:01<00:00, 2195.12 examples/s]\nAll TF 2.0 model weights were used when initializing BertForSequenceClassification.\n\nAll the weights of BertForSequenceClassification were initialized from the TF 2.0 model.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.\n"
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": "Finishing last run (ID:sg022tqh) before initializing another..."
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": " View run <strong style=\"color:#cdcd00\">daedra_0.01-biobert/Bio_ClinicalBERT/</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/sg022tqh' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/sg022tqh</a><br/> View job at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v6' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v6</a><br/>Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": "Find logs at: <code>./wandb/run-20240129_174816-sg022tqh/logs</code>"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": "Successfully finished last run (ID:sg022tqh). Initializing new run:<br/>"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": "Tracking run with wandb version 0.16.2"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240129_174936-kilkkg1j</code>"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kilkkg1j' target=\"_blank\">daedra_0.01-biobert/Bio_ClinicalBERT/</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kilkkg1j' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kilkkg1j</a>"
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "Starting training...\n"
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": "Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": "<IPython.core.display.HTML object>",
+ "text/html": "\n <div>\n \n <progress value='1496' max='15880' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [ 1496/15880 07:43 < 1:14:19, 3.23 it/s, Epoch 0.47/5]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
+ },
+ "metadata": {}
+ }
+ ],
+ "execution_count": 21,
+ "metadata": {
+ "gather": {
+ "logged": 1706551053473
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {
+ "jupyter": {
+ "source_hidden": false,
+ "outputs_hidden": false
+ },
+ "nteract": {
+ "transient": {
+ "deleting": false
+ }
+ }
+ }
+ }
+ ],
+ "metadata": {
+ "datalore": {
+ "base_environment": "default",
+ "computation_mode": "JUPYTER",
+ "package_manager": "pip",
+ "packages": [
+ {
+ "name": "datasets",
+ "source": "PIP",
+ "version": "2.16.1"
+ },
+ {
+ "name": "torch",
+ "source": "PIP",
+ "version": "2.1.2"
+ },
+ {
+ "name": "accelerate",
+ "source": "PIP",
+ "version": "0.26.1"
+ }
+ ],
+ "report_row_ids": [
+ "un8W7ez7ZwoGb5Co6nydEV",
+ "40nN9Hvgi1clHNV5RAemI5",
+ "TgRD90H5NSPpKS41OeXI1w",
+ "ZOm5BfUs3h1EGLaUkBGeEB",
+ "kOP0CZWNSk6vqE3wkPp7Vc",
+ "W4PWcOu2O2pRaZyoE2W80h",
+ "RolbOnQLIftk0vy9mIcz5M",
+ "8OPhUgbaNJmOdiq5D3a6vK",
+ "5Qrt3jSvSrpK6Ne1hS6shL",
+ "hTq7nFUrovN5Ao4u6dIYWZ",
+ "I8WNZLpJ1DVP2wiCW7YBIB",
+ "SawhU3I9BewSE1XBPstpNJ",
+ "80EtLEl2FIE4FqbWnUD3nT"
+ ],
+ "version": 3
+ },
+ "kernel_info": {
+ "name": "python38-azureml-pt-tf"
+ },
+ "kernelspec": {
+ "display_name": "azureml_py38_PT_TF",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.8.5",
+ "mimetype": "text/x-python",
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "pygments_lexer": "ipython3",
+ "nbconvert_exporter": "python",
+ "file_extension": ".py"
+ },
+ "microsoft": {
+ "host": {
+ "AzureML": {
+ "notebookHasBeenCompleted": true
+ }
+ },
+ "ms_spell_check": {
+ "ms_spell_check_language": "en"
+ }
+ },
+ "nteract": {
+ "version": "nteract-front-end@1.0.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
notebooks/.ipynb_aml_checkpoints/microsample_model_comparison-checkpoint2024-0-31-14-6-22Z.ipynb ADDED
File without changes
notebooks/DAEDRA-Copy1.ipynb ADDED
@@ -0,0 +1,1634 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
8
+ "\n",
9
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 1,
15
+ "metadata": {
16
+ "nteract": {
17
+ "transient": {
18
+ "deleting": false
19
+ }
20
+ },
21
+ "tags": []
22
+ },
23
+ "outputs": [
24
+ {
25
+ "name": "stdout",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\n",
29
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\n",
30
+ "Requirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\n",
31
+ "Requirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\n",
32
+ "Requirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\n",
33
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\n",
34
+ "Requirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\n",
35
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\n",
36
+ "Requirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\n",
37
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\n",
38
+ "Requirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\n",
39
+ "Requirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\n",
40
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\n",
41
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\n",
42
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\n",
43
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\n",
44
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\n",
45
+ "Note: you may need to restart the kernel to use updated packages.\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "# %pip install accelerate -U"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 2,
56
+ "metadata": {
57
+ "collapsed": true,
58
+ "jupyter": {
59
+ "outputs_hidden": true,
60
+ "source_hidden": false
61
+ },
62
+ "nteract": {
63
+ "transient": {
64
+ "deleting": false
65
+ }
66
+ }
67
+ },
68
+ "outputs": [
69
+ {
70
+ "name": "stdout",
71
+ "output_type": "stream",
72
+ "text": [
73
+ "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\n",
74
+ "Requirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\n",
75
+ "Requirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\n",
76
+ "Requirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\n",
77
+ "Requirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\n",
78
+ "Requirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\n",
79
+ "Requirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\n",
80
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\n",
81
+ "Requirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\n",
82
+ "Requirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\n",
83
+ "Requirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\n",
84
+ "Requirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\n",
85
+ "Requirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\n",
86
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\n",
87
+ "Requirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\n",
88
+ "Requirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\n",
89
+ "Requirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\n",
90
+ "Requirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\n",
91
+ "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\n",
92
+ "Requirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\n",
93
+ "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\n",
94
+ "Requirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\n",
95
+ "Requirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\n",
96
+ "Requirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\n",
97
+ "Requirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\n",
98
+ "Requirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\n",
99
+ "Requirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.0.7)\n",
100
+ "Requirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\n",
101
+ "Requirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\n",
102
+ "Requirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\n",
103
+ "Requirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\n",
104
+ "Requirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\n",
105
+ "Requirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\n",
106
+ "Requirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\n",
107
+ "Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\n",
108
+ "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\n",
109
+ "Requirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\n",
110
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\n",
111
+ "Requirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\n",
112
+ "Requirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\n",
113
+ "Requirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
114
+ "Requirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\n",
115
+ "Requirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\n",
116
+ "Requirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\n",
117
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\n",
118
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\n",
119
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\n",
120
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\n",
121
+ "Requirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\n",
122
+ "Requirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\n",
123
+ "Requirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\n",
124
+ "Requirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\n",
125
+ "Requirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\n",
126
+ "Requirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\n",
127
+ "Requirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\n",
128
+ "Requirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\n",
129
+ "Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\n",
130
+ "Requirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\n",
131
+ "Requirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\n",
132
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\n",
133
+ "Requirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\n",
134
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\n",
135
+ "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\n",
136
+ "Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\n",
137
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2.8.2)\n",
138
+ "Requirement already satisfied: tzdata>=2022.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
139
+ "Requirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\n",
140
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\n",
141
+ "Requirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\n",
142
+ "Requirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\n",
143
+ "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\n",
144
+ "Requirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\n",
145
+ "Requirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\n",
146
+ "Requirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\n",
147
+ "Requirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\n",
148
+ "Requirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\n",
149
+ "Note: you may need to restart the kernel to use updated packages.\n"
150
+ ]
151
+ }
152
+ ],
153
+ "source": [
154
+ "# %pip install transformers datasets shap watermark wandb"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 66,
160
+ "metadata": {
161
+ "datalore": {
162
+ "hide_input_from_viewers": false,
163
+ "hide_output_from_viewers": false,
164
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
165
+ "report_properties": {
166
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
167
+ },
168
+ "type": "CODE"
169
+ },
170
+ "gather": {
171
+ "logged": 1706449625034
172
+ },
173
+ "tags": []
174
+ },
175
+ "outputs": [
176
+ {
177
+ "name": "stdout",
178
+ "output_type": "stream",
179
+ "text": [
180
+ "The watermark extension is already loaded. To reload it, use:\n",
181
+ " %reload_ext watermark\n"
182
+ ]
183
+ }
184
+ ],
185
+ "source": [
186
+ "import pandas as pd\n",
187
+ "import numpy as np\n",
188
+ "import torch\n",
189
+ "import os\n",
190
+ "from typing import List\n",
191
+ "from sklearn.metrics import f1_score, accuracy_score, classification_report\n",
192
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline\n",
193
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
194
+ "from pyarrow import Table\n",
195
+ "import shap\n",
196
+ "import wandb\n",
197
+ "\n",
198
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
199
+ "\n",
200
+ "%load_ext watermark"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 43,
206
+ "metadata": {
207
+ "collapsed": false,
208
+ "gather": {
209
+ "logged": 1706449721319
210
+ },
211
+ "jupyter": {
212
+ "outputs_hidden": false
213
+ }
214
+ },
215
+ "outputs": [],
216
+ "source": [
217
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
218
+ "\n",
219
+ "SEED: int = 42\n",
220
+ "\n",
221
+ "BATCH_SIZE: int = 32\n",
222
+ "EPOCHS: int = 3\n",
223
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
224
+ "\n",
225
+ "CLASS_NAMES: List[str] = [\"DIED\",\n",
226
+ " \"ER_VISIT\",\n",
227
+ " \"HOSPITAL\",\n",
228
+ " \"OFC_VISIT\",\n",
229
+ " #\"X_STAY\", # pruned\n",
230
+ " #\"DISABLE\", # pruned\n",
231
+ " #\"D_PRESENTED\" # pruned\n",
232
+ " ]\n",
233
+ "\n",
234
+ "\n",
235
+ "\n",
236
+ "\n",
237
+ "# WandB configuration\n",
238
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA model training\" # name your W&B project\n",
239
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
240
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "code",
245
+ "execution_count": 44,
246
+ "metadata": {
247
+ "collapsed": false,
248
+ "jupyter": {
249
+ "outputs_hidden": false
250
+ }
251
+ },
252
+ "outputs": [
253
+ {
254
+ "name": "stdout",
255
+ "output_type": "stream",
256
+ "text": [
257
+ "shap : 0.44.1\n",
258
+ "torch : 1.12.0\n",
259
+ "logging: 0.5.1.2\n",
260
+ "numpy : 1.23.5\n",
261
+ "pandas : 2.0.2\n",
262
+ "re : 2.2.1\n",
263
+ "\n"
264
+ ]
265
+ }
266
+ ],
267
+ "source": [
268
+ "%watermark --iversion"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 45,
274
+ "metadata": {
275
+ "datalore": {
276
+ "hide_input_from_viewers": true,
277
+ "hide_output_from_viewers": true,
278
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
279
+ "type": "CODE"
280
+ }
281
+ },
282
+ "outputs": [
283
+ {
284
+ "name": "stdout",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "Sun Jan 28 13:54:22 2024 \n",
288
+ "+---------------------------------------------------------------------------------------+\n",
289
+ "| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\n",
290
+ "|-----------------------------------------+----------------------+----------------------+\n",
291
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
292
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
293
+ "| | | MIG M. |\n",
294
+ "|=========================================+======================+======================|\n",
295
+ "| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\n",
296
+ "| N/A 30C P0 38W / 250W | 12830MiB / 16384MiB | 0% Default |\n",
297
+ "| | | N/A |\n",
298
+ "+-----------------------------------------+----------------------+----------------------+\n",
299
+ "| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\n",
300
+ "| N/A 30C P0 38W / 250W | 11960MiB / 16384MiB | 0% Default |\n",
301
+ "| | | N/A |\n",
302
+ "+-----------------------------------------+----------------------+----------------------+\n",
303
+ " \n",
304
+ "+---------------------------------------------------------------------------------------+\n",
305
+ "| Processes: |\n",
306
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
307
+ "| ID ID Usage |\n",
308
+ "|=======================================================================================|\n",
309
+ "| 0 N/A N/A 11781 C .../envs/azureml_py38_PT_TF/bin/python 12826MiB |\n",
310
+ "| 1 N/A N/A 11781 C .../envs/azureml_py38_PT_TF/bin/python 11956MiB |\n",
311
+ "+---------------------------------------------------------------------------------------+\n"
312
+ ]
313
+ }
314
+ ],
315
+ "source": [
316
+ "!nvidia-smi"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "markdown",
321
+ "metadata": {
322
+ "datalore": {
323
+ "hide_input_from_viewers": false,
324
+ "hide_output_from_viewers": false,
325
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
326
+ "report_properties": {
327
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
328
+ },
329
+ "type": "MD"
330
+ }
331
+ },
332
+ "source": [
333
+ "## Loading the data set"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": 46,
339
+ "metadata": {
340
+ "collapsed": false,
341
+ "gather": {
342
+ "logged": 1706449040507
343
+ },
344
+ "jupyter": {
345
+ "outputs_hidden": false
346
+ }
347
+ },
348
+ "outputs": [],
349
+ "source": [
350
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": 47,
356
+ "metadata": {
357
+ "collapsed": false,
358
+ "gather": {
359
+ "logged": 1706449044205
360
+ },
361
+ "jupyter": {
362
+ "outputs_hidden": false,
363
+ "source_hidden": false
364
+ },
365
+ "nteract": {
366
+ "transient": {
367
+ "deleting": false
368
+ }
369
+ }
370
+ },
371
+ "outputs": [
372
+ {
373
+ "data": {
374
+ "text/plain": [
375
+ "DatasetDict({\n",
376
+ " train: Dataset({\n",
377
+ " features: ['id', 'text', 'labels'],\n",
378
+ " num_rows: 1270444\n",
379
+ " })\n",
380
+ " test: Dataset({\n",
381
+ " features: ['id', 'text', 'labels'],\n",
382
+ " num_rows: 272238\n",
383
+ " })\n",
384
+ " val: Dataset({\n",
385
+ " features: ['id', 'text', 'labels'],\n",
386
+ " num_rows: 272238\n",
387
+ " })\n",
388
+ "})"
389
+ ]
390
+ },
391
+ "execution_count": 47,
392
+ "metadata": {},
393
+ "output_type": "execute_result"
394
+ }
395
+ ],
396
+ "source": [
397
+ "dataset"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": 70,
403
+ "metadata": {},
404
+ "outputs": [],
405
+ "source": [
406
+ "SUBSAMPLING: float = 0.1"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": 48,
412
+ "metadata": {
413
+ "collapsed": false,
414
+ "gather": {
415
+ "logged": 1706449378281
416
+ },
417
+ "jupyter": {
418
+ "outputs_hidden": false,
419
+ "source_hidden": false
420
+ },
421
+ "nteract": {
422
+ "transient": {
423
+ "deleting": false
424
+ }
425
+ }
426
+ },
427
+ "outputs": [],
428
+ "source": [
429
+ "def minisample(ds: DatasetDict, fraction: float) -> DatasetDict:\n",
430
+ " res = DatasetDict()\n",
431
+ "\n",
432
+ " res[\"train\"] = Dataset.from_dict(ds[\"train\"].shuffle()[:round(len(ds[\"train\"]) * fraction)])\n",
433
+ " res[\"test\"] = Dataset.from_dict(ds[\"test\"].shuffle()[:round(len(ds[\"test\"]) * fraction)])\n",
434
+ " res[\"val\"] = Dataset.from_dict(ds[\"val\"].shuffle()[:round(len(ds[\"val\"]) * fraction)])\n",
435
+ " \n",
436
+ " return res"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": 49,
442
+ "metadata": {
443
+ "collapsed": false,
444
+ "gather": {
445
+ "logged": 1706449384162
446
+ },
447
+ "jupyter": {
448
+ "outputs_hidden": false,
449
+ "source_hidden": false
450
+ },
451
+ "nteract": {
452
+ "transient": {
453
+ "deleting": false
454
+ }
455
+ }
456
+ },
457
+ "outputs": [],
458
+ "source": [
459
+ "dataset = minisample(dataset, SUBSAMPLING)"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 50,
465
+ "metadata": {
466
+ "collapsed": false,
467
+ "gather": {
468
+ "logged": 1706449387981
469
+ },
470
+ "jupyter": {
471
+ "outputs_hidden": false,
472
+ "source_hidden": false
473
+ },
474
+ "nteract": {
475
+ "transient": {
476
+ "deleting": false
477
+ }
478
+ }
479
+ },
480
+ "outputs": [
481
+ {
482
+ "data": {
483
+ "text/plain": [
484
+ "DatasetDict({\n",
485
+ " train: Dataset({\n",
486
+ " features: ['id', 'text', 'labels'],\n",
487
+ " num_rows: 127044\n",
488
+ " })\n",
489
+ " test: Dataset({\n",
490
+ " features: ['id', 'text', 'labels'],\n",
491
+ " num_rows: 27224\n",
492
+ " })\n",
493
+ " val: Dataset({\n",
494
+ " features: ['id', 'text', 'labels'],\n",
495
+ " num_rows: 27224\n",
496
+ " })\n",
497
+ "})"
498
+ ]
499
+ },
500
+ "execution_count": 50,
501
+ "metadata": {},
502
+ "output_type": "execute_result"
503
+ }
504
+ ],
505
+ "source": [
506
+ "dataset"
507
+ ]
508
+ },
509
+ {
510
+ "cell_type": "markdown",
511
+ "metadata": {
512
+ "nteract": {
513
+ "transient": {
514
+ "deleting": false
515
+ }
516
+ }
517
+ },
518
+ "source": [
519
+ "We prune things down to the first four keys: `DIED`, `ER_VISIT`, `HOSPITAL`, `OFC_VISIT`."
520
+ ]
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": 51,
525
+ "metadata": {
526
+ "collapsed": false,
527
+ "gather": {
528
+ "logged": 1706449443055
529
+ },
530
+ "jupyter": {
531
+ "outputs_hidden": false,
532
+ "source_hidden": false
533
+ },
534
+ "nteract": {
535
+ "transient": {
536
+ "deleting": false
537
+ }
538
+ }
539
+ },
540
+ "outputs": [],
541
+ "source": [
542
+ "ds = DatasetDict()\n",
543
+ "\n",
544
+ "for i in [\"test\", \"train\", \"val\"]:\n",
545
+ " tab = Table.from_arrays([dataset[i][\"id\"], dataset[i][\"text\"], [i[:4] for i in dataset[i][\"labels\"]]], names=[\"id\", \"text\", \"labels\"])\n",
546
+ " ds[i] = Dataset(tab)\n",
547
+ "\n",
548
+ "dataset = ds"
549
+ ]
550
+ },
551
+ {
552
+ "cell_type": "markdown",
553
+ "metadata": {},
554
+ "source": [
555
+ "### Tokenisation and encoding"
556
+ ]
557
+ },
558
+ {
559
+ "cell_type": "code",
560
+ "execution_count": 52,
561
+ "metadata": {
562
+ "datalore": {
563
+ "hide_input_from_viewers": true,
564
+ "hide_output_from_viewers": true,
565
+ "node_id": "I7n646PIscsUZRoHu6m7zm",
566
+ "type": "CODE"
567
+ },
568
+ "gather": {
569
+ "logged": 1706449638377
570
+ }
571
+ },
572
+ "outputs": [],
573
+ "source": [
574
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": 53,
580
+ "metadata": {
581
+ "datalore": {
582
+ "hide_input_from_viewers": true,
583
+ "hide_output_from_viewers": true,
584
+ "node_id": "QBLOSI0yVIslV7v7qX9ZC3",
585
+ "type": "CODE"
586
+ },
587
+ "gather": {
588
+ "logged": 1706449642580
589
+ }
590
+ },
591
+ "outputs": [],
592
+ "source": [
593
+ "def tokenize_and_encode(examples):\n",
594
+ " return tokenizer(examples[\"text\"], truncation=True)"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": 54,
600
+ "metadata": {
601
+ "datalore": {
602
+ "hide_input_from_viewers": true,
603
+ "hide_output_from_viewers": true,
604
+ "node_id": "slHeNysZOX9uWS9PB7jFDb",
605
+ "type": "CODE"
606
+ },
607
+ "gather": {
608
+ "logged": 1706449721161
609
+ }
610
+ },
611
+ "outputs": [
612
+ {
613
+ "name": "stderr",
614
+ "output_type": "stream",
615
+ "text": [
616
+ "Map: 100%|██████████| 27224/27224 [00:11<00:00, 2347.91 examples/s]\n",
617
+ "Map: 100%|██████████| 127044/127044 [00:52<00:00, 2417.41 examples/s]\n",
618
+ "Map: 100%|██████████| 27224/27224 [00:11<00:00, 2376.02 examples/s]\n"
619
+ ]
620
+ }
621
+ ],
622
+ "source": [
623
+ "cols = dataset[\"train\"].column_names\n",
624
+ "cols.remove(\"labels\")\n",
625
+ "ds_enc = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)"
626
+ ]
627
+ },
628
+ {
629
+ "cell_type": "markdown",
630
+ "metadata": {},
631
+ "source": [
632
+ "### Training"
633
+ ]
634
+ },
635
+ {
636
+ "cell_type": "code",
637
+ "execution_count": 55,
638
+ "metadata": {
639
+ "datalore": {
640
+ "hide_input_from_viewers": true,
641
+ "hide_output_from_viewers": true,
642
+ "node_id": "itXWkbDw9sqbkMuDP84QoT",
643
+ "type": "CODE"
644
+ },
645
+ "gather": {
646
+ "logged": 1706449743072
647
+ }
648
+ },
649
+ "outputs": [],
650
+ "source": [
651
+ "class MultiLabelTrainer(Trainer):\n",
652
+ " def compute_loss(self, model, inputs, return_outputs=False):\n",
653
+ " labels = inputs.pop(\"labels\")\n",
654
+ " outputs = model(**inputs)\n",
655
+ " logits = outputs.logits\n",
656
+ " loss_fct = torch.nn.BCEWithLogitsLoss()\n",
657
+ " loss = loss_fct(logits.view(-1, self.model.config.num_labels),\n",
658
+ " labels.float().view(-1, self.model.config.num_labels))\n",
659
+ " return (loss, outputs) if return_outputs else loss"
660
+ ]
661
+ },
662
+ {
663
+ "cell_type": "code",
664
+ "execution_count": 56,
665
+ "metadata": {
666
+ "datalore": {
667
+ "hide_input_from_viewers": true,
668
+ "hide_output_from_viewers": true,
669
+ "node_id": "ZQU7aW6TV45VmhHOQRzcnF",
670
+ "type": "CODE"
671
+ },
672
+ "gather": {
673
+ "logged": 1706449761205
674
+ }
675
+ },
676
+ "outputs": [
677
+ {
678
+ "name": "stderr",
679
+ "output_type": "stream",
680
+ "text": [
681
+ "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
682
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
683
+ ]
684
+ }
685
+ ],
686
+ "source": [
687
+ "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(CLASS_NAMES)).to(\"cuda\")"
688
+ ]
689
+ },
690
+ {
691
+ "cell_type": "code",
692
+ "execution_count": 57,
693
+ "metadata": {
694
+ "datalore": {
695
+ "hide_input_from_viewers": true,
696
+ "hide_output_from_viewers": true,
697
+ "node_id": "swhgyyyxoGL8HjnXJtMuSW",
698
+ "type": "CODE"
699
+ },
700
+ "gather": {
701
+ "logged": 1706449761541
702
+ }
703
+ },
704
+ "outputs": [],
705
+ "source": [
706
+ "def accuracy_threshold(y_pred, y_true, threshold=.5, sigmoid=True):\n",
707
+ " y_pred = torch.from_numpy(y_pred)\n",
708
+ " y_true = torch.from_numpy(y_true)\n",
709
+ "\n",
710
+ " if sigmoid:\n",
711
+ " y_pred = y_pred.sigmoid()\n",
712
+ "\n",
713
+ " return ((y_pred > threshold) == y_true.bool()).float().mean().item()"
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "code",
718
+ "execution_count": 58,
719
+ "metadata": {
720
+ "datalore": {
721
+ "hide_input_from_viewers": true,
722
+ "hide_output_from_viewers": true,
723
+ "node_id": "1Uq3HtkaBxtHNAnSwit5cI",
724
+ "type": "CODE"
725
+ },
726
+ "gather": {
727
+ "logged": 1706449761720
728
+ }
729
+ },
730
+ "outputs": [],
731
+ "source": [
732
+ "def compute_metrics(eval_pred):\n",
733
+ " predictions, labels = eval_pred\n",
734
+ " return {'accuracy_thresh': accuracy_threshold(predictions, labels)}"
735
+ ]
736
+ },
737
+ {
738
+ "cell_type": "code",
739
+ "execution_count": 63,
740
+ "metadata": {
741
+ "datalore": {
742
+ "hide_input_from_viewers": true,
743
+ "hide_output_from_viewers": true,
744
+ "node_id": "1iPZOTKPwSkTgX5dORqT89",
745
+ "type": "CODE"
746
+ },
747
+ "gather": {
748
+ "logged": 1706449761893
749
+ }
750
+ },
751
+ "outputs": [],
752
+ "source": [
753
+ "args = TrainingArguments(\n",
754
+ " output_dir=\"vaers\",\n",
755
+ " evaluation_strategy=\"epoch\",\n",
756
+ " learning_rate=2e-5,\n",
757
+ " per_device_train_batch_size=BATCH_SIZE,\n",
758
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
759
+ " num_train_epochs=EPOCHS,\n",
760
+ " weight_decay=.01,\n",
761
+ " logging_steps=1,\n",
762
+ " run_name=f\"daedra-training\",\n",
763
+ " report_to=[\"wandb\"]\n",
764
+ ")"
765
+ ]
766
+ },
767
+ {
768
+ "cell_type": "code",
769
+ "execution_count": 64,
770
+ "metadata": {
771
+ "datalore": {
772
+ "hide_input_from_viewers": true,
773
+ "hide_output_from_viewers": true,
774
+ "node_id": "bnRkNvRYltLun6gCEgL7v0",
775
+ "type": "CODE"
776
+ },
777
+ "gather": {
778
+ "logged": 1706449769103
779
+ }
780
+ },
781
+ "outputs": [],
782
+ "source": [
783
+ "multi_label_trainer = MultiLabelTrainer(\n",
784
+ " model, \n",
785
+ " args, \n",
786
+ " train_dataset=ds_enc[\"train\"], \n",
787
+ " eval_dataset=ds_enc[\"test\"], \n",
788
+ " compute_metrics=compute_metrics, \n",
789
+ " tokenizer=tokenizer\n",
790
+ ")"
791
+ ]
792
+ },
793
+ {
794
+ "cell_type": "code",
795
+ "execution_count": 71,
796
+ "metadata": {
797
+ "datalore": {
798
+ "hide_input_from_viewers": true,
799
+ "hide_output_from_viewers": true,
800
+ "node_id": "LO54PlDkWQdFrzV25FvduB",
801
+ "type": "CODE"
802
+ },
803
+ "gather": {
804
+ "logged": 1706449880674
805
+ }
806
+ },
807
+ "outputs": [
808
+ {
809
+ "data": {
810
+ "text/html": [
811
+ "Changes to your `wandb` environment variables will be ignored because your `wandb` session has already started. For more information on how to modify your settings with `wandb.init()` arguments, please refer to <a href='https://wandb.me/wandb-init' target=\"_blank\">the W&B docs</a>."
812
+ ],
813
+ "text/plain": [
814
+ "<IPython.core.display.HTML object>"
815
+ ]
816
+ },
817
+ "metadata": {},
818
+ "output_type": "display_data"
819
+ },
820
+ {
821
+ "data": {
822
+ "text/html": [
823
+ "Changes to your `wandb` environment variables will be ignored because your `wandb` session has already started. For more information on how to modify your settings with `wandb.init()` arguments, please refer to <a href='https://wandb.me/wandb-init' target=\"_blank\">the W&B docs</a>."
824
+ ],
825
+ "text/plain": [
826
+ "<IPython.core.display.HTML object>"
827
+ ]
828
+ },
829
+ "metadata": {},
830
+ "output_type": "display_data"
831
+ },
832
+ {
833
+ "name": "stderr",
834
+ "output_type": "stream",
835
+ "text": [
836
+ "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m wandb.init() arguments ignored because wandb magic has already been initialized\n"
837
+ ]
838
+ },
839
+ {
840
+ "data": {
841
+ "text/html": [
842
+ "Tracking run with wandb version 0.16.2"
843
+ ],
844
+ "text/plain": [
845
+ "<IPython.core.display.HTML object>"
846
+ ]
847
+ },
848
+ "metadata": {},
849
+ "output_type": "display_data"
850
+ },
851
+ {
852
+ "data": {
853
+ "text/html": [
854
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_141352-spfdhiij</code>"
855
+ ],
856
+ "text/plain": [
857
+ "<IPython.core.display.HTML object>"
858
+ ]
859
+ },
860
+ "metadata": {},
861
+ "output_type": "display_data"
862
+ },
863
+ {
864
+ "data": {
865
+ "text/html": [
866
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/spfdhiij' target=\"_blank\">init_evaluation_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
867
+ ],
868
+ "text/plain": [
869
+ "<IPython.core.display.HTML object>"
870
+ ]
871
+ },
872
+ "metadata": {},
873
+ "output_type": "display_data"
874
+ },
875
+ {
876
+ "data": {
877
+ "text/html": [
878
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>"
879
+ ],
880
+ "text/plain": [
881
+ "<IPython.core.display.HTML object>"
882
+ ]
883
+ },
884
+ "metadata": {},
885
+ "output_type": "display_data"
886
+ },
887
+ {
888
+ "data": {
889
+ "text/html": [
890
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/spfdhiij' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/spfdhiij</a>"
891
+ ],
892
+ "text/plain": [
893
+ "<IPython.core.display.HTML object>"
894
+ ]
895
+ },
896
+ "metadata": {},
897
+ "output_type": "display_data"
898
+ },
899
+ {
900
+ "data": {
901
+ "text/html": [
902
+ "Finishing last run (ID:spfdhiij) before initializing another..."
903
+ ],
904
+ "text/plain": [
905
+ "<IPython.core.display.HTML object>"
906
+ ]
907
+ },
908
+ "metadata": {},
909
+ "output_type": "display_data"
910
+ },
911
+ {
912
+ "data": {
913
+ "text/html": [
914
+ " View run <strong style=\"color:#cdcd00\">init_evaluation_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/spfdhiij' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/spfdhiij</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
915
+ ],
916
+ "text/plain": [
917
+ "<IPython.core.display.HTML object>"
918
+ ]
919
+ },
920
+ "metadata": {},
921
+ "output_type": "display_data"
922
+ },
923
+ {
924
+ "data": {
925
+ "text/html": [
926
+ "Find logs at: <code>./wandb/run-20240128_141352-spfdhiij/logs</code>"
927
+ ],
928
+ "text/plain": [
929
+ "<IPython.core.display.HTML object>"
930
+ ]
931
+ },
932
+ "metadata": {},
933
+ "output_type": "display_data"
934
+ },
935
+ {
936
+ "data": {
937
+ "text/html": [
938
+ "Successfully finished last run (ID:spfdhiij). Initializing new run:<br/>"
939
+ ],
940
+ "text/plain": [
941
+ "<IPython.core.display.HTML object>"
942
+ ]
943
+ },
944
+ "metadata": {},
945
+ "output_type": "display_data"
946
+ },
947
+ {
948
+ "data": {
949
+ "text/html": [
950
+ "Tracking run with wandb version 0.16.2"
951
+ ],
952
+ "text/plain": [
953
+ "<IPython.core.display.HTML object>"
954
+ ]
955
+ },
956
+ "metadata": {},
957
+ "output_type": "display_data"
958
+ },
959
+ {
960
+ "data": {
961
+ "text/html": [
962
+ "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/cvc-vaers-bert-dnsd/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240128_141354-mpe6cpuz</code>"
963
+ ],
964
+ "text/plain": [
965
+ "<IPython.core.display.HTML object>"
966
+ ]
967
+ },
968
+ "metadata": {},
969
+ "output_type": "display_data"
970
+ },
971
+ {
972
+ "data": {
973
+ "text/html": [
974
+ "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/mpe6cpuz' target=\"_blank\">init_evaluation_run</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
975
+ ],
976
+ "text/plain": [
977
+ "<IPython.core.display.HTML object>"
978
+ ]
979
+ },
980
+ "metadata": {},
981
+ "output_type": "display_data"
982
+ },
983
+ {
984
+ "data": {
985
+ "text/html": [
986
+ " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training</a>"
987
+ ],
988
+ "text/plain": [
989
+ "<IPython.core.display.HTML object>"
990
+ ]
991
+ },
992
+ "metadata": {},
993
+ "output_type": "display_data"
994
+ },
995
+ {
996
+ "data": {
997
+ "text/html": [
998
+ " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/mpe6cpuz' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/mpe6cpuz</a>"
999
+ ],
1000
+ "text/plain": [
1001
+ "<IPython.core.display.HTML object>"
1002
+ ]
1003
+ },
1004
+ "metadata": {},
1005
+ "output_type": "display_data"
1006
+ },
1007
+ {
1008
+ "data": {
1009
+ "text/html": [
1010
+ "<style>\n",
1011
+ " table.wandb td:nth-child(1) { padding: 0 10px; text-align: left ; width: auto;} td:nth-child(2) {text-align: left ; width: 100%}\n",
1012
+ " .wandb-row { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: flex-start; width: 100% }\n",
1013
+ " .wandb-col { display: flex; flex-direction: column; flex-basis: 100%; flex: 1; padding: 10px; }\n",
1014
+ " </style>\n",
1015
+ "<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>eval/accuracy_thresh</td><td>▁</td></tr><tr><td>eval/loss</td><td>▁</td></tr><tr><td>eval/runtime</td><td>▁</td></tr><tr><td>eval/samples_per_second</td><td>▁</td></tr><tr><td>eval/steps_per_second</td><td>▁</td></tr><tr><td>train/global_step</td><td>▁</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>eval/accuracy_thresh</td><td>0.42136</td></tr><tr><td>eval/loss</td><td>0.69069</td></tr><tr><td>eval/runtime</td><td>79.1475</td></tr><tr><td>eval/samples_per_second</td><td>343.965</td></tr><tr><td>eval/steps_per_second</td><td>2.691</td></tr><tr><td>train/global_step</td><td>0</td></tr></table><br/></div></div>"
1016
+ ],
1017
+ "text/plain": [
1018
+ "<IPython.core.display.HTML object>"
1019
+ ]
1020
+ },
1021
+ "metadata": {},
1022
+ "output_type": "display_data"
1023
+ },
1024
+ {
1025
+ "data": {
1026
+ "text/html": [
1027
+ " View run <strong style=\"color:#cdcd00\">init_evaluation_run</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/mpe6cpuz' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20model%20training/runs/mpe6cpuz</a><br/>Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
1028
+ ],
1029
+ "text/plain": [
1030
+ "<IPython.core.display.HTML object>"
1031
+ ]
1032
+ },
1033
+ "metadata": {},
1034
+ "output_type": "display_data"
1035
+ },
1036
+ {
1037
+ "data": {
1038
+ "text/html": [
1039
+ "Find logs at: <code>./wandb/run-20240128_141354-mpe6cpuz/logs</code>"
1040
+ ],
1041
+ "text/plain": [
1042
+ "<IPython.core.display.HTML object>"
1043
+ ]
1044
+ },
1045
+ "metadata": {},
1046
+ "output_type": "display_data"
1047
+ }
1048
+ ],
1049
+ "source": [
1050
+ "if SUBSAMPLING != 1.0:\n",
1051
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
1052
+ "else:\n",
1053
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
1054
+ " \n",
1055
+ "wandb.init(name=\"init_evaluation_run\", tags=wandb_tag, magic=True)\n",
1056
+ "\n",
1057
+ "multi_label_trainer.evaluate()\n",
1058
+ "wandb.finish()"
1059
+ ]
1060
+ },
1061
+ {
1062
+ "cell_type": "code",
1063
+ "execution_count": 62,
1064
+ "metadata": {
1065
+ "datalore": {
1066
+ "hide_input_from_viewers": true,
1067
+ "hide_output_from_viewers": true,
1068
+ "node_id": "hf0Ei1QXEYDmBv1VNLZ4Zw",
1069
+ "type": "CODE"
1070
+ },
1071
+ "gather": {
1072
+ "logged": 1706449934637
1073
+ }
1074
+ },
1075
+ "outputs": [
1076
+ {
1077
+ "ename": "RuntimeError",
1078
+ "evalue": "Caught RuntimeError in replica 0 on device 0.\nOriginal Traceback (most recent call last):\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py\", line 61, in _worker\n output = module(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 1002, in forward\n distilbert_output = self.distilbert(\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 822, in forward\n return self.transformer(\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 587, in forward\n layer_outputs = layer_module(\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 513, in forward\n sa_output = self.attention(\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 243, in forward\n scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length)\nRuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 15.77 GiB total capacity; 14.69 GiB already allocated; 5.12 MiB free; 14.72 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\n",
1079
+ "output_type": "error",
1080
+ "traceback": [
1081
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1082
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
1083
+ "Cell \u001b[0;32mIn[62], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mmulti_label_trainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
1084
+ "File \u001b[0;32m/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/trainer.py:1539\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1537\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1539\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1540\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1541\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1542\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1544\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
1085
+ "File \u001b[0;32m/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/trainer.py:1869\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1866\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m 1868\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 1869\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 1872\u001b[0m args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 1873\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[1;32m 1874\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 1875\u001b[0m ):\n\u001b[1;32m 1876\u001b[0m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 1877\u001b[0m tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
1086
+ "File \u001b[0;32m/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/trainer.py:2768\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 2765\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m 2767\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 2768\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2770\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 2771\u001b[0m loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean() \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
1087
+ "Cell \u001b[0;32mIn[55], line 4\u001b[0m, in \u001b[0;36mMultiLabelTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_loss\u001b[39m(\u001b[38;5;28mself\u001b[39m, model, inputs, return_outputs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m 3\u001b[0m labels \u001b[38;5;241m=\u001b[39m inputs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlabels\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m logits \u001b[38;5;241m=\u001b[39m outputs\u001b[38;5;241m.\u001b[39mlogits\n\u001b[1;32m 6\u001b[0m loss_fct \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnn\u001b[38;5;241m.\u001b[39mBCEWithLogitsLoss()\n",
1088
+ "File \u001b[0;32m/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
1089
+ "File \u001b[0;32m/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:168\u001b[0m, in \u001b[0;36mDataParallel.forward\u001b[0;34m(self, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodule(\u001b[38;5;241m*\u001b[39minputs[\u001b[38;5;241m0\u001b[39m], \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m 167\u001b[0m replicas \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplicate(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodule, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice_ids[:\u001b[38;5;28mlen\u001b[39m(inputs)])\n\u001b[0;32m--> 168\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparallel_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreplicas\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgather(outputs, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_device)\n",
1090
+ "File \u001b[0;32m/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py:178\u001b[0m, in \u001b[0;36mDataParallel.parallel_apply\u001b[0;34m(self, replicas, inputs, kwargs)\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mparallel_apply\u001b[39m(\u001b[38;5;28mself\u001b[39m, replicas, inputs, kwargs):\n\u001b[0;32m--> 178\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mparallel_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreplicas\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdevice_ids\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mreplicas\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n",
1091
+ "File \u001b[0;32m/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py:86\u001b[0m, in \u001b[0;36mparallel_apply\u001b[0;34m(modules, inputs, kwargs_tup, devices)\u001b[0m\n\u001b[1;32m 84\u001b[0m output \u001b[38;5;241m=\u001b[39m results[i]\n\u001b[1;32m 85\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(output, ExceptionWrapper):\n\u001b[0;32m---> 86\u001b[0m \u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreraise\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m outputs\u001b[38;5;241m.\u001b[39mappend(output)\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n",
1092
+ "File \u001b[0;32m/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/_utils.py:461\u001b[0m, in \u001b[0;36mExceptionWrapper.reraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 458\u001b[0m \u001b[38;5;66;03m# If the exception takes multiple arguments, don't try to\u001b[39;00m\n\u001b[1;32m 459\u001b[0m \u001b[38;5;66;03m# instantiate since we don't know how to\u001b[39;00m\n\u001b[1;32m 460\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(msg) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 461\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception\n",
1093
+ "\u001b[0;31mRuntimeError\u001b[0m: Caught RuntimeError in replica 0 on device 0.\nOriginal Traceback (most recent call last):\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py\", line 61, in _worker\n output = module(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 1002, in forward\n distilbert_output = self.distilbert(\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 822, in forward\n return self.transformer(\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 587, in forward\n layer_outputs = layer_module(\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 513, in forward\n sa_output = self.attention(\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py\", line 1130, in _call_impl\n return forward_call(*input, **kwargs)\n File \"/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/transformers/models/distilbert/modeling_distilbert.py\", line 243, in forward\n scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length)\nRuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 15.77 GiB total capacity; 14.69 GiB already allocated; 5.12 MiB free; 14.72 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\n"
1094
+ ]
1095
+ }
1096
+ ],
1097
+ "source": [
1098
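+ "# Tag the W&B run with the sampling regime so runs remain comparable\n",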
+ "if SUBSAMPLING != 1.0:\n",
1099
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
1100
+ "else:\n",
1101
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
1102
+ " \n",
1103
+ "wandb.init(name=\"daedra_training_run\", tags=wandb_tag, magic=True)\n",
1104
+ "\n",
1105
+ "multi_label_trainer.train()\n",
1106
+ "wandb.finish()"
1107
+ ]
1108
+ },
1109
+ {
1110
+ "cell_type": "markdown",
1111
+ "metadata": {},
1112
+ "source": [
1113
+ "### Evaluation"
1114
+ ]
1115
+ },
1116
+ {
1117
+ "cell_type": "markdown",
1118
+ "metadata": {},
1119
+ "source": [
1120
+ "We instantiate a classifier `pipeline` and push it to CUDA."
1121
+ ]
1122
+ },
1123
+ {
1124
+ "cell_type": "code",
1125
+ "execution_count": null,
1126
+ "metadata": {
1127
+ "datalore": {
1128
+ "hide_input_from_viewers": true,
1129
+ "hide_output_from_viewers": true,
1130
+ "node_id": "kHoUdBeqcyVXDSGv54C4aE",
1131
+ "type": "CODE"
1132
+ },
1133
+ "gather": {
1134
+ "logged": 1706411459928
1135
+ }
1136
+ },
1137
+ "outputs": [],
1138
+ "source": [
1139
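+ "# Wrap the fine-tuned model and its tokenizer in an inference pipeline on the first GPU\n",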
+ "classifier = pipeline(\"text-classification\", \n",
1140
+ " model, \n",
1141
+ " tokenizer=tokenizer, \n",
1142
+ " device=\"cuda:0\")"
1143
+ ]
1144
+ },
1145
+ {
1146
+ "cell_type": "markdown",
1147
+ "metadata": {},
1148
+ "source": [
1149
+ "We use the same tokenizer used for training to tokenize/encode the validation set."
1150
+ ]
1151
+ },
1152
+ {
1153
+ "cell_type": "code",
1154
+ "execution_count": null,
1155
+ "metadata": {
1156
+ "datalore": {
1157
+ "hide_input_from_viewers": true,
1158
+ "hide_output_from_viewers": true,
1159
+ "node_id": "Dr5WCWA6jL51NR1fSrQu6Z",
1160
+ "type": "CODE"
1161
+ },
1162
+ "gather": {
1163
+ "logged": 1706411523285
1164
+ }
1165
+ },
1166
+ "outputs": [],
1167
+ "source": [
1168
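+ "# Pad every sequence to the model's maximum length and truncate anything longer\n",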
+ "test_encodings = tokenizer.batch_encode_plus(dataset[\"val\"][\"text\"], \n",
1169
+ " max_length=None, \n",
1170
+ " padding='max_length', \n",
1171
+ " return_token_type_ids=True, \n",
1172
+ " truncation=True)"
1173
+ ]
1174
+ },
1175
+ {
1176
+ "cell_type": "markdown",
1177
+ "metadata": {},
1178
+ "source": [
1179
+ "Once we've made the data loadable by putting it into a `DataLoader`, we "
1180
+ ]
1181
+ },
1182
+ {
1183
+ "cell_type": "code",
1184
+ "execution_count": null,
1185
+ "metadata": {
1186
+ "datalore": {
1187
+ "hide_input_from_viewers": true,
1188
+ "hide_output_from_viewers": true,
1189
+ "node_id": "MWfGq2tTkJNzFiDoUPq2X7",
1190
+ "type": "CODE"
1191
+ },
1192
+ "gather": {
1193
+ "logged": 1706411543379
1194
+ }
1195
+ },
1196
+ "outputs": [],
1197
+ "source": [
1198
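+ "# A sequential sampler keeps predictions aligned with the validation set's row order\n",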
+ "test_data = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), \n",
1199
+ " torch.tensor(test_encodings['attention_mask']), \n",
1200
+ " torch.tensor(ds_enc[\"val\"][\"labels\"]), \n",
1201
+ " torch.tensor(test_encodings['token_type_ids']))\n",
1202
+ "test_dataloader = torch.utils.data.DataLoader(test_data, \n",
1203
+ " sampler=torch.utils.data.SequentialSampler(test_data), \n",
1204
+ " batch_size=BATCH_SIZE)"
1205
+ ]
1206
+ },
1207
+ {
1208
+ "cell_type": "code",
1209
+ "execution_count": null,
1210
+ "metadata": {
1211
+ "datalore": {
1212
+ "hide_input_from_viewers": true,
1213
+ "hide_output_from_viewers": true,
1214
+ "node_id": "1SJCSrQTRCexFCNCIyRrzL",
1215
+ "type": "CODE"
1216
+ },
1217
+ "gather": {
1218
+ "logged": 1706411587843
1219
+ }
1220
+ },
1221
+ "outputs": [],
1222
+ "source": [
1223
+ "model.eval()\n",
1224
+ "\n",
1225
+ "logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []\n",
1226
+ "\n",
1227
+ "for i, batch in enumerate(test_dataloader):\n",
1228
+ " batch = tuple(t.to(device) for t in batch)\n",
1229
+ " \n",
1230
+ " # Unpack the inputs from our dataloader\n",
1231
+ " b_input_ids, b_input_mask, b_labels, b_token_types = batch\n",
1232
+ " \n",
1233
+ " with torch.no_grad():\n",
1234
+ " outs = model(b_input_ids, attention_mask=b_input_mask)\n",
1235
+ " b_logit_pred = outs[0]\n",
1236
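+ " # Sigmoid yields an independent probability per class (multi-label setting)\n",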
+ " pred_label = torch.sigmoid(b_logit_pred)\n",
1237
+ "\n",
1238
+ " b_logit_pred = b_logit_pred.detach().cpu().numpy()\n",
1239
+ " pred_label = pred_label.to('cpu').numpy()\n",
1240
+ " b_labels = b_labels.to('cpu').numpy()\n",
1241
+ "\n",
1242
+ " tokenized_texts.append(b_input_ids)\n",
1243
+ " logit_preds.append(b_logit_pred)\n",
1244
+ " true_labels.append(b_labels)\n",
1245
+ " pred_labels.append(pred_label)\n",
1246
+ "\n",
1247
+ "# Flatten outputs\n",
1248
+ "tokenized_texts = [item for sublist in tokenized_texts for item in sublist]\n",
1249
+ "pred_labels = [item for sublist in pred_labels for item in sublist]\n",
1250
+ "true_labels = [item for sublist in true_labels for item in sublist]\n",
1251
+ "\n",
1252
+ "# Converting flattened binary values to boolean values\n",
1253
+ "true_bools = [tl == 1 for tl in true_labels]\n",
1254
+ "pred_bools = [pl > 0.50 for pl in pred_labels] "
1255
+ ]
1256
+ },
1257
+ {
1258
+ "cell_type": "markdown",
1259
+ "metadata": {},
1260
+ "source": [
1261
+ "We create a classification report:"
1262
+ ]
1263
+ },
1264
+ {
1265
+ "cell_type": "code",
1266
+ "execution_count": null,
1267
+ "metadata": {
1268
+ "datalore": {
1269
+ "hide_input_from_viewers": true,
1270
+ "hide_output_from_viewers": true,
1271
+ "node_id": "eBprrgF086mznPbPVBpOLS",
1272
+ "type": "CODE"
1273
+ },
1274
+ "gather": {
1275
+ "logged": 1706411588249
1276
+ }
1277
+ },
1278
+ "outputs": [],
1279
+ "source": [
1280
+ "print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools, average='micro'))\n",
1281
+ "print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools), '\\n')\n",
1282
+ "clf_report = classification_report(true_bools, pred_bools, target_names=CLASS_NAMES)\n",
1283
+ "print(clf_report)"
1284
+ ]
1285
+ },
1286
+ {
1287
+ "cell_type": "markdown",
1288
+ "metadata": {},
1289
+ "source": [
1290
+ "Finally, we render a 'head to head' comparison table that maps each text prediction to actual and predicted labels."
1291
+ ]
1292
+ },
1293
+ {
1294
+ "cell_type": "code",
1295
+ "execution_count": null,
1296
+ "metadata": {
1297
+ "datalore": {
1298
+ "hide_input_from_viewers": true,
1299
+ "hide_output_from_viewers": true,
1300
+ "node_id": "yELHY0IEwMlMw3x6e7hoD1",
1301
+ "type": "CODE"
1302
+ },
1303
+ "gather": {
1304
+ "logged": 1706411588638
1305
+ }
1306
+ },
1307
+ "outputs": [],
1308
+ "source": [
1309
+ "# Creating a map of class names from class numbers\n",
1310
+ "idx2label = dict(zip(range(len(CLASS_NAMES)), CLASS_NAMES))"
1311
+ ]
1312
+ },
1313
+ {
1314
+ "cell_type": "code",
1315
+ "execution_count": null,
1316
+ "metadata": {
1317
+ "datalore": {
1318
+ "hide_input_from_viewers": true,
1319
+ "hide_output_from_viewers": true,
1320
+ "node_id": "jH0S35dDteUch01sa6me6e",
1321
+ "type": "CODE"
1322
+ },
1323
+ "gather": {
1324
+ "logged": 1706411589004
1325
+ }
1326
+ },
1327
+ "outputs": [],
1328
+ "source": [
1329
+ "true_label_idxs, pred_label_idxs = [], []\n",
1330
+ "\n",
1331
+ "for vals in true_bools:\n",
1332
+ " true_label_idxs.append(np.where(vals)[0].flatten().tolist())\n",
1333
+ "for vals in pred_bools:\n",
1334
+ " pred_label_idxs.append(np.where(vals)[0].flatten().tolist())"
1335
+ ]
1336
+ },
1337
+ {
1338
+ "cell_type": "code",
1339
+ "execution_count": null,
1340
+ "metadata": {
1341
+ "datalore": {
1342
+ "hide_input_from_viewers": true,
1343
+ "hide_output_from_viewers": true,
1344
+ "node_id": "h4vHL8XdGpayZ6xLGJUF6F",
1345
+ "type": "CODE"
1346
+ },
1347
+ "gather": {
1348
+ "logged": 1706411589301
1349
+ }
1350
+ },
1351
+ "outputs": [],
1352
+ "source": [
1353
+ "true_label_texts, pred_label_texts = [], []\n",
1354
+ "\n",
1355
+ "for vals in true_label_idxs:\n",
1356
+ " if vals:\n",
1357
+ " true_label_texts.append([idx2label[val] for val in vals])\n",
1358
+ " else:\n",
1359
+ " true_label_texts.append(vals)\n",
1360
+ "\n",
1361
+ "for vals in pred_label_idxs:\n",
1362
+ " if vals:\n",
1363
+ " pred_label_texts.append([idx2label[val] for val in vals])\n",
1364
+ " else:\n",
1365
+ " pred_label_texts.append(vals)"
1366
+ ]
1367
+ },
1368
+ {
1369
+ "cell_type": "code",
1370
+ "execution_count": null,
1371
+ "metadata": {
1372
+ "datalore": {
1373
+ "hide_input_from_viewers": true,
1374
+ "hide_output_from_viewers": true,
1375
+ "node_id": "SxUmVHfQISEeptg1SawOmB",
1376
+ "type": "CODE"
1377
+ },
1378
+ "gather": {
1379
+ "logged": 1706411591952
1380
+ }
1381
+ },
1382
+ "outputs": [],
1383
+ "source": [
1384
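+ "# Decode token IDs back into readable report text for the comparison table\n",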
+ "symptom_texts = [tokenizer.decode(text,\n",
1385
+ " skip_special_tokens=True,\n",
1386
+ " clean_up_tokenization_spaces=False) for text in tokenized_texts]"
1387
+ ]
1388
+ },
1389
+ {
1390
+ "cell_type": "code",
1391
+ "execution_count": null,
1392
+ "metadata": {
1393
+ "datalore": {
1394
+ "hide_input_from_viewers": true,
1395
+ "hide_output_from_viewers": true,
1396
+ "node_id": "BxFNigNGRLTOqraI55BPSH",
1397
+ "type": "CODE"
1398
+ },
1399
+ "gather": {
1400
+ "logged": 1706411592512
1401
+ }
1402
+ },
1403
+ "outputs": [],
1404
+ "source": [
1405
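+ "# Assemble and persist a per-report table of true vs. predicted labels\n",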
+ "comparisons_df = pd.DataFrame({'symptom_text': symptom_texts, \n",
1406
+ " 'true_labels': true_label_texts, \n",
1407
+ " 'pred_labels':pred_label_texts})\n",
1408
+ "comparisons_df.to_csv('comparisons.csv')\n",
1409
+ "comparisons_df"
1410
+ ]
1411
+ },
1412
+ {
1413
+ "cell_type": "markdown",
1414
+ "metadata": {},
1415
+ "source": [
1416
+ "### Shapley analysis"
1417
+ ]
1418
+ },
1419
+ {
1420
+ "cell_type": "code",
1421
+ "execution_count": null,
1422
+ "metadata": {
1423
+ "datalore": {
1424
+ "hide_input_from_viewers": true,
1425
+ "hide_output_from_viewers": true,
1426
+ "node_id": "OpdZcoenX2HwzLdai7K5UA",
1427
+ "type": "CODE"
1428
+ },
1429
+ "gather": {
1430
+ "logged": 1706415109071
1431
+ }
1432
+ },
1433
+ "outputs": [],
1434
+ "source": [
1435
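+ "# shap.Explainer can wrap a Hugging Face text-classification pipeline directly\n",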
+ "explainer = shap.Explainer(classifier, output_names=CLASS_NAMES)"
1436
+ ]
1437
+ },
1438
+ {
1439
+ "cell_type": "markdown",
1440
+ "metadata": {
1441
+ "nteract": {
1442
+ "transient": {
1443
+ "deleting": false
1444
+ }
1445
+ }
1446
+ },
1447
+ "source": [
1448
+ "#### Sampling correct predictions\n",
1449
+ "\n",
1450
+ "First, let's look at some correct predictions of deaths:"
1451
+ ]
1452
+ },
1453
+ {
1454
+ "cell_type": "code",
1455
+ "execution_count": null,
1456
+ "metadata": {
1457
+ "collapsed": false,
1458
+ "gather": {
1459
+ "logged": 1706414973990
1460
+ },
1461
+ "jupyter": {
1462
+ "outputs_hidden": false
1463
+ },
1464
+ "nteract": {
1465
+ "transient": {
1466
+ "deleting": false
1467
+ }
1468
+ }
1469
+ },
1470
+ "outputs": [],
1471
+ "source": [
1472
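+ "# Select reports whose true label set is exactly ['DIED']\n",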
+ "correct_death_predictions = comparisons_df[comparisons_df['true_labels'].astype(str) == \"['DIED']\"]"
1473
+ ]
1474
+ },
1475
+ {
1476
+ "cell_type": "code",
1477
+ "execution_count": null,
1478
+ "metadata": {
1479
+ "collapsed": false,
1480
+ "gather": {
1481
+ "logged": 1706415114683
1482
+ },
1483
+ "jupyter": {
1484
+ "outputs_hidden": false
1485
+ },
1486
+ "nteract": {
1487
+ "transient": {
1488
+ "deleting": false
1489
+ }
1490
+ }
1491
+ },
1492
+ "outputs": [],
1493
+ "source": [
1494
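+ "# Truncate each sampled report to 512 characters and wrap it in a Dataset for SHAP\n",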
+ "texts = [i[:512] for i in correct_death_predictions.sample(n=6).symptom_text]\n",
1495
+ "idxs = [i for i in range(len(texts))]\n",
1496
+ "\n",
1497
+ "d_s = Dataset(Table.from_arrays([idxs, texts], names=[\"idx\", \"texts\"]))"
1498
+ ]
1499
+ },
1500
+ {
1501
+ "cell_type": "code",
1502
+ "execution_count": null,
1503
+ "metadata": {
1504
+ "collapsed": false,
1505
+ "gather": {
1506
+ "logged": 1706415129229
1507
+ },
1508
+ "jupyter": {
1509
+ "outputs_hidden": false
1510
+ },
1511
+ "nteract": {
1512
+ "transient": {
1513
+ "deleting": false
1514
+ }
1515
+ }
1516
+ },
1517
+ "outputs": [],
1518
+ "source": [
1519
+ "shap_values = explainer(d_s[\"texts\"])"
1520
+ ]
1521
+ },
1522
+ {
1523
+ "cell_type": "code",
1524
+ "execution_count": null,
1525
+ "metadata": {
1526
+ "collapsed": false,
1527
+ "gather": {
1528
+ "logged": 1706415151494
1529
+ },
1530
+ "jupyter": {
1531
+ "outputs_hidden": false
1532
+ },
1533
+ "nteract": {
1534
+ "transient": {
1535
+ "deleting": false
1536
+ }
1537
+ }
1538
+ },
1539
+ "outputs": [],
1540
+ "source": [
1541
+ "shap.plots.text(shap_values)"
1542
+ ]
1543
+ },
1544
+ {
1545
+ "cell_type": "code",
1546
+ "execution_count": null,
1547
+ "metadata": {
1548
+ "collapsed": false,
1549
+ "jupyter": {
1550
+ "outputs_hidden": false
1551
+ },
1552
+ "nteract": {
1553
+ "transient": {
1554
+ "deleting": false
1555
+ }
1556
+ }
1557
+ },
1558
+ "outputs": [],
1559
+ "source": []
1560
+ }
1561
+ ],
1562
+ "metadata": {
1563
+ "datalore": {
1564
+ "base_environment": "default",
1565
+ "computation_mode": "JUPYTER",
1566
+ "package_manager": "pip",
1567
+ "packages": [
1568
+ {
1569
+ "name": "datasets",
1570
+ "source": "PIP",
1571
+ "version": "2.16.1"
1572
+ },
1573
+ {
1574
+ "name": "torch",
1575
+ "source": "PIP",
1576
+ "version": "2.1.2"
1577
+ },
1578
+ {
1579
+ "name": "accelerate",
1580
+ "source": "PIP",
1581
+ "version": "0.26.1"
1582
+ }
1583
+ ],
1584
+ "report_row_ids": [
1585
+ "un8W7ez7ZwoGb5Co6nydEV",
1586
+ "40nN9Hvgi1clHNV5RAemI5",
1587
+ "TgRD90H5NSPpKS41OeXI1w",
1588
+ "ZOm5BfUs3h1EGLaUkBGeEB",
1589
+ "kOP0CZWNSk6vqE3wkPp7Vc",
1590
+ "W4PWcOu2O2pRaZyoE2W80h",
1591
+ "RolbOnQLIftk0vy9mIcz5M",
1592
+ "8OPhUgbaNJmOdiq5D3a6vK",
1593
+ "5Qrt3jSvSrpK6Ne1hS6shL",
1594
+ "hTq7nFUrovN5Ao4u6dIYWZ",
1595
+ "I8WNZLpJ1DVP2wiCW7YBIB",
1596
+ "SawhU3I9BewSE1XBPstpNJ",
1597
+ "80EtLEl2FIE4FqbWnUD3nT"
1598
+ ],
1599
+ "version": 3
1600
+ },
1601
+ "kernelspec": {
1602
+ "display_name": "Python 3.8 - Pytorch and Tensorflow",
1603
+ "language": "python",
1604
+ "name": "python38-azureml-pt-tf"
1605
+ },
1606
+ "language_info": {
1607
+ "codemirror_mode": {
1608
+ "name": "ipython",
1609
+ "version": 3
1610
+ },
1611
+ "file_extension": ".py",
1612
+ "mimetype": "text/x-python",
1613
+ "name": "python",
1614
+ "nbconvert_exporter": "python",
1615
+ "pygments_lexer": "ipython3",
1616
+ "version": "3.8.5"
1617
+ },
1618
+ "microsoft": {
1619
+ "host": {
1620
+ "AzureML": {
1621
+ "notebookHasBeenCompleted": true
1622
+ }
1623
+ },
1624
+ "ms_spell_check": {
1625
+ "ms_spell_check_language": "en"
1626
+ }
1627
+ },
1628
+ "nteract": {
1629
+ "version": "nteract-front-end@1.0.0"
1630
+ }
1631
+ },
1632
+ "nbformat": 4,
1633
+ "nbformat_minor": 4
1634
+ }
notebooks/DAEDRA.ipynb ADDED
@@ -0,0 +1,671 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
7
+ "\n",
8
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
9
+ ],
10
+ "metadata": {}
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "source": [
15
+ "%pip install accelerate -U"
16
+ ],
17
+ "outputs": [
18
+ {
19
+ "output_type": "stream",
20
+ "name": "stdout",
21
+ "text": "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\nRequirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\nRequirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\nRequirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\nRequirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\nRequirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\nRequirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\nRequirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\nNote: you may need to restart the kernel to use updated packages.\n"
22
+ }
23
+ ],
24
+ "execution_count": 1,
25
+ "metadata": {
26
+ "gather": {
27
+ "logged": 1706475754655
28
+ },
29
+ "nteract": {
30
+ "transient": {
31
+ "deleting": false
32
+ }
33
+ },
34
+ "tags": []
35
+ }
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "source": [
40
+ "%pip install transformers datasets shap watermark wandb evaluate codecarbon"
41
+ ],
42
+ "outputs": [
43
+ {
44
+ "output_type": "stream",
45
+ "name": "stdout",
46
+ "text": "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\nRequirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\nRequirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\nRequirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\nRequirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\nRequirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\nRequirement already satisfied: codecarbon in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.3.3)\nRequirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\nRequirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\nRequirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\nRequirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\nRequirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\nRequirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\nRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\nRequirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\nRequirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\nRequirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\nRequirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\nRequirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\nRequirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\nRequirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\nRequirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages 
(from shap) (0.0.7)\nRequirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\nRequirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\nRequirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\nRequirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\nRequirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\nRequirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\nRequirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\nRequirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\nRequirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\nRequirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\nRequirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\nRequirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\nRequirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\nRequirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\nRequirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\nRequirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\nRequirement already satisfied: prometheus-client in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (0.19.0)\nRequirement already satisfied: py-cpuinfo in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (9.0.0)\nRequirement already satisfied: rapidfuzz in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (3.6.1)\nRequirement already satisfied: pynvml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (11.5.0)\nRequirement already satisfied: arrow in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (1.3.0)\nRequirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\nRequirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\nRequirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\nRequirement already satisfied: multidict<7.0,>=4.5 in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\nRequirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\nRequirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\nRequirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\nRequirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\nRequirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\nRequirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\nRequirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\nRequirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\nRequirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\nRequirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\nRequirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\nRequirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\nRequirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\nRequirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\nRequirement already satisfied: types-python-dateutil>=2.8.10 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.19.20240106)\nRequirement already satisfied: python-dateutil>=2.7.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.2)\nRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\nRequirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: tzdata>=2022.1 in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\nRequirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\nRequirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\nRequirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\nRequirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\nRequirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\nRequirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\nRequirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\nRequirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\nNote: you may need to restart the kernel to use updated packages.\n"
47
+ }
48
+ ],
49
+ "execution_count": 2,
50
+ "metadata": {
51
+ "nteract": {
52
+ "transient": {
53
+ "deleting": false
54
+ }
55
+ }
56
+ }
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "source": [
61
+ "import pandas as pd\n",
62
+ "import numpy as np\n",
63
+ "import torch\n",
64
+ "import os\n",
65
+ "from typing import List, Union\n",
66
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline, AutoModel\n",
67
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
68
+ "import shap\n",
69
+ "import wandb\n",
70
+ "import evaluate\n",
71
+ "import logging\n",
72
+ "\n",
73
+ "wandb.finish()\n",
74
+ "\n",
75
+ "\n",
76
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
77
+ "\n",
78
+ "%load_ext watermark"
79
+ ],
80
+ "outputs": [
81
+ {
82
+ "output_type": "stream",
83
+ "name": "stderr",
84
+ "text": "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\n2024-01-29 17:46:15.020290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\nTo enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n2024-01-29 17:46:16.031641: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n2024-01-29 17:46:16.031779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n2024-01-29 17:46:16.031793: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
85
+ }
86
+ ],
87
+ "execution_count": 3,
88
+ "metadata": {
89
+ "datalore": {
90
+ "hide_input_from_viewers": false,
91
+ "hide_output_from_viewers": false,
92
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
93
+ "report_properties": {
94
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
95
+ },
96
+ "type": "CODE"
97
+ },
98
+ "gather": {
99
+ "logged": 1706550378660
100
+ },
101
+ "tags": []
102
+ }
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "source": [
107
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
108
+ "\n",
109
+ "SEED: int = 42\n",
110
+ "\n",
111
+ "BATCH_SIZE: int = 32\n",
112
+ "EPOCHS: int = 5\n",
113
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
114
+ "\n",
115
+ "# WandB configuration\n",
116
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
117
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
118
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
119
+ ],
120
+ "outputs": [],
121
+ "execution_count": 4,
122
+ "metadata": {
123
+ "collapsed": false,
124
+ "gather": {
125
+ "logged": 1706550378812
126
+ },
127
+ "jupyter": {
128
+ "outputs_hidden": false
129
+ }
130
+ }
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "source": [
135
+ "%watermark --iversion"
136
+ ],
137
+ "outputs": [
138
+ {
139
+ "output_type": "stream",
140
+ "name": "stdout",
141
+ "text": "shap : 0.44.1\npandas : 2.0.2\nwandb : 0.16.2\nre : 2.2.1\nevaluate: 0.4.1\ntorch : 1.12.0\nnumpy : 1.23.5\nlogging : 0.5.1.2\n\n"
142
+ }
143
+ ],
144
+ "execution_count": 5,
145
+ "metadata": {
146
+ "collapsed": false,
147
+ "jupyter": {
148
+ "outputs_hidden": false
149
+ }
150
+ }
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "source": [
155
+ "!nvidia-smi"
156
+ ],
157
+ "outputs": [
158
+ {
159
+ "output_type": "stream",
160
+ "name": "stdout",
161
+ "text": "Mon Jan 29 17:46:18 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. |\r\n|=========================================+======================+======================|\r\n| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n| N/A 25C P0 23W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 2 Tesla V100-PCIE-16GB Off | 00000003:00:00.0 Off | Off |\r\n| N/A 25C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 3 Tesla V100-PCIE-16GB Off | 00000004:00:00.0 Off | Off |\r\n| N/A 27C P0 24W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n"
162
+ }
163
+ ],
164
+ "execution_count": 6,
165
+ "metadata": {
166
+ "datalore": {
167
+ "hide_input_from_viewers": true,
168
+ "hide_output_from_viewers": true,
169
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
170
+ "type": "CODE"
171
+ }
172
+ }
173
+ },
174
+ {
175
+ "cell_type": "markdown",
176
+ "source": [
177
+ "## Loading the data set"
178
+ ],
179
+ "metadata": {
180
+ "datalore": {
181
+ "hide_input_from_viewers": false,
182
+ "hide_output_from_viewers": false,
183
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
184
+ "report_properties": {
185
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
186
+ },
187
+ "type": "MD"
188
+ }
189
+ }
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "source": [
194
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
195
+ ],
196
+ "outputs": [],
197
+ "execution_count": 7,
198
+ "metadata": {
199
+ "collapsed": false,
200
+ "gather": {
201
+ "logged": 1706550381141
202
+ },
203
+ "jupyter": {
204
+ "outputs_hidden": false
205
+ }
206
+ }
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "source": [
211
+ "dataset"
212
+ ],
213
+ "outputs": [
214
+ {
215
+ "output_type": "execute_result",
216
+ "execution_count": 8,
217
+ "data": {
218
+ "text/plain": "DatasetDict({\n train: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 1270444\n })\n test: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n val: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n})"
219
+ },
220
+ "metadata": {}
221
+ }
222
+ ],
223
+ "execution_count": 8,
224
+ "metadata": {
225
+ "collapsed": false,
226
+ "gather": {
227
+ "logged": 1706550381303
228
+ },
229
+ "jupyter": {
230
+ "outputs_hidden": false,
231
+ "source_hidden": false
232
+ },
233
+ "nteract": {
234
+ "transient": {
235
+ "deleting": false
236
+ }
237
+ }
238
+ }
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "source": [
243
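+ "# Shuffle, then keep a SUBSAMPLING fraction of each split for faster experimentation\n",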
+ "SUBSAMPLING = 0.01\n",
244
+ "\n",
245
+ "if SUBSAMPLING < 1:\n",
246
+ " _ = DatasetDict()\n",
247
+ " for each in dataset.keys():\n",
248
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
249
+ "\n",
250
+ " dataset = _"
251
+ ],
252
+ "outputs": [],
253
+ "execution_count": 9,
254
+ "metadata": {
255
+ "gather": {
256
+ "logged": 1706550381472
257
+ }
258
+ }
259
+ },
260
+ {
261
+ "cell_type": "markdown",
262
+ "source": [
263
+ "## Tokenisation and encoding"
264
+ ],
265
+ "metadata": {}
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "source": [
270
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
271
+ " return ds_enc"
272
+ ],
273
+ "outputs": [],
274
+ "execution_count": 10,
275
+ "metadata": {
276
+ "gather": {
277
+ "logged": 1706550381637
278
+ }
279
+ }
280
+ },
281
+ {
282
+ "cell_type": "markdown",
283
+ "source": [
284
+ "## Evaluation metrics"
285
+ ],
286
+ "metadata": {}
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "source": [
291
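+ "# Load metric implementations from the Hugging Face evaluate library\n",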
+ "accuracy = evaluate.load(\"accuracy\")\n",
292
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
293
+ "f1 = evaluate.load(\"f1\")"
294
+ ],
295
+ "outputs": [],
296
+ "execution_count": 11,
297
+ "metadata": {
298
+ "gather": {
299
+ "logged": 1706550381778
300
+ }
301
+ }
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "source": [
306
+ "def compute_metrics(eval_pred):\n",
307
+ " predictions, labels = eval_pred\n",
308
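+ " # Convert logits to predicted class indices\n",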
+ " predictions = np.argmax(predictions, axis=1)\n",
309
+ " return {\n",
310
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
311
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
312
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
313
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
314
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
315
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
316
+ " }"
317
+ ],
318
+ "outputs": [],
319
+ "execution_count": 12,
320
+ "metadata": {
321
+ "gather": {
322
+ "logged": 1706550381891
323
+ }
324
+ }
325
+ },
326
+ {
327
+ "cell_type": "markdown",
328
+ "source": [
329
+ "## Training"
330
+ ],
331
+ "metadata": {}
332
+ },
333
+ {
334
+ "cell_type": "markdown",
335
+ "source": [
336
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
337
+ ],
338
+ "metadata": {}
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "source": [
343
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
344
+ ],
345
+ "outputs": [],
346
+ "execution_count": 13,
347
+ "metadata": {
348
+ "gather": {
349
+ "logged": 1706550382032
350
+ }
351
+ }
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "source": [
356
+ "def train_from_model(model_ckpt: str, push: bool = False):\n",
357
+ " print(f\"Initialising training based on {model_ckpt}...\")\n",
358
+ "\n",
359
+ " print(\"Tokenising...\")\n",
360
+ " tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
361
+ "\n",
362
+ " cols = dataset[\"train\"].column_names\n",
363
+ " cols.remove(\"label\")\n",
364
+ " ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True, max_length=512), batched=True, remove_columns=cols)\n",
365
+ "\n",
366
+ " print(\"Loading model...\")\n",
367
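+ " # Fall back to TensorFlow weights (from_tf=True) when the checkpoint lacks PyTorch weights\n",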
+ " try:\n",
368
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
369
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
370
+ " id2label=label_map, \n",
371
+ " label2id={v:k for k,v in label_map.items()})\n",
372
+ " except OSError:\n",
373
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
374
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
375
+ " id2label=label_map, \n",
376
+ " label2id={v:k for k,v in label_map.items()},\n",
377
+ " from_tf=True)\n",
378
+ "\n",
379
+ "\n",
380
+ " args = TrainingArguments(\n",
381
+ " output_dir=\"vaers\",\n",
382
+ " evaluation_strategy=\"epoch\",\n",
383
+ " save_strategy=\"epoch\",\n",
384
+ " learning_rate=2e-5,\n",
385
+ " per_device_train_batch_size=BATCH_SIZE,\n",
386
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
387
+ " num_train_epochs=EPOCHS,\n",
388
+ " weight_decay=.01,\n",
389
+ " logging_steps=1,\n",
390
+ " load_best_model_at_end=True,\n",
391
+ " run_name=f\"daedra-training\",\n",
392
+ " report_to=[\"wandb\"])\n",
393
+ "\n",
394
+ " trainer = Trainer(\n",
395
+ " model=model,\n",
396
+ " args=args,\n",
397
+ " train_dataset=ds_enc[\"train\"],\n",
398
+ " eval_dataset=ds_enc[\"test\"],\n",
399
+ " tokenizer=tokenizer,\n",
400
+ " compute_metrics=compute_metrics)\n",
401
+ " \n",
402
+ " if SUBSAMPLING != 1.0:\n",
403
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
404
+ " else:\n",
405
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
406
+ "\n",
407
+ " wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
408
+ " wandb_tag.append(f\"base:{model_ckpt}\")\n",
409
+ " \n",
410
+ " wandb.init(name=f\"daedra_{SUBSAMPLING}-{model_ckpt}\", tags=wandb_tag, magic=True)\n",
411
+ "\n",
412
+ " print(\"Starting training...\")\n",
413
+ "\n",
414
+ " trainer.train()\n",
415
+ "\n",
416
+ " print(\"Training finished.\")\n",
417
+ "\n",
418
+ " if push:\n",
419
+ " variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
420
+ " tokenizer._tokenizer.save(\"tokenizer.json\")\n",
421
+ " tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
422
+ " sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
423
+ "\n",
424
+ " model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
425
+ " variant=variant,\n",
426
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,}), based on {model_ckpt}\")"
427
+ ],
428
+ "outputs": [],
429
+ "execution_count": 14,
430
+ "metadata": {
431
+ "jupyter": {
432
+ "outputs_hidden": false,
433
+ "source_hidden": false
434
+ },
435
+ "nteract": {
436
+ "transient": {
437
+ "deleting": false
438
+ }
439
+ },
440
+ "gather": {
441
+ "logged": 1706550382160
442
+ }
443
+ }
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "source": [
448
+ "\n",
449
+ "base_models = [\n",
450
+ " \"bert-base-uncased\",\n",
451
+ " \"distilbert-base-uncased\",\n",
452
+ "]"
453
+ ],
454
+ "outputs": [],
455
+ "execution_count": 15,
456
+ "metadata": {
457
+ "gather": {
458
+ "logged": 1706550382318
459
+ }
460
+ }
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "source": [
465
+ "BATCH_SIZE=1\n",
466
+ "\n",
467
+ "train_from_model(\"biobert/Bio_ClinicalBERT/\")"
468
+ ],
469
+ "outputs": [
470
+ {
471
+ "output_type": "stream",
472
+ "name": "stdout",
473
+ "text": "Initialising training based on biobert/Bio_ClinicalBERT/...\nTokenising...\nLoading model...\n"
474
+ },
475
+ {
476
+ "output_type": "stream",
477
+ "name": "stderr",
478
+ "text": "Map: 100%|██████████| 2722/2722 [00:01<00:00, 2195.12 examples/s]\nAll TF 2.0 model weights were used when initializing BertForSequenceClassification.\n\nAll the weights of BertForSequenceClassification were initialized from the TF 2.0 model.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.\n"
479
+ },
480
+ {
481
+ "output_type": "display_data",
482
+ "data": {
483
+ "text/plain": "<IPython.core.display.HTML object>",
484
+ "text/html": "Finishing last run (ID:sg022tqh) before initializing another..."
485
+ },
486
+ "metadata": {}
487
+ },
488
+ {
489
+ "output_type": "display_data",
490
+ "data": {
491
+ "text/plain": "<IPython.core.display.HTML object>",
492
+ "text/html": " View run <strong style=\"color:#cdcd00\">daedra_0.01-biobert/Bio_ClinicalBERT/</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/sg022tqh' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/sg022tqh</a><br/> View job at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v6' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v6</a><br/>Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
493
+ },
494
+ "metadata": {}
495
+ },
496
+ {
497
+ "output_type": "display_data",
498
+ "data": {
499
+ "text/plain": "<IPython.core.display.HTML object>",
500
+ "text/html": "Find logs at: <code>./wandb/run-20240129_174816-sg022tqh/logs</code>"
501
+ },
502
+ "metadata": {}
503
+ },
504
+ {
505
+ "output_type": "display_data",
506
+ "data": {
507
+ "text/plain": "<IPython.core.display.HTML object>",
508
+ "text/html": "Successfully finished last run (ID:sg022tqh). Initializing new run:<br/>"
509
+ },
510
+ "metadata": {}
511
+ },
512
+ {
513
+ "output_type": "display_data",
514
+ "data": {
515
+ "text/plain": "<IPython.core.display.HTML object>",
516
+ "text/html": "Tracking run with wandb version 0.16.2"
517
+ },
518
+ "metadata": {}
519
+ },
520
+ {
521
+ "output_type": "display_data",
522
+ "data": {
523
+ "text/plain": "<IPython.core.display.HTML object>",
524
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240129_174936-kilkkg1j</code>"
525
+ },
526
+ "metadata": {}
527
+ },
528
+ {
529
+ "output_type": "display_data",
530
+ "data": {
531
+ "text/plain": "<IPython.core.display.HTML object>",
532
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kilkkg1j' target=\"_blank\">daedra_0.01-biobert/Bio_ClinicalBERT/</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
533
+ },
534
+ "metadata": {}
535
+ },
536
+ {
537
+ "output_type": "display_data",
538
+ "data": {
539
+ "text/plain": "<IPython.core.display.HTML object>",
540
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
541
+ },
542
+ "metadata": {}
543
+ },
544
+ {
545
+ "output_type": "display_data",
546
+ "data": {
547
+ "text/plain": "<IPython.core.display.HTML object>",
548
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kilkkg1j' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kilkkg1j</a>"
549
+ },
550
+ "metadata": {}
551
+ },
552
+ {
553
+ "output_type": "stream",
554
+ "name": "stdout",
555
+ "text": "Starting training...\n"
556
+ },
557
+ {
558
+ "output_type": "stream",
559
+ "name": "stderr",
560
+ "text": "Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"
561
+ },
562
+ {
563
+ "output_type": "display_data",
564
+ "data": {
565
+ "text/plain": "<IPython.core.display.HTML object>",
566
+ "text/html": "\n <div>\n \n <progress value='1496' max='15880' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [ 1496/15880 07:43 < 1:14:19, 3.23 it/s, Epoch 0.47/5]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
567
+ },
568
+ "metadata": {}
569
+ }
570
+ ],
571
+ "execution_count": 21,
572
+ "metadata": {
573
+ "gather": {
574
+ "logged": 1706551053473
575
+ }
576
+ }
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "source": [],
581
+ "outputs": [],
582
+ "execution_count": null,
583
+ "metadata": {
584
+ "jupyter": {
585
+ "source_hidden": false,
586
+ "outputs_hidden": false
587
+ },
588
+ "nteract": {
589
+ "transient": {
590
+ "deleting": false
591
+ }
592
+ }
593
+ }
594
+ }
595
+ ],
596
+ "metadata": {
597
+ "datalore": {
598
+ "base_environment": "default",
599
+ "computation_mode": "JUPYTER",
600
+ "package_manager": "pip",
601
+ "packages": [
602
+ {
603
+ "name": "datasets",
604
+ "source": "PIP",
605
+ "version": "2.16.1"
606
+ },
607
+ {
608
+ "name": "torch",
609
+ "source": "PIP",
610
+ "version": "2.1.2"
611
+ },
612
+ {
613
+ "name": "accelerate",
614
+ "source": "PIP",
615
+ "version": "0.26.1"
616
+ }
617
+ ],
618
+ "report_row_ids": [
619
+ "un8W7ez7ZwoGb5Co6nydEV",
620
+ "40nN9Hvgi1clHNV5RAemI5",
621
+ "TgRD90H5NSPpKS41OeXI1w",
622
+ "ZOm5BfUs3h1EGLaUkBGeEB",
623
+ "kOP0CZWNSk6vqE3wkPp7Vc",
624
+ "W4PWcOu2O2pRaZyoE2W80h",
625
+ "RolbOnQLIftk0vy9mIcz5M",
626
+ "8OPhUgbaNJmOdiq5D3a6vK",
627
+ "5Qrt3jSvSrpK6Ne1hS6shL",
628
+ "hTq7nFUrovN5Ao4u6dIYWZ",
629
+ "I8WNZLpJ1DVP2wiCW7YBIB",
630
+ "SawhU3I9BewSE1XBPstpNJ",
631
+ "80EtLEl2FIE4FqbWnUD3nT"
632
+ ],
633
+ "version": 3
634
+ },
635
+ "kernel_info": {
636
+ "name": "python38-azureml-pt-tf"
637
+ },
638
+ "kernelspec": {
639
+ "display_name": "azureml_py38_PT_TF",
640
+ "language": "python",
641
+ "name": "python3"
642
+ },
643
+ "language_info": {
644
+ "name": "python",
645
+ "version": "3.8.5",
646
+ "mimetype": "text/x-python",
647
+ "codemirror_mode": {
648
+ "name": "ipython",
649
+ "version": 3
650
+ },
651
+ "pygments_lexer": "ipython3",
652
+ "nbconvert_exporter": "python",
653
+ "file_extension": ".py"
654
+ },
655
+ "microsoft": {
656
+ "host": {
657
+ "AzureML": {
658
+ "notebookHasBeenCompleted": true
659
+ }
660
+ },
661
+ "ms_spell_check": {
662
+ "ms_spell_check_language": "en"
663
+ }
664
+ },
665
+ "nteract": {
666
+ "version": "nteract-front-end@1.0.0"
667
+ }
668
+ },
669
+ "nbformat": 4,
670
+ "nbformat_minor": 4
671
+ }
notebooks/DAEDRA.yml ADDED
File without changes
notebooks/Dataset preparation.ipynb ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# Dataset processing\n",
7
+ "\n",
8
+ "This notebook processes the raw csv outputs from VAERS into Huggingface datasets. It shouldn't generally need to be run by the end user. "
9
+ ],
10
+ "metadata": {
11
+ "collapsed": false
12
+ },
13
+ "id": "35523bbeb2e03eae"
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "outputs": [],
18
+ "source": [
19
+ "import pandas as pd\n",
20
+ "import datasets\n",
21
+ "import glob\n",
22
+ "import tqdm.notebook as tqdm\n",
23
+ "from sklearn.model_selection import train_test_split\n",
24
+ "from typing import Tuple\n",
25
+ "from datetime import datetime\n",
26
+ "\n",
27
+ "pd.set_option('future.no_silent_downcasting', True)"
28
+ ],
29
+ "metadata": {
30
+ "collapsed": false,
31
+ "ExecuteTime": {
32
+ "end_time": "2024-01-27T22:28:38.481853Z",
33
+ "start_time": "2024-01-27T22:28:38.458294Z"
34
+ }
35
+ },
36
+ "id": "9362802d64424442",
37
+ "execution_count": 15
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "outputs": [],
42
+ "source": [
43
+ "HF_URL: str = \"chrisvoncsefalvay/vaers-outcomes\"\n",
44
+ "\n",
45
+ "FLAG_COLUMNS: list = [\"DIED\", \"ER_VISIT\", \"HOSPITAL\", \"OFC_VISIT\", \"X_STAY\", \"DISABLE\"]\n",
46
+ "DEMOGRAPHIC_COLUMNS: list = [\"AGE_YRS\", \"SEX\"]\n",
47
+ "DERIVED_COLUMNS: list = [\"D_PRESENTED\"]\n",
48
+ "ID_COLUMNS: list = [\"VAERS_ID\"]\n",
49
+ "TEXT_COLUMNS: list = [\"SYMPTOM_TEXT\"]\n",
50
+ "\n",
51
+ "TEST_TRAIN_FRACTION: float = 0.3\n",
52
+ "TRAIN_VAL_FRACTION: float = 0.5"
53
+ ],
54
+ "metadata": {
55
+ "collapsed": false,
56
+ "ExecuteTime": {
57
+ "end_time": "2024-01-27T22:28:38.498974Z",
58
+ "start_time": "2024-01-27T22:28:38.486237Z"
59
+ }
60
+ },
61
+ "id": "34b77edf5a1fce96",
62
+ "execution_count": 16
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "source": [
67
+ "## Reading data files"
68
+ ],
69
+ "metadata": {
70
+ "collapsed": false
71
+ },
72
+ "id": "f5f84ddd06e9313e"
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "outputs": [],
77
+ "source": [
78
+ "def read_aggregate(pattern: str) -> pd.DataFrame:\n",
79
+ " files = glob.glob(f\"../data/{pattern}\")\n",
80
+ " dfs = []\n",
81
+ " for file in tqdm.tqdm(files):\n",
82
+ " dfs.append(pd.read_csv(file, encoding=\"latin-1\", low_memory=False))\n",
83
+ "\n",
84
+ " res = pd.concat(dfs, ignore_index=True)\n",
85
+ " \n",
86
+ " print(f\"Processed {len(dfs)} files for a total of {len(res)} records.\")\n",
87
+ " \n",
88
+ " return res"
89
+ ],
90
+ "metadata": {
91
+ "collapsed": false,
92
+ "ExecuteTime": {
93
+ "end_time": "2024-01-27T22:28:38.508227Z",
94
+ "start_time": "2024-01-27T22:28:38.500697Z"
95
+ }
96
+ },
97
+ "id": "a7772ed4b4b51868",
98
+ "execution_count": 17
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "outputs": [
103
+ {
104
+ "data": {
105
+ "text/plain": " 0%| | 0/1 [00:00<?, ?it/s]",
106
+ "application/vnd.jupyter.widget-view+json": {
107
+ "version_major": 2,
108
+ "version_minor": 0,
109
+ "model_id": "8a6919ed3c7e4c3a8885bb0991e856c7"
110
+ }
111
+ },
112
+ "metadata": {},
113
+ "output_type": "display_data"
114
+ },
115
+ {
116
+ "name": "stdout",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "Processed 1 files for a total of 105726 records.\n"
120
+ ]
121
+ }
122
+ ],
123
+ "source": [
124
+ "data = read_aggregate(\"*VAERSDATA.csv\")"
125
+ ],
126
+ "metadata": {
127
+ "collapsed": false,
128
+ "ExecuteTime": {
129
+ "end_time": "2024-01-27T22:28:39.567031Z",
130
+ "start_time": "2024-01-27T22:28:38.510939Z"
131
+ }
132
+ },
133
+ "id": "795e389489cbc6cf",
134
+ "execution_count": 18
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "outputs": [],
139
+ "source": [
140
+ "_keep: list = ID_COLUMNS + DEMOGRAPHIC_COLUMNS + TEXT_COLUMNS + FLAG_COLUMNS + [\"ER_ED_VISIT\"]\n",
141
+ "data = data[_keep]"
142
+ ],
143
+ "metadata": {
144
+ "collapsed": false,
145
+ "ExecuteTime": {
146
+ "end_time": "2024-01-27T22:28:39.603326Z",
147
+ "start_time": "2024-01-27T22:28:39.569131Z"
148
+ }
149
+ },
150
+ "id": "5297fca83e18b502",
151
+ "execution_count": 19
152
+ },
153
+ {
154
+ "cell_type": "markdown",
155
+ "source": [
156
+ "## Recoding\n",
157
+ "\n",
158
+ "We recode as follows:\n",
159
+ "\n",
160
+ "* For the outcome flags, `NaN` is recoded as `0` and `Y` is recoded as `1`.\n",
161
+ "* `ER_VISIT` and `ER_ED_VISIT` are coalesced into a single column called `ER_VISIT` that is `1`-valued if either is `1`-valued, otherwise it is `0`-valued. This is to manage the renaming of the column in the VAERS data.\n",
162
+ "* `NaN`s in the symptom text will drop the record."
163
+ ],
164
+ "metadata": {
165
+ "collapsed": false
166
+ },
167
+ "id": "9467a8081810458e"
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "outputs": [],
172
+ "source": [
173
+ "def recode(df: pd.DataFrame) -> pd.DataFrame:\n",
174
+ " for column in FLAG_COLUMNS + [\"ER_ED_VISIT\"]:\n",
175
+ " df[column] = df[column].replace(\"Y\", 1).fillna(0).astype(int)\n",
176
+ " \n",
177
+ " df['ER_VISIT'] = df[['ER_VISIT', 'ER_ED_VISIT']].max(axis=1)\n",
178
+ " \n",
179
+ " df = df.drop(columns=['ER_ED_VISIT'])\n",
180
+ " \n",
181
+ " df = df.dropna(subset=['SYMPTOM_TEXT'])\n",
182
+ " \n",
183
+ " return df"
184
+ ],
185
+ "metadata": {
186
+ "collapsed": false,
187
+ "ExecuteTime": {
188
+ "end_time": "2024-01-27T22:28:39.603731Z",
189
+ "start_time": "2024-01-27T22:28:39.590617Z"
190
+ }
191
+ },
192
+ "id": "9aad00c9fe40adb8",
193
+ "execution_count": 20
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "outputs": [],
198
+ "source": [],
199
+ "metadata": {
200
+ "collapsed": false,
201
+ "ExecuteTime": {
202
+ "end_time": "2024-01-27T22:28:39.604024Z",
203
+ "start_time": "2024-01-27T22:28:39.593891Z"
204
+ }
205
+ },
206
+ "id": "b0fdcab6ee807404",
207
+ "execution_count": 20
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "outputs": [],
212
+ "source": [
213
+ "data = recode(data)"
214
+ ],
215
+ "metadata": {
216
+ "collapsed": false,
217
+ "ExecuteTime": {
218
+ "end_time": "2024-01-27T22:28:39.665777Z",
219
+ "start_time": "2024-01-27T22:28:39.597946Z"
220
+ }
221
+ },
222
+ "id": "f23ee0eae1b70387",
223
+ "execution_count": 21
224
+ },
225
+ {
226
+ "cell_type": "markdown",
227
+ "source": [
228
+ "## Derived fields\n",
229
+ "\n",
230
+ "We create the derived field `D_PRESENTED`. This is to provide a shorthand for patients who present in any way: ER, hospitalisation, office visit. It also comprises patients whose hospital stay is extended (`X_STAY`) as this is typically the consequence of presenting."
231
+ ],
232
+ "metadata": {
233
+ "collapsed": false
234
+ },
235
+ "id": "1c2f6b4fc2ae630b"
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "outputs": [],
240
+ "source": [
241
+ "data['D_PRESENTED'] = data[['ER_VISIT', 'HOSPITAL', 'OFC_VISIT', 'X_STAY']].max(axis=1)"
242
+ ],
243
+ "metadata": {
244
+ "collapsed": false,
245
+ "ExecuteTime": {
246
+ "end_time": "2024-01-27T22:28:39.679534Z",
247
+ "start_time": "2024-01-27T22:28:39.667363Z"
248
+ }
249
+ },
250
+ "id": "678847c70756695e",
251
+ "execution_count": 22
252
+ },
253
+ {
254
+ "cell_type": "markdown",
255
+ "source": [
256
+ "## Test/train/validate split\n",
257
+ "\n",
258
+ "We do a stratified split by age quintile and gender into test, train and validate sets."
259
+ ],
260
+ "metadata": {
261
+ "collapsed": false
262
+ },
263
+ "id": "dae902b111c8ef3c"
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "outputs": [],
268
+ "source": [
269
+ "def stratified_split(df: pd.DataFrame, test_train_fraction: float, train_val_fraction: float, random_state: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:\n",
270
+ " df['AGE_QUINTILE'] = pd.qcut(df['AGE_YRS'], 5, labels = False)\n",
271
+ " df['STRATIFICATION_VARIABLE'] = df['SEX'].astype(str) + \"_\" + df['AGE_QUINTILE'].astype(str)\n",
272
+ " df = df.drop(columns=['AGE_QUINTILE'])\n",
273
+ " \n",
274
+ " _, train = train_test_split(df, train_size=test_train_fraction, random_state=random_state, stratify=df.STRATIFICATION_VARIABLE)\n",
275
+ " \n",
276
+ " val, test = train_test_split(_, train_size=train_val_fraction, random_state=random_state, stratify=_.STRATIFICATION_VARIABLE)\n",
277
+ " \n",
278
+ " train = train.drop(columns=\"STRATIFICATION_VARIABLE\")\n",
279
+ " val = val.drop(columns=\"STRATIFICATION_VARIABLE\")\n",
280
+ " test = test.drop(columns=\"STRATIFICATION_VARIABLE\") \n",
281
+ " \n",
282
+ " return train, test, val"
283
+ ],
284
+ "metadata": {
285
+ "collapsed": false,
286
+ "ExecuteTime": {
287
+ "end_time": "2024-01-27T22:28:39.680497Z",
288
+ "start_time": "2024-01-27T22:28:39.678055Z"
289
+ }
290
+ },
291
+ "id": "ddee47653c94ff02",
292
+ "execution_count": 23
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "outputs": [],
297
+ "source": [
298
+ "train, test, val = stratified_split(data, TEST_TRAIN_FRACTION, TRAIN_VAL_FRACTION)"
299
+ ],
300
+ "metadata": {
301
+ "collapsed": false,
302
+ "ExecuteTime": {
303
+ "end_time": "2024-01-27T22:28:39.863489Z",
304
+ "start_time": "2024-01-27T22:28:39.680464Z"
305
+ }
306
+ },
307
+ "id": "bb16aaad0127ef7d",
308
+ "execution_count": 24
309
+ },
310
+ {
311
+ "cell_type": "markdown",
312
+ "source": [
313
+ "## Converting to labels"
314
+ ],
315
+ "metadata": {
316
+ "collapsed": false
317
+ },
318
+ "id": "d61bfdc4a2879905"
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "outputs": [],
323
+ "source": [
324
+ "def convert_to_dataset(df: pd.DataFrame) -> datasets.Dataset:\n",
325
+ " df = df.loc[:, ID_COLUMNS + TEXT_COLUMNS + FLAG_COLUMNS + DERIVED_COLUMNS]\n",
326
+ " \n",
327
+ " # We create the labels – these have to be floats for multilabel classification that uses BCEWithLogitsLoss\n",
328
+ " df.loc[:, \"labels\"] = df[FLAG_COLUMNS + DERIVED_COLUMNS].values.astype(float).tolist()\n",
329
+ " \n",
330
+ " print(f\"Building dataset with the following label order: {' '.join(FLAG_COLUMNS + DERIVED_COLUMNS)}\")\n",
331
+ " \n",
332
+ " # We drop the flag columns\n",
333
+ " df = df.drop(columns=FLAG_COLUMNS).drop(columns=DERIVED_COLUMNS)\n",
334
+ " \n",
335
+ " # We rename the remaining columns\n",
336
+ " df = df.rename(columns={\"SYMPTOM_TEXT\": \"text\", \"VAERS_ID\": \"id\"})\n",
337
+ " \n",
338
+ " return datasets.Dataset.from_pandas(df, preserve_index=False)"
339
+ ],
340
+ "metadata": {
341
+ "collapsed": false,
342
+ "ExecuteTime": {
343
+ "end_time": "2024-01-27T22:28:39.867392Z",
344
+ "start_time": "2024-01-27T22:28:39.864829Z"
345
+ }
346
+ },
347
+ "id": "3d602444d33b7130",
348
+ "execution_count": 25
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "outputs": [
353
+ {
354
+ "name": "stdout",
355
+ "output_type": "stream",
356
+ "text": [
357
+ "Building dataset with the following label order: DIED ER_VISIT HOSPITAL OFC_VISIT X_STAY DISABLE D_PRESENTED\n",
358
+ "Building dataset with the following label order: DIED ER_VISIT HOSPITAL OFC_VISIT X_STAY DISABLE D_PRESENTED\n",
359
+ "Building dataset with the following label order: DIED ER_VISIT HOSPITAL OFC_VISIT X_STAY DISABLE D_PRESENTED\n"
360
+ ]
361
+ }
362
+ ],
363
+ "source": [
364
+ "ds = datasets.DatasetDict()\n",
365
+ "ds[\"train\"] = convert_to_dataset(train)\n",
366
+ "ds[\"test\"] = convert_to_dataset(test)\n",
367
+ "ds[\"val\"] = convert_to_dataset(val)"
368
+ ],
369
+ "metadata": {
370
+ "collapsed": false,
371
+ "ExecuteTime": {
372
+ "end_time": "2024-01-27T22:28:40.207548Z",
373
+ "start_time": "2024-01-27T22:28:39.872665Z"
374
+ }
375
+ },
376
+ "id": "e7c854a072956ca3",
377
+ "execution_count": 26
378
+ },
379
+ {
380
+ "cell_type": "markdown",
381
+ "source": [
382
+ "## Saving to Huggingface Hub"
383
+ ],
384
+ "metadata": {
385
+ "collapsed": false
386
+ },
387
+ "id": "ec0167c068238f5a"
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "outputs": [
392
+ {
393
+ "data": {
394
+ "text/plain": "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]",
395
+ "application/vnd.jupyter.widget-view+json": {
396
+ "version_major": 2,
397
+ "version_minor": 0,
398
+ "model_id": "c196be983bbc474186dad4b75347aebb"
399
+ }
400
+ },
401
+ "metadata": {},
402
+ "output_type": "display_data"
403
+ },
404
+ {
405
+ "data": {
406
+ "text/plain": "Creating parquet from Arrow format: 0%| | 0/74 [00:00<?, ?ba/s]",
407
+ "application/vnd.jupyter.widget-view+json": {
408
+ "version_major": 2,
409
+ "version_minor": 0,
410
+ "model_id": "9bb3cbdfa4e84b96a68929fc3326536d"
411
+ }
412
+ },
413
+ "metadata": {},
414
+ "output_type": "display_data"
415
+ },
416
+ {
417
+ "data": {
418
+ "text/plain": "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]",
419
+ "application/vnd.jupyter.widget-view+json": {
420
+ "version_major": 2,
421
+ "version_minor": 0,
422
+ "model_id": "66aa46f327264d7aa8f42f4a1bcf0775"
423
+ }
424
+ },
425
+ "metadata": {},
426
+ "output_type": "display_data"
427
+ },
428
+ {
429
+ "data": {
430
+ "text/plain": "Creating parquet from Arrow format: 0%| | 0/16 [00:00<?, ?ba/s]",
431
+ "application/vnd.jupyter.widget-view+json": {
432
+ "version_major": 2,
433
+ "version_minor": 0,
434
+ "model_id": "b14c57836adc4a3692f9594acc164ff0"
435
+ }
436
+ },
437
+ "metadata": {},
438
+ "output_type": "display_data"
439
+ },
440
+ {
441
+ "data": {
442
+ "text/plain": "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]",
443
+ "application/vnd.jupyter.widget-view+json": {
444
+ "version_major": 2,
445
+ "version_minor": 0,
446
+ "model_id": "d395ca6f2c9b4ee5bb49dbce3a9bd064"
447
+ }
448
+ },
449
+ "metadata": {},
450
+ "output_type": "display_data"
451
+ },
452
+ {
453
+ "data": {
454
+ "text/plain": "Creating parquet from Arrow format: 0%| | 0/16 [00:00<?, ?ba/s]",
455
+ "application/vnd.jupyter.widget-view+json": {
456
+ "version_major": 2,
457
+ "version_minor": 0,
458
+ "model_id": "71780ee50ab649338bfa217f1767cca7"
459
+ }
460
+ },
461
+ "metadata": {},
462
+ "output_type": "display_data"
463
+ },
464
+ {
465
+ "data": {
466
+ "text/plain": "README.md: 0%| | 0.00/94.0 [00:00<?, ?B/s]",
467
+ "application/vnd.jupyter.widget-view+json": {
468
+ "version_major": 2,
469
+ "version_minor": 0,
470
+ "model_id": "1983f75eccf044649ab6423cad68dfdc"
471
+ }
472
+ },
473
+ "metadata": {},
474
+ "output_type": "display_data"
475
+ },
476
+ {
477
+ "data": {
478
+ "text/plain": "CommitInfo(commit_url='https://huggingface.co/datasets/chrisvoncsefalvay/vaers-outcomes/commit/65fa5129a0b1eb64f8fdd1aca5490965810e4ddb', commit_message='Data set commit of 105238 records of VAERS data at 2024-01-27T15:28:40.206686.', commit_description='', oid='65fa5129a0b1eb64f8fdd1aca5490965810e4ddb', pr_url='https://huggingface.co/datasets/chrisvoncsefalvay/vaers-outcomes/discussions/1', pr_revision='refs/pr/1', pr_num=1)"
479
+ },
480
+ "execution_count": 27,
481
+ "metadata": {},
482
+ "output_type": "execute_result"
483
+ }
484
+ ],
485
+ "source": [
486
+ "commit_message = f\"\"\"Data set commit of {len(train) + len(test) + len(val)} records of VAERS data at {datetime.now().isoformat()}.\"\"\"\n",
487
+ "\n",
488
+ "ds.push_to_hub(HF_URL, \n",
489
+ " commit_message=commit_message,\n",
490
+ " create_pr=True)"
491
+ ],
492
+ "metadata": {
493
+ "collapsed": false,
494
+ "ExecuteTime": {
495
+ "end_time": "2024-01-27T22:28:45.264233Z",
496
+ "start_time": "2024-01-27T22:28:40.207690Z"
497
+ }
498
+ },
499
+ "id": "104ffca720a27624",
500
+ "execution_count": 27
501
+ }
502
+ ],
503
+ "metadata": {
504
+ "kernelspec": {
505
+ "display_name": "Python 3",
506
+ "language": "python",
507
+ "name": "python3"
508
+ },
509
+ "language_info": {
510
+ "codemirror_mode": {
511
+ "name": "ipython",
512
+ "version": 2
513
+ },
514
+ "file_extension": ".py",
515
+ "mimetype": "text/x-python",
516
+ "name": "python",
517
+ "nbconvert_exporter": "python",
518
+ "pygments_lexer": "ipython2",
519
+ "version": "2.7.6"
520
+ }
521
+ },
522
+ "nbformat": 4,
523
+ "nbformat_minor": 5
524
+ }
notebooks/Untitled.ipynb ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "8e9c5e9c-af14-4148-86bb-b04f18e4d13e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": []
10
+ }
11
+ ],
12
+ "metadata": {
13
+ "kernelspec": {
14
+ "display_name": "Python 3.8 - Pytorch and Tensorflow",
15
+ "language": "python",
16
+ "name": "python38-azureml-pt-tf"
17
+ },
18
+ "language_info": {
19
+ "codemirror_mode": {
20
+ "name": "ipython",
21
+ "version": 3
22
+ },
23
+ "file_extension": ".py",
24
+ "mimetype": "text/x-python",
25
+ "name": "python",
26
+ "nbconvert_exporter": "python",
27
+ "pygments_lexer": "ipython3",
28
+ "version": "3.8.5"
29
+ }
30
+ },
31
+ "nbformat": 4,
32
+ "nbformat_minor": 5
33
+ }
notebooks/comparisons.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f915ebb630ffd80319041ec728d9c7123b821d8f96b4745e909e937213832d21
3
+ size 11079466
notebooks/daedra.ipynb.amltmp ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs\n",
7
+ "\n",
8
+ "DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States."
9
+ ],
10
+ "metadata": {}
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "source": [
15
+ "%pip install accelerate -U"
16
+ ],
17
+ "outputs": [
18
+ {
19
+ "output_type": "stream",
20
+ "name": "stdout",
21
+ "text": "Requirement already satisfied: accelerate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.26.1)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.23.5)\nRequirement already satisfied: huggingface-hub in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.20.3)\nRequirement already satisfied: pyyaml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (6.0)\nRequirement already satisfied: psutil in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (5.9.5)\nRequirement already satisfied: torch>=1.10.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (1.12.0)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (23.1)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from accelerate) (0.4.2)\nRequirement already satisfied: typing_extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from torch>=1.10.0->accelerate) (4.6.3)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2.31.0)\nRequirement already satisfied: tqdm>=4.42.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (4.65.0)\nRequirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (2023.10.0)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from huggingface-hub->accelerate) (3.13.1)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (2023.5.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (1.26.16)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.4)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->huggingface-hub->accelerate) (3.1.0)\nNote: you may need to restart the kernel to use updated packages.\n"
22
+ }
23
+ ],
24
+ "execution_count": 1,
25
+ "metadata": {
26
+ "gather": {
27
+ "logged": 1706475754655
28
+ },
29
+ "nteract": {
30
+ "transient": {
31
+ "deleting": false
32
+ }
33
+ },
34
+ "tags": []
35
+ }
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "source": [
40
+ "%pip install transformers datasets shap watermark wandb evaluate codecarbon"
41
+ ],
42
+ "outputs": [
43
+ {
44
+ "output_type": "stream",
45
+ "name": "stdout",
46
+ "text": "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (4.37.1)\nRequirement already satisfied: datasets in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.16.1)\nRequirement already satisfied: shap in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.44.1)\nRequirement already satisfied: watermark in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.4.3)\nRequirement already satisfied: wandb in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.16.2)\nRequirement already satisfied: evaluate in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (0.4.1)\nRequirement already satisfied: codecarbon in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (2.3.3)\nRequirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2023.12.25)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (23.1)\nRequirement already satisfied: safetensors>=0.3.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.4.2)\nRequirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (6.0)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (1.23.5)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (3.13.1)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (2.31.0)\nRequirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.20.3)\nRequirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (4.65.0)\nRequirement already satisfied: tokenizers<0.19,>=0.14 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from transformers) (0.15.1)\nRequirement already satisfied: pyarrow>=8.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (9.0.0)\nRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2023.10.0)\nRequirement already satisfied: pyarrow-hotfix in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.6)\nRequirement already satisfied: dill<0.3.8,>=0.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.3.7)\nRequirement already satisfied: aiohttp in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.9.1)\nRequirement already satisfied: multiprocess in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (0.70.15)\nRequirement already satisfied: xxhash in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (3.4.1)\nRequirement already satisfied: pandas in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from datasets) (2.0.2)\nRequirement already satisfied: scipy in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.10.1)\nRequirement already satisfied: slicer==0.0.7 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages 
(from shap) (0.0.7)\nRequirement already satisfied: cloudpickle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (2.2.1)\nRequirement already satisfied: scikit-learn in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (1.2.2)\nRequirement already satisfied: numba in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from shap) (0.58.1)\nRequirement already satisfied: setuptools in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (65.6.3)\nRequirement already satisfied: ipython>=6.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (8.12.2)\nRequirement already satisfied: importlib-metadata>=1.4 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from watermark) (6.7.0)\nRequirement already satisfied: Click!=8.0.0,>=7.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (8.1.3)\nRequirement already satisfied: typing-extensions in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (4.6.3)\nRequirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.19.6)\nRequirement already satisfied: setproctitle in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.3.3)\nRequirement already satisfied: psutil>=5.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (5.9.5)\nRequirement already satisfied: sentry-sdk>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.39.2)\nRequirement already satisfied: docker-pycreds>=0.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (0.4.0)\nRequirement already satisfied: appdirs>=1.4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (1.4.4)\nRequirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from wandb) (3.1.31)\nRequirement already satisfied: responses<0.19 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from evaluate) (0.18.0)\nRequirement already satisfied: prometheus-client in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (0.19.0)\nRequirement already satisfied: py-cpuinfo in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (9.0.0)\nRequirement already satisfied: rapidfuzz in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (3.6.1)\nRequirement already satisfied: pynvml in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (11.5.0)\nRequirement already satisfied: arrow in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from codecarbon) (1.3.0)\nRequirement already satisfied: six>=1.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\nRequirement already satisfied: yarl<2.0,>=1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.9.4)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (4.0.3)\nRequirement already satisfied: frozenlist>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.4.1)\nRequirement already satisfied: multidict<7.0,>=4.5 in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (6.0.4)\nRequirement already satisfied: aiosignal>=1.1.2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1)\nRequirement already satisfied: attrs>=17.3.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from aiohttp->datasets) (23.1.0)\nRequirement already satisfied: gitdb<5,>=4.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from GitPython!=3.1.29,>=1.0.0->wandb) (4.0.10)\nRequirement already satisfied: zipp>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from importlib-metadata>=1.4->watermark) (3.15.0)\nRequirement already satisfied: pickleshare in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.7.5)\nRequirement already satisfied: backcall in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.2.0)\nRequirement already satisfied: stack-data in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.6.2)\nRequirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (3.0.30)\nRequirement already satisfied: matplotlib-inline in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.1.6)\nRequirement already satisfied: decorator in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.1.1)\nRequirement already satisfied: traitlets>=5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (5.9.0)\nRequirement already satisfied: jedi>=0.16 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (0.18.2)\nRequirement already satisfied: pygments>=2.4.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (2.15.1)\nRequirement already satisfied: pexpect>4.3 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from ipython>=6.0->watermark) (4.8.0)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (1.26.16)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.1.0)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (3.4)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from requests->transformers) (2023.5.7)\nRequirement already satisfied: types-python-dateutil>=2.8.10 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.19.20240106)\nRequirement already satisfied: python-dateutil>=2.7.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from arrow->codecarbon) (2.8.2)\nRequirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from numba->shap) (0.41.1)\nRequirement already satisfied: pytz>=2020.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: tzdata>=2022.1 in 
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pandas->datasets) (2023.3)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (3.1.0)\nRequirement already satisfied: joblib>=1.1.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from scikit-learn->shap) (1.2.0)\nRequirement already satisfied: smmap<6,>=3.0.1 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb) (5.0.0)\nRequirement already satisfied: parso<0.9.0,>=0.8.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.0->watermark) (0.8.3)\nRequirement already satisfied: ptyprocess>=0.5 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.0->watermark) (0.7.0)\nRequirement already satisfied: wcwidth in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.0->watermark) (0.2.6)\nRequirement already satisfied: asttokens>=2.1.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (2.2.1)\nRequirement already satisfied: pure-eval in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (0.2.2)\nRequirement already satisfied: executing>=1.2.0 in /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages (from stack-data->ipython>=6.0->watermark) (1.2.0)\nNote: you may need to restart the kernel to use updated packages.\n"
47
+ }
48
+ ],
49
+ "execution_count": 2,
50
+ "metadata": {
51
+ "nteract": {
52
+ "transient": {
53
+ "deleting": false
54
+ }
55
+ }
56
+ }
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "source": [
61
+ "import pandas as pd\n",
62
+ "import numpy as np\n",
63
+ "import torch\n",
64
+ "import os\n",
65
+ "from typing import List, Union\n",
66
+ "from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline, AutoModel\n",
67
+ "from datasets import load_dataset, Dataset, DatasetDict\n",
68
+ "import shap\n",
69
+ "import wandb\n",
70
+ "import evaluate\n",
71
+ "import logging\n",
72
+ "\n",
73
+ "wandb.finish()\n",
74
+ "\n",
75
+ "\n",
76
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
77
+ "\n",
78
+ "%load_ext watermark"
79
+ ],
80
+ "outputs": [
81
+ {
82
+ "output_type": "stream",
83
+ "name": "stderr",
84
+ "text": "/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\n2024-01-29 17:46:15.020290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\nTo enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n2024-01-29 17:46:16.031641: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n2024-01-29 17:46:16.031779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n2024-01-29 17:46:16.031793: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n"
85
+ }
86
+ ],
87
+ "execution_count": 3,
88
+ "metadata": {
89
+ "datalore": {
90
+ "hide_input_from_viewers": false,
91
+ "hide_output_from_viewers": false,
92
+ "node_id": "caZjjFP0OyQNMVgZDiwswE",
93
+ "report_properties": {
94
+ "rowId": "un8W7ez7ZwoGb5Co6nydEV"
95
+ },
96
+ "type": "CODE"
97
+ },
98
+ "gather": {
99
+ "logged": 1706550378660
100
+ },
101
+ "tags": []
102
+ }
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "source": [
107
+ "device: str = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
108
+ "\n",
109
+ "SEED: int = 42\n",
110
+ "\n",
111
+ "BATCH_SIZE: int = 32\n",
112
+ "EPOCHS: int = 5\n",
113
+ "model_ckpt: str = \"distilbert-base-uncased\"\n",
114
+ "\n",
115
+ "# WandB configuration\n",
116
+ "os.environ[\"WANDB_PROJECT\"] = \"DAEDRA multiclass model training\" \n",
117
+ "os.environ[\"WANDB_LOG_MODEL\"] = \"checkpoint\" # log all model checkpoints\n",
118
+ "os.environ[\"WANDB_NOTEBOOK_NAME\"] = \"DAEDRA.ipynb\""
119
+ ],
120
+ "outputs": [],
121
+ "execution_count": 4,
122
+ "metadata": {
123
+ "collapsed": false,
124
+ "gather": {
125
+ "logged": 1706550378812
126
+ },
127
+ "jupyter": {
128
+ "outputs_hidden": false
129
+ }
130
+ }
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "source": [
135
+ "%watermark --iversion"
136
+ ],
137
+ "outputs": [
138
+ {
139
+ "output_type": "stream",
140
+ "name": "stdout",
141
+ "text": "shap : 0.44.1\npandas : 2.0.2\nwandb : 0.16.2\nre : 2.2.1\nevaluate: 0.4.1\ntorch : 1.12.0\nnumpy : 1.23.5\nlogging : 0.5.1.2\n\n"
142
+ }
143
+ ],
144
+ "execution_count": 5,
145
+ "metadata": {
146
+ "collapsed": false,
147
+ "jupyter": {
148
+ "outputs_hidden": false
149
+ }
150
+ }
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "source": [
155
+ "!nvidia-smi"
156
+ ],
157
+ "outputs": [
158
+ {
159
+ "output_type": "stream",
160
+ "name": "stdout",
161
+ "text": "Mon Jan 29 17:46:18 2024 \r\n+---------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |\r\n|-----------------------------------------+----------------------+----------------------+\r\n| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\r\n| | | MIG M. |\r\n|=========================================+======================+======================|\r\n| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |\r\n| N/A 26C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |\r\n| N/A 25C P0 23W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 2 Tesla V100-PCIE-16GB Off | 00000003:00:00.0 Off | Off |\r\n| N/A 25C P0 25W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n| 3 Tesla V100-PCIE-16GB Off | 00000004:00:00.0 Off | Off |\r\n| N/A 27C P0 24W / 250W | 4MiB / 16384MiB | 0% Default |\r\n| | | N/A |\r\n+-----------------------------------------+----------------------+----------------------+\r\n \r\n+---------------------------------------------------------------------------------------+\r\n| Processes: |\r\n| GPU GI CI PID Type Process name GPU Memory |\r\n| ID ID Usage |\r\n|=======================================================================================|\r\n| No running processes found |\r\n+---------------------------------------------------------------------------------------+\r\n"
162
+ }
163
+ ],
164
+ "execution_count": 6,
165
+ "metadata": {
166
+ "datalore": {
167
+ "hide_input_from_viewers": true,
168
+ "hide_output_from_viewers": true,
169
+ "node_id": "UU2oOJhwbIualogG1YyCMd",
170
+ "type": "CODE"
171
+ }
172
+ }
173
+ },
174
+ {
175
+ "cell_type": "markdown",
176
+ "source": [
177
+ "## Loading the data set"
178
+ ],
179
+ "metadata": {
180
+ "datalore": {
181
+ "hide_input_from_viewers": false,
182
+ "hide_output_from_viewers": false,
183
+ "node_id": "t45KHugmcPVaO0nuk8tGJ9",
184
+ "report_properties": {
185
+ "rowId": "40nN9Hvgi1clHNV5RAemI5"
186
+ },
187
+ "type": "MD"
188
+ }
189
+ }
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "source": [
194
+ "dataset = load_dataset(\"chrisvoncsefalvay/vaers-outcomes\")"
195
+ ],
196
+ "outputs": [],
197
+ "execution_count": 7,
198
+ "metadata": {
199
+ "collapsed": false,
200
+ "gather": {
201
+ "logged": 1706550381141
202
+ },
203
+ "jupyter": {
204
+ "outputs_hidden": false
205
+ }
206
+ }
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "source": [
211
+ "dataset"
212
+ ],
213
+ "outputs": [
214
+ {
215
+ "output_type": "execute_result",
216
+ "execution_count": 8,
217
+ "data": {
218
+ "text/plain": "DatasetDict({\n train: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 1270444\n })\n test: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n val: Dataset({\n features: ['id', 'text', 'label'],\n num_rows: 272238\n })\n})"
219
+ },
220
+ "metadata": {}
221
+ }
222
+ ],
223
+ "execution_count": 8,
224
+ "metadata": {
225
+ "collapsed": false,
226
+ "gather": {
227
+ "logged": 1706550381303
228
+ },
229
+ "jupyter": {
230
+ "outputs_hidden": false,
231
+ "source_hidden": false
232
+ },
233
+ "nteract": {
234
+ "transient": {
235
+ "deleting": false
236
+ }
237
+ }
238
+ }
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "source": [
243
+ "SUBSAMPLING = 0.01\n",
244
+ "\n",
245
+ "if SUBSAMPLING < 1:\n",
246
+ " _ = DatasetDict()\n",
247
+ " for each in dataset.keys():\n",
248
+ " _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))\n",
249
+ "\n",
250
+ " dataset = _"
251
+ ],
252
+ "outputs": [],
253
+ "execution_count": 9,
254
+ "metadata": {
255
+ "gather": {
256
+ "logged": 1706550381472
257
+ }
258
+ }
259
+ },
260
+ {
261
+ "cell_type": "markdown",
262
+ "source": [
263
+ "## Tokenisation and encoding"
264
+ ],
265
+ "metadata": {}
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "source": [
270
+ "def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:\n",
271
+ " return ds_enc"
272
+ ],
273
+ "outputs": [],
274
+ "execution_count": 10,
275
+ "metadata": {
276
+ "gather": {
277
+ "logged": 1706550381637
278
+ }
279
+ }
280
+ },
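+ {
+ "cell_type": "code",
+ "source": [
+ "# Sketch, illustrative rather than required: materialise a tokenised copy of the\n",
+ "# dataset with the default checkpoint. train_from_model() below performs the\n",
+ "# same tokenisation internally.\n",
+ "ds_enc = encode_ds(dataset)"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },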
281
+ {
282
+ "cell_type": "markdown",
283
+ "source": [
284
+ "## Evaluation metrics"
285
+ ],
286
+ "metadata": {}
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "source": [
291
+ "accuracy = evaluate.load(\"accuracy\")\n",
292
+ "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
293
+ "f1 = evaluate.load(\"f1\")"
294
+ ],
295
+ "outputs": [],
296
+ "execution_count": 11,
297
+ "metadata": {
298
+ "gather": {
299
+ "logged": 1706550381778
300
+ }
301
+ }
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "source": [
306
+ "def compute_metrics(eval_pred):\n",
307
+ " predictions, labels = eval_pred\n",
308
+ " predictions = np.argmax(predictions, axis=1)\n",
309
+ " return {\n",
310
+ " 'accuracy': accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"],\n",
311
+ " 'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')[\"precision\"],\n",
312
+ " 'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')[\"precision\"],\n",
313
+ " 'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')[\"recall\"],\n",
314
+ " 'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')[\"recall\"],\n",
315
+ " 'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')[\"f1\"]\n",
316
+ " }"
317
+ ],
318
+ "outputs": [],
319
+ "execution_count": 12,
320
+ "metadata": {
321
+ "gather": {
322
+ "logged": 1706550381891
323
+ }
324
+ }
325
+ },
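+ {
+ "cell_type": "code",
+ "source": [
+ "# Sanity-check sketch with fabricated logits and labels for a three-class toy\n",
+ "# case: argmax recovers classes [0, 1, 2], so every metric should come out as 1.0.\n",
+ "_logits = np.array([[2.0, 0.1, 0.3], [0.2, 1.5, 0.1], [0.1, 0.2, 1.9]])\n",
+ "_labels = np.array([0, 1, 2])\n",
+ "compute_metrics((_logits, _labels))"
+ ],
+ "outputs": [],
+ "execution_count": null,
+ "metadata": {}
+ },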
326
+ {
327
+ "cell_type": "markdown",
328
+ "source": [
329
+ "## Training"
330
+ ],
331
+ "metadata": {}
332
+ },
333
+ {
334
+ "cell_type": "markdown",
335
+ "source": [
336
+ "We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :("
337
+ ],
338
+ "metadata": {}
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "source": [
343
+ "label_map = {i: label for i, label in enumerate(dataset[\"test\"].features[\"label\"].names)}"
344
+ ],
345
+ "outputs": [],
346
+ "execution_count": 13,
347
+ "metadata": {
348
+ "gather": {
349
+ "logged": 1706550382032
350
+ }
351
+ }
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "source": [
356
+ "def train_from_model(model_ckpt: str, push: bool = False):\n",
357
+ " print(f\"Initialising training based on {model_ckpt}...\")\n",
358
+ "\n",
359
+ " print(\"Tokenising...\")\n",
360
+ " tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
361
+ "\n",
362
+ " cols = dataset[\"train\"].column_names\n",
363
+ " cols.remove(\"label\")\n",
364
+ " ds_enc = dataset.map(lambda x: tokenizer(x[\"text\"], truncation=True, max_length=512), batched=True, remove_columns=cols)\n",
365
+ "\n",
366
+ " print(\"Loading model...\")\n",
367
+ " try:\n",
368
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
369
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
370
+ " id2label=label_map, \n",
371
+ " label2id={v:k for k,v in label_map.items()})\n",
372
+ " except OSError:\n",
373
+ " model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, \n",
374
+ " num_labels=len(dataset[\"test\"].features[\"label\"].names), \n",
375
+ " id2label=label_map, \n",
376
+ " label2id={v:k for k,v in label_map.items()},\n",
377
+ " from_tf=True)\n",
378
+ "\n",
379
+ "\n",
380
+ " args = TrainingArguments(\n",
381
+ " output_dir=\"vaers\",\n",
382
+ " evaluation_strategy=\"epoch\",\n",
383
+ " save_strategy=\"epoch\",\n",
384
+ " learning_rate=2e-5,\n",
385
+ " per_device_train_batch_size=BATCH_SIZE,\n",
386
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
387
+ " num_train_epochs=EPOCHS,\n",
388
+ " weight_decay=.01,\n",
389
+ " logging_steps=1,\n",
390
+ " load_best_model_at_end=True,\n",
391
+ " run_name=f\"daedra-training\",\n",
392
+ " report_to=[\"wandb\"])\n",
393
+ "\n",
394
+ " trainer = Trainer(\n",
395
+ " model=model,\n",
396
+ " args=args,\n",
397
+ " train_dataset=ds_enc[\"train\"],\n",
398
+ " eval_dataset=ds_enc[\"test\"],\n",
399
+ " tokenizer=tokenizer,\n",
400
+ " compute_metrics=compute_metrics)\n",
401
+ " \n",
402
+ " if SUBSAMPLING != 1.0:\n",
403
+ " wandb_tag: List[str] = [f\"subsample-{SUBSAMPLING}\"]\n",
404
+ " else:\n",
405
+ " wandb_tag: List[str] = [f\"full_sample\"]\n",
406
+ "\n",
407
+ " wandb_tag.append(f\"batch_size-{BATCH_SIZE}\")\n",
408
+ " wandb_tag.append(f\"base:{model_ckpt}\")\n",
409
+ " \n",
410
+ " wandb.init(name=f\"daedra_{SUBSAMPLING}-{model_ckpt}\", tags=wandb_tag, magic=True)\n",
411
+ "\n",
412
+ " print(\"Starting training...\")\n",
413
+ "\n",
414
+ " trainer.train()\n",
415
+ "\n",
416
+ " print(\"Training finished.\")\n",
417
+ "\n",
418
+ " if push:\n",
419
+ " variant = \"full_sample\" if SUBSAMPLING == 1.0 else f\"subsample-{SUBSAMPLING}\"\n",
420
+ " tokenizer._tokenizer.save(\"tokenizer.json\")\n",
421
+ " tokenizer.push_to_hub(\"chrisvoncsefalvay/daedra\")\n",
422
+ " sample = \"full sample\" if SUBSAMPLING == 1.0 else f\"{SUBSAMPLING * 100}% of the full sample\"\n",
423
+ "\n",
424
+ " model.push_to_hub(\"chrisvoncsefalvay/daedra\", \n",
425
+ " variant=variant,\n",
426
+ " commit_message=f\"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,}), based on {model_ckpt}\")"
427
+ ],
428
+ "outputs": [],
429
+ "execution_count": 14,
430
+ "metadata": {
431
+ "jupyter": {
432
+ "outputs_hidden": false,
433
+ "source_hidden": false
434
+ },
435
+ "nteract": {
436
+ "transient": {
437
+ "deleting": false
438
+ }
439
+ },
440
+ "gather": {
441
+ "logged": 1706550382160
442
+ }
443
+ }
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "source": [
448
+ "\n",
449
+ "base_models = [\n",
450
+ " \"bert-base-uncased\",\n",
451
+ " \"distilbert-base-uncased\",\n",
452
+ "]"
453
+ ],
454
+ "outputs": [],
455
+ "execution_count": 15,
456
+ "metadata": {
457
+ "gather": {
458
+ "logged": 1706550382318
459
+ }
460
+ }
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "source": [
465
+ "BATCH_SIZE=1\n",
466
+ "\n",
467
+ "train_from_model(\"biobert/Bio_ClinicalBERT/\")"
468
+ ],
469
+ "outputs": [
470
+ {
471
+ "output_type": "stream",
472
+ "name": "stdout",
473
+ "text": "Initialising training based on biobert/Bio_ClinicalBERT/...\nTokenising...\nLoading model...\n"
474
+ },
475
+ {
476
+ "output_type": "stream",
477
+ "name": "stderr",
478
+ "text": "Map: 100%|██████████| 2722/2722 [00:01<00:00, 2195.12 examples/s]\nAll TF 2.0 model weights were used when initializing BertForSequenceClassification.\n\nAll the weights of BertForSequenceClassification were initialized from the TF 2.0 model.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.\n"
479
+ },
480
+ {
481
+ "output_type": "display_data",
482
+ "data": {
483
+ "text/plain": "<IPython.core.display.HTML object>",
484
+ "text/html": "Finishing last run (ID:sg022tqh) before initializing another..."
485
+ },
486
+ "metadata": {}
487
+ },
488
+ {
489
+ "output_type": "display_data",
490
+ "data": {
491
+ "text/plain": "<IPython.core.display.HTML object>",
492
+ "text/html": " View run <strong style=\"color:#cdcd00\">daedra_0.01-biobert/Bio_ClinicalBERT/</strong> at: <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/sg022tqh' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/sg022tqh</a><br/> View job at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v6' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjEzNDcyMTQwMw==/version_details/v6</a><br/>Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
493
+ },
494
+ "metadata": {}
495
+ },
496
+ {
497
+ "output_type": "display_data",
498
+ "data": {
499
+ "text/plain": "<IPython.core.display.HTML object>",
500
+ "text/html": "Find logs at: <code>./wandb/run-20240129_174816-sg022tqh/logs</code>"
501
+ },
502
+ "metadata": {}
503
+ },
504
+ {
505
+ "output_type": "display_data",
506
+ "data": {
507
+ "text/plain": "<IPython.core.display.HTML object>",
508
+ "text/html": "Successfully finished last run (ID:sg022tqh). Initializing new run:<br/>"
509
+ },
510
+ "metadata": {}
511
+ },
512
+ {
513
+ "output_type": "display_data",
514
+ "data": {
515
+ "text/plain": "<IPython.core.display.HTML object>",
516
+ "text/html": "Tracking run with wandb version 0.16.2"
517
+ },
518
+ "metadata": {}
519
+ },
520
+ {
521
+ "output_type": "display_data",
522
+ "data": {
523
+ "text/plain": "<IPython.core.display.HTML object>",
524
+ "text/html": "Run data is saved locally in <code>/mnt/batch/tasks/shared/LS_root/mounts/clusters/daedra-hptrain-cvc/code/Users/kristof.csefalvay/daedra/notebooks/wandb/run-20240129_174936-kilkkg1j</code>"
525
+ },
526
+ "metadata": {}
527
+ },
528
+ {
529
+ "output_type": "display_data",
530
+ "data": {
531
+ "text/plain": "<IPython.core.display.HTML object>",
532
+ "text/html": "Syncing run <strong><a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kilkkg1j' target=\"_blank\">daedra_0.01-biobert/Bio_ClinicalBERT/</a></strong> to <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
533
+ },
534
+ "metadata": {}
535
+ },
536
+ {
537
+ "output_type": "display_data",
538
+ "data": {
539
+ "text/plain": "<IPython.core.display.HTML object>",
540
+ "text/html": " View project at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training</a>"
541
+ },
542
+ "metadata": {}
543
+ },
544
+ {
545
+ "output_type": "display_data",
546
+ "data": {
547
+ "text/plain": "<IPython.core.display.HTML object>",
548
+ "text/html": " View run at <a href='https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kilkkg1j' target=\"_blank\">https://wandb.ai/chrisvoncsefalvay/DAEDRA%20multiclass%20model%20training/runs/kilkkg1j</a>"
549
+ },
550
+ "metadata": {}
551
+ },
552
+ {
553
+ "output_type": "stream",
554
+ "name": "stdout",
555
+ "text": "Starting training...\n"
556
+ },
557
+ {
558
+ "output_type": "stream",
559
+ "name": "stderr",
560
+ "text": "Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"
561
+ },
562
+ {
563
+ "output_type": "display_data",
564
+ "data": {
565
+ "text/plain": "<IPython.core.display.HTML object>",
566
+ "text/html": "\n <div>\n \n <progress value='1496' max='15880' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [ 1496/15880 07:43 < 1:14:19, 3.23 it/s, Epoch 0.47/5]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Epoch</th>\n <th>Training Loss</th>\n <th>Validation Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
567
+ },
568
+ "metadata": {}
569
+ }
570
+ ],
571
+ "execution_count": 21,
572
+ "metadata": {
573
+ "gather": {
574
+ "logged": 1706551053473
575
+ }
576
+ }
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "source": [],
581
+ "outputs": [],
582
+ "execution_count": null,
583
+ "metadata": {
584
+ "jupyter": {
585
+ "source_hidden": false,
586
+ "outputs_hidden": false
587
+ },
588
+ "nteract": {
589
+ "transient": {
590
+ "deleting": false
591
+ }
592
+ }
593
+ }
594
+ }
595
+ ],
596
+ "metadata": {
597
+ "datalore": {
598
+ "base_environment": "default",
599
+ "computation_mode": "JUPYTER",
600
+ "package_manager": "pip",
601
+ "packages": [
602
+ {
603
+ "name": "datasets",
604
+ "source": "PIP",
605
+ "version": "2.16.1"
606
+ },
607
+ {
608
+ "name": "torch",
609
+ "source": "PIP",
610
+ "version": "2.1.2"
611
+ },
612
+ {
613
+ "name": "accelerate",
614
+ "source": "PIP",
615
+ "version": "0.26.1"
616
+ }
617
+ ],
618
+ "report_row_ids": [
619
+ "un8W7ez7ZwoGb5Co6nydEV",
620
+ "40nN9Hvgi1clHNV5RAemI5",
621
+ "TgRD90H5NSPpKS41OeXI1w",
622
+ "ZOm5BfUs3h1EGLaUkBGeEB",
623
+ "kOP0CZWNSk6vqE3wkPp7Vc",
624
+ "W4PWcOu2O2pRaZyoE2W80h",
625
+ "RolbOnQLIftk0vy9mIcz5M",
626
+ "8OPhUgbaNJmOdiq5D3a6vK",
627
+ "5Qrt3jSvSrpK6Ne1hS6shL",
628
+ "hTq7nFUrovN5Ao4u6dIYWZ",
629
+ "I8WNZLpJ1DVP2wiCW7YBIB",
630
+ "SawhU3I9BewSE1XBPstpNJ",
631
+ "80EtLEl2FIE4FqbWnUD3nT"
632
+ ],
633
+ "version": 3
634
+ },
635
+ "kernel_info": {
636
+ "name": "python38-azureml-pt-tf"
637
+ },
638
+ "kernelspec": {
639
+ "display_name": "azureml_py38_PT_TF",
640
+ "language": "python",
641
+ "name": "python3"
642
+ },
643
+ "language_info": {
644
+ "name": "python",
645
+ "version": "3.8.5",
646
+ "mimetype": "text/x-python",
647
+ "codemirror_mode": {
648
+ "name": "ipython",
649
+ "version": 3
650
+ },
651
+ "pygments_lexer": "ipython3",
652
+ "nbconvert_exporter": "python",
653
+ "file_extension": ".py"
654
+ },
655
+ "microsoft": {
656
+ "host": {
657
+ "AzureML": {
658
+ "notebookHasBeenCompleted": true
659
+ }
660
+ },
661
+ "ms_spell_check": {
662
+ "ms_spell_check_language": "en"
663
+ }
664
+ },
665
+ "nteract": {
666
+ "version": "nteract-front-end@1.0.0"
667
+ }
668
+ },
669
+ "nbformat": 4,
670
+ "nbformat_minor": 4
671
+ }
notebooks/daedra.py ADDED
@@ -0,0 +1,134 @@
+ import pandas as pd
+ import numpy as np
+ import torch
+ import os
+ from typing import List, Union
+ from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline, AutoModel
+ from datasets import load_dataset, Dataset, DatasetDict
+ import shap
+ import wandb
+ import evaluate
+ import logging
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ SEED: int = 42
+
+ BATCH_SIZE: int = 16
+ EPOCHS: int = 3
+ SUBSAMPLING: float = 0.1
+
+ # WandB configuration
+ os.environ["WANDB_PROJECT"] = "DAEDRA multiclass model training"
+ os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints
+ os.environ["WANDB_NOTEBOOK_NAME"] = "DAEDRA.ipynb"
+
+ dataset = load_dataset("chrisvoncsefalvay/vaers-outcomes")
+
+ if SUBSAMPLING < 1:
+     _ = DatasetDict()
+     for each in dataset.keys():
+         _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))
+
+     dataset = _
+
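+ # Evaluation metrics: accuracy plus macro- and micro-averaged precision and recall,
+ # and micro-averaged F1, all computed via the `evaluate` library.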
+ accuracy = evaluate.load("accuracy")
+ precision, recall = evaluate.load("precision"), evaluate.load("recall")
+ f1 = evaluate.load("f1")
+
+ def compute_metrics(eval_pred):
+     predictions, labels = eval_pred
+     predictions = np.argmax(predictions, axis=1)
+     return {
+         'accuracy': accuracy.compute(predictions=predictions, references=labels)["accuracy"],
+         'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')["precision"],
+         'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')["precision"],
+         'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')["recall"],
+         'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')["recall"],
+         'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')["f1"]
+     }
+
+ label_map = {i: label for i, label in enumerate(dataset["test"].features["label"].names)}
+
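+ # Fine-tunes a sequence classification head on the VAERS outcomes dataset,
+ # starting from the given base checkpoint.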
+ def train_from_model(model_ckpt: str, push: bool = False):
+     print(f"Initialising training based on {model_ckpt}...")
+
+     print("Tokenising...")
+     tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+
+     cols = dataset["train"].column_names
+     cols.remove("label")
+     ds_enc = dataset.map(lambda x: tokenizer(x["text"], truncation=True, max_length=512), batched=True, remove_columns=cols)
+
+     print("Loading model...")
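+     # Some checkpoints ship TensorFlow weights only (cf. the "All TF 2.0 model weights
+     # were used" notice in the training logs above); retry with from_tf=True if the
+     # PyTorch load raises OSError.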
+     try:
+         model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
+                                                                    num_labels=len(dataset["test"].features["label"].names),
+                                                                    id2label=label_map,
+                                                                    label2id={v: k for k, v in label_map.items()})
+     except OSError:
+         model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
+                                                                    num_labels=len(dataset["test"].features["label"].names),
+                                                                    id2label=label_map,
+                                                                    label2id={v: k for k, v in label_map.items()},
+                                                                    from_tf=True)
+
+
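+     # Comparison-run hyperparameters: evaluate every 100 steps, log to W&B at every
+     # step, checkpoint once per epoch.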
+     args = TrainingArguments(
+         output_dir="vaers",
+         evaluation_strategy="steps",
+         eval_steps=100,
+         save_strategy="epoch",
+         learning_rate=2e-5,
+         per_device_train_batch_size=BATCH_SIZE,
+         per_device_eval_batch_size=BATCH_SIZE,
+         num_train_epochs=EPOCHS,
+         weight_decay=.01,
+         logging_steps=1,
+         run_name=f"daedra-minisample-comparison-{SUBSAMPLING}",
+         report_to=["wandb"])
+
+     trainer = Trainer(
+         model=model,
+         args=args,
+         train_dataset=ds_enc["train"],
+         eval_dataset=ds_enc["test"],
+         tokenizer=tokenizer,
+         compute_metrics=compute_metrics)
+
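+     # Tag the W&B run so subsampled and full-sample runs are easy to tell apart.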
+     if SUBSAMPLING != 1.0:
+         wandb_tag: List[str] = [f"subsample-{SUBSAMPLING}"]
+     else:
+         wandb_tag: List[str] = ["full_sample"]
+
+     wandb_tag.append(f"batch_size-{BATCH_SIZE}")
+     wandb_tag.append(f"base:{model_ckpt}")
+
+     if "/" in model_ckpt:
+         sanitised_model_name = model_ckpt.split("/")[1]
+     else:
+         sanitised_model_name = model_ckpt
+
+     wandb.init(name=f"daedra_{SUBSAMPLING}-{sanitised_model_name}", tags=wandb_tag, magic=True)
+
+     print("Starting training...")
+
+     trainer.train()
+
+     print("Training finished.")
+
+     wandb.finish()
+
+ if __name__ == "__main__":
+     wandb.finish()
+
+     for mname in (
+         # "dmis-lab/biobert-base-cased-v1.2",
+         "emilyalsentzer/Bio_ClinicalBERT",
+         "bert-base-uncased",
+         "distilbert-base-uncased"
+     ):
+         print(f"Now training on subsample with {mname}...")
+         train_from_model(mname)
notebooks/daedra_final_training.py.amltmp ADDED
@@ -0,0 +1,136 @@
+ import pandas as pd
+ import numpy as np
+ import torch
+ import os
+ from typing import List, Union
+ from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline, AutoModel
+ from datasets import load_dataset, Dataset, DatasetDict
+ import shap
+ import wandb
+ import evaluate
+ import logging
+ from codecarbon import EmissionsTracker
+
+
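+ # CodeCarbon tracker: estimates the energy use and carbon emissions of the training
+ # run; results are written to emissions.csv.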
+ tracker = EmissionsTracker()
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ SEED: int = 42
+
+ BATCH_SIZE: int = 16
+ EPOCHS: int = 3
+ SUBSAMPLING: float = 1
+
+ # WandB configuration
+ os.environ["WANDB_PROJECT"] = "DAEDRA final model training"
+ os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints
+
+ dataset = load_dataset("chrisvoncsefalvay/vaers-outcomes")
+
+ if SUBSAMPLING < 1:
+     _ = DatasetDict()
+     for each in dataset.keys():
+         _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))
+
+     dataset = _
+
+ accuracy = evaluate.load("accuracy")
+ precision, recall = evaluate.load("precision"), evaluate.load("recall")
+ f1 = evaluate.load("f1")
+
+ def compute_metrics(eval_pred):
+     predictions, labels = eval_pred
+     predictions = np.argmax(predictions, axis=1)
+     return {
+         'accuracy': accuracy.compute(predictions=predictions, references=labels)["accuracy"],
+         'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')["precision"],
+         'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')["precision"],
+         'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')["recall"],
+         'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')["recall"],
+         'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')["f1"]
+     }
+
+ label_map = {i: label for i, label in enumerate(dataset["test"].features["label"].names)}
+
+ def train_from_model(model_ckpt: str, push: bool = False):
+     print(f"Initialising training based on {model_ckpt}...")
+
+     print("Tokenising...")
+     tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+
+     cols = dataset["train"].column_names
+     cols.remove("label")
+     ds_enc = dataset.map(lambda x: tokenizer(x["text"], truncation=True, max_length=512), batched=True, remove_columns=cols)
+
+     print("Loading model...")
+     try:
+         model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
+                                                                    num_labels=len(dataset["test"].features["label"].names),
+                                                                    id2label=label_map,
+                                                                    label2id={v: k for k, v in label_map.items()})
+     except OSError:
+         model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
+                                                                    num_labels=len(dataset["test"].features["label"].names),
+                                                                    id2label=label_map,
+                                                                    label2id={v: k for k, v in label_map.items()},
+                                                                    from_tf=True)
+
+
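+     # Full-sample training: evaluate every 1,000 steps, checkpoint every 2,000, keep
+     # the two most recent checkpoints, push each save to the Hub, and reload the best
+     # checkpoint by micro-averaged F1 at the end of training.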
+     args = TrainingArguments(
+         output_dir="daedra",
+         evaluation_strategy="steps",
+         eval_steps=1000,
+         save_steps=2000,
+         save_strategy="steps",
+         learning_rate=2e-5,
+         per_device_train_batch_size=BATCH_SIZE,
+         per_device_eval_batch_size=BATCH_SIZE,
+         num_train_epochs=EPOCHS,
+         weight_decay=.01,
+         logging_steps=1,
+         run_name="daedra-full-train",
+         report_to=["wandb", "codecarbon"],
+         save_total_limit=2,
+         load_best_model_at_end=True,
+         push_to_hub=True,
+         push_to_hub_model_id="daedra",
+         hub_strategy="every_save",
+         metric_for_best_model="f1_microaverage")
+
+     trainer = Trainer(
+         model=model,
+         args=args,
+         train_dataset=ds_enc["train"],
+         eval_dataset=ds_enc["test"],
+         tokenizer=tokenizer,
+         compute_metrics=compute_metrics)
+
+     wandb_tag: List[str] = ["full_sample"]
+
+     wandb_tag.append(f"batch_size-{BATCH_SIZE}")
+     wandb_tag.append(f"base:{model_ckpt}")
+
+     if "/" in model_ckpt:
+         sanitised_model_name = model_ckpt.split("/")[1]
+     else:
+         sanitised_model_name = model_ckpt
+
+     wandb.init(name=f"daedra_{SUBSAMPLING}-{sanitised_model_name}", tags=wandb_tag, magic=True)
+
+     print("Starting training...")
+
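+     # Bracket the training loop with the emissions tracker so the measurement covers
+     # training only.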
+     tracker.start()
+     trainer.train()
+     tracker.stop()
+
+     print("Training finished.")
+
+     wandb.finish()
+
+ if __name__ == "__main__":
+     wandb.finish()
+
+     train_from_model("dmis-lab/biobert-base-cased-v1.2")
notebooks/emissions.csv ADDED
@@ -0,0 +1,3 @@
+ timestamp,project_name,run_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,on_cloud,pue
+ 2024-01-29T03:05:13,codecarbon,6bfec408-4fcc-427a-8e94-0cabc9332665,10637.685039520264,0.9110964852888171,8.564800348045516e-05,42.5,148.01654697980965,165.33123922348022,0.1255709600533049,1.8546672673437372,0.4879591008899785,2.468197328287024,United States,USA,virginia,,,Linux-5.15.0-1040-azure-x86_64-with-glibc2.10,3.8.5,2.3.3,24,Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz,4,4 x Tesla V100-PCIE-16GB,-76.8545,37.9273,440.88330459594727,machine,N,1.0
+ 2024-01-29T14:29:57,codecarbon,484aeab5-8bdc-4fbc-8f66-0c204b0f2a,35031.49867606163,3.0209420758007166,8.623502247892816e-05,42.5,148.9286737800754,165.33123922348022,0.413520891640749,6.163406995166088,1.6069267112465608,8.183854598053408,United States,USA,virginia,,,Linux-5.15.0-1040-azure-x86_64-with-glibc2.10,3.8.5,2.3.3,24,Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz,4,4 x Tesla V100-PCIE-16GB,-76.8545,37.9273,440.88330459594727,machine,N,1.0
notebooks/microsample_model_comparison.ipynb ADDED
File without changes
notebooks/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/wandb/.amlignore ADDED
@@ -0,0 +1,6 @@
1
+ ## This file was auto generated by the Azure Machine Learning Studio. Please do not remove.
2
+ ## Read more about the .amlignore file here: https://docs.microsoft.com/azure/machine-learning/how-to-save-write-experiment-files#storage-limits-of-experiment-snapshots
3
+
4
+ .ipynb_aml_checkpoints/
5
+ *.amltmp
6
+ *.amltemp
paper/.gitkeep ADDED
File without changes
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
  {
  "version": "1.0",
- "truncation": null,
+ "truncation": {
+ "direction": "Right",
+ "max_length": 512,
+ "strategy": "LongestFirst",
+ "stride": 0
+ },
  "padding": null,
  "added_tokens": [
  {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f5d7ea168393ffb21085d167d41727aa4ff418441e573afbcbbf468e3ccd8d1c
+ oid sha256:c847edf58c0470c1a32d6d0f580f3c732e43c689025195de8e292f71fbb85be6
  size 4728