{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training and deploying Hugging Face models on Amazon SageMaker\n",
    "\n",
    "* https://huggingface.co/distilbert-base-uncased\n",
    "* https://huggingface.co/transformers/model_doc/distilbert.html\n",
    "* https://huggingface.co/datasets/generated_reviews_enth"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1 - Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%sh\n",
    "pip -q install torch transformers datasets widgetsnbextension ipywidgets huggingface_hub sacremoses==0.0.49"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "import transformers\n",
    "import datasets\n",
    "\n",
    "print(sagemaker.__version__)\n",
    "print(transformers.__version__)\n",
    "print(datasets.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2 - Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "train_dataset, valid_dataset = load_dataset('generated_reviews_enth', split=['train', 'validation'])\n",
    "\n",
    "print(train_dataset.shape)\n",
    "print(valid_dataset.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def map_stars_to_sentiment(row):\n",
    "    return {\n",
    "        'labels': 1 if row['review_star'] >= 4 else 0\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = train_dataset.map(map_stars_to_sentiment)\n",
    "valid_dataset = valid_dataset.map(map_stars_to_sentiment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = train_dataset.flatten()\n",
    "valid_dataset = valid_dataset.flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = train_dataset.remove_columns(['correct', 'translation.th', 'review_star'])\n",
    "valid_dataset = valid_dataset.remove_columns(['correct', 'translation.th', 'review_star'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = train_dataset.rename_column('translation.en', 'text')\n",
    "valid_dataset = valid_dataset.rename_column('translation.en', 'text')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n",
    "\n",
    "def tokenize(batch):\n",
    "    return tokenizer(batch['text'], padding='max_length', truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "json.dumps(train_dataset[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = train_dataset.remove_columns(['text'])\n",
    "valid_dataset = valid_dataset.remove_columns(['text'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3 - Upload data to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets.filesystems import S3FileSystem\n",
    "\n",
    "s3 = S3FileSystem()  \n",
    "\n",
    "s3_prefix = 'hugging-face/sentiment-analysis'\n",
    "bucket = sagemaker.Session().default_bucket()\n",
    "\n",
    "train_input_path = 's3://{}/{}/training'.format(bucket, s3_prefix)\n",
    "train_dataset.save_to_disk(train_input_path, fs=s3)\n",
    "\n",
    "valid_input_path = 's3://{}/{}/validation'.format(bucket, s3_prefix)\n",
    "valid_dataset.save_to_disk(valid_input_path, fs=s3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(train_input_path)\n",
    "print(valid_input_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4 - Fine-tune a Hugging Face model on SageMaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pygmentize train.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hyperparameters={\n",
    "    'epochs': 1,\n",
    "    'train-batch_size': 32,\n",
    "    'model-name':'distilbert-base-uncased'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.huggingface import HuggingFace\n",
    "\n",
    "huggingface_estimator = HuggingFace(\n",
    "    role=sagemaker.get_execution_role(),\n",
    "    # Fine-tuning script\n",
    "    entry_point='train.py',\n",
    "    hyperparameters=hyperparameters,\n",
    "    # Infrastructure\n",
    "    transformers_version='4.10',\n",
    "    pytorch_version='1.9',\n",
    "    py_version='py38',\n",
    "    instance_type='ml.p3.2xlarge',  # 1 GPUs, $4.131/hour in eu-west-1\n",
    "    instance_count=1,\n",
    "    # Enable spot instances\n",
    "    #use_spot_instances=True,        # 70% discount is typical\n",
    "    #max_run = 3600,\n",
    "    #max_wait = 7200\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "huggingface_estimator.fit({'train': train_input_path, 'valid': valid_input_path})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5 - Deploy the model on SageMaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "huggingface_predictor = huggingface_estimator.deploy(\n",
    "    initial_instance_count=1,\n",
    "    instance_type='ml.m5.xlarge')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = {\n",
    "   \"inputs\": \"This is a very nice camera, I'm super happy with it.\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prediction = huggingface_predictor.predict(test_data)\n",
    "print(prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = {\n",
    "   \"inputs\": \"Terrible purchase, I want my money back!\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prediction = huggingface_predictor.predict(test_data)\n",
    "print(prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "huggingface_predictor.delete_endpoint()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6 - Push our model to the Hugging Face hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# In a terminal, login to the Hub with 'huggingface-cli login' and your hub credentials"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a new repo on the Hugging Face hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "repo_name='reviews-sentiment-analysis'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%sh -s $repo_name\n",
    "huggingface-cli repo create -y $1\n",
    "git clone https://huggingface.co/juliensimon/$1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract our model and push files to our hub repo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%sh -s $huggingface_estimator.model_data $repo_name\n",
    "aws s3 cp $1 .\n",
    "tar xvz -C $2 -f model.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%sh -s $repo_name\n",
    "cd $1\n",
    "git add .\n",
    "git commit -m 'Initial version'\n",
    "git push"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grab our model from the hub and work locally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# With the Auto* API\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification \n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained('juliensimon/'+repo_name)\n",
    "model = AutoModelForSequenceClassification.from_pretrained('juliensimon/'+repo_name)\n",
    "\n",
    "# With the pipeline API\n",
    "from transformers import pipeline\n",
    "\n",
    "classifier = pipeline('sentiment-analysis', model='juliensimon/'+repo_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier(\"This is a very nice camera, I'm super happy with it.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier(\"Terrible purchase, I want my money back!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grab our model from the hub and deploy it on a SageMaker endpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.huggingface.model import HuggingFaceModel\n",
    "\n",
    "hub = {\n",
    "  'HF_MODEL_ID':'juliensimon/'+repo_name, \n",
    "  'HF_TASK':'sentiment-analysis'\n",
    "}\n",
    "\n",
    "huggingface_model = HuggingFaceModel(\n",
    "   env=hub,                                                \n",
    "   role=sagemaker.get_execution_role(),                                        \n",
    "   transformers_version='4.10',                           \n",
    "   pytorch_version='1.9',                                \n",
    "   py_version='py38'                                      \n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "huggingface_predictor = huggingface_model.deploy(\n",
    "   initial_instance_count=1,\n",
    "   instance_type='ml.m5.xlarge'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = {\n",
    "   'inputs': \"This is a very nice camera, I'm super happy with it.\"\n",
    "}\n",
    "\n",
    "prediction = huggingface_predictor.predict(test_data)\n",
    "print(prediction)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "huggingface_predictor.delete_endpoint()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "instance_type": "ml.m5.4xlarge",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}