{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Training and deploying Hugging Face models on Amazon SageMaker\n", "\n", "* https://huggingface.co/distilbert-base-uncased\n", "* https://huggingface.co/transformers/model_doc/distilbert.html\n", "* https://huggingface.co/datasets/generated_reviews_enth" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1 - Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%sh\n", "pip -q install torch transformers datasets widgetsnbextension ipywidgets huggingface_hub sacremoses==0.0.49" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sagemaker\n", "import transformers\n", "import datasets\n", "\n", "print(sagemaker.__version__)\n", "print(transformers.__version__)\n", "print(datasets.__version__)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2 - Preprocessing" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "\n", "train_dataset, valid_dataset = load_dataset('generated_reviews_enth', split=['train', 'validation'])\n", "\n", "print(train_dataset.shape)\n", "print(valid_dataset.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def map_stars_to_sentiment(row):\n", " return {\n", " 'labels': 1 if row['review_star'] >= 4 else 0\n", " }" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset = train_dataset.map(map_stars_to_sentiment)\n", "valid_dataset = valid_dataset.map(map_stars_to_sentiment)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset = train_dataset.flatten()\n", "valid_dataset = valid_dataset.flatten()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset = train_dataset.remove_columns(['correct', 'translation.th', 'review_star'])\n", "valid_dataset = valid_dataset.remove_columns(['correct', 'translation.th', 'review_star'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset = train_dataset.rename_column('translation.en', 'text')\n", "valid_dataset = valid_dataset.rename_column('translation.en', 'text')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenize" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n", "\n", "def tokenize(batch):\n", " return tokenizer(batch['text'], padding='max_length', truncation=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "json.dumps(train_dataset[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_dataset = train_dataset.remove_columns(['text'])\n", "valid_dataset = valid_dataset.remove_columns(['text'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3 - Upload data to S3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets.filesystems import S3FileSystem\n", "\n", "s3 = S3FileSystem() \n", "\n", "s3_prefix = 'hugging-face/sentiment-analysis'\n", "bucket = sagemaker.Session().default_bucket()\n", "\n", "train_input_path = 's3://{}/{}/training'.format(bucket, s3_prefix)\n", "train_dataset.save_to_disk(train_input_path, fs=s3)\n", "\n", "valid_input_path = 's3://{}/{}/validation'.format(bucket, s3_prefix)\n", "valid_dataset.save_to_disk(valid_input_path, fs=s3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(train_input_path)\n", "print(valid_input_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4 - Fine-tune a Hugging Face model on SageMaker" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pygmentize train.py" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hyperparameters={\n", " 'epochs': 1,\n", " 'train-batch_size': 32,\n", " 'model-name':'distilbert-base-uncased'\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.huggingface import HuggingFace\n", "\n", "huggingface_estimator = HuggingFace(\n", " role=sagemaker.get_execution_role(),\n", " # Fine-tuning script\n", " entry_point='train.py',\n", " hyperparameters=hyperparameters,\n", " # Infrastructure\n", " transformers_version='4.10',\n", " pytorch_version='1.9',\n", " py_version='py38',\n", " instance_type='ml.p3.2xlarge', # 1 GPUs, $4.131/hour in eu-west-1\n", " instance_count=1,\n", " # Enable spot instances\n", " #use_spot_instances=True, # 70% discount is typical\n", " #max_run = 3600,\n", " #max_wait = 7200\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "huggingface_estimator.fit({'train': train_input_path, 'valid': valid_input_path})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5 - Deploy the model on SageMaker" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "huggingface_predictor = huggingface_estimator.deploy(\n", " initial_instance_count=1,\n", " instance_type='ml.m5.xlarge')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_data = {\n", " \"inputs\": \"This is a very nice camera, I'm super happy with it.\"\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "prediction = huggingface_predictor.predict(test_data)\n", "print(prediction)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_data = {\n", " \"inputs\": \"Terrible purchase, I want my money back!\"\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "prediction = huggingface_predictor.predict(test_data)\n", "print(prediction)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "huggingface_predictor.delete_endpoint()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6 - Push our model to the Hugging Face hub" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# In a terminal, login to the Hub with 'huggingface-cli login' and your hub credentials" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a new repo on the Hugging Face hub" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "repo_name='reviews-sentiment-analysis'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%sh -s $repo_name\n", "huggingface-cli repo create -y $1\n", "git clone https://huggingface.co/juliensimon/$1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extract our model and push files to our hub repo" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%sh -s $huggingface_estimator.model_data $repo_name\n", "aws s3 cp $1 .\n", "tar xvz -C $2 -f model.tar.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%sh -s $repo_name\n", "cd $1\n", "git add .\n", "git commit -m 'Initial version'\n", "git push" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grab our model from the hub and work locally" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# With the Auto* API\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification \n", "\n", "tokenizer = AutoTokenizer.from_pretrained('juliensimon/'+repo_name)\n", "model = AutoModelForSequenceClassification.from_pretrained('juliensimon/'+repo_name)\n", "\n", "# With the pipeline API\n", "from transformers import pipeline\n", "\n", "classifier = pipeline('sentiment-analysis', model='juliensimon/'+repo_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifier(\"This is a very nice camera, I'm super happy with it.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classifier(\"Terrible purchase, I want my money back!\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grab our model from the hub and deploy it on a SageMaker endpoint" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.huggingface.model import HuggingFaceModel\n", "\n", "hub = {\n", " 'HF_MODEL_ID':'juliensimon/'+repo_name, \n", " 'HF_TASK':'sentiment-analysis'\n", "}\n", "\n", "huggingface_model = HuggingFaceModel(\n", " env=hub, \n", " role=sagemaker.get_execution_role(), \n", " transformers_version='4.10', \n", " pytorch_version='1.9', \n", " py_version='py38' \n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "huggingface_predictor = huggingface_model.deploy(\n", " initial_instance_count=1,\n", " instance_type='ml.m5.xlarge'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_data = {\n", " 'inputs': \"This is a very nice camera, I'm super happy with it.\"\n", "}\n", "\n", "prediction = huggingface_predictor.predict(test_data)\n", "print(prediction)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "huggingface_predictor.delete_endpoint()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "instance_type": "ml.m5.4xlarge", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }