# Training and deploying Hugging Face models on Amazon SageMaker

* https://huggingface.co/distilbert-base-uncased
* https://huggingface.co/transformers/model_doc/distilbert.html
* https://huggingface.co/datasets/generated_reviews_enth

# 1 - Setup

In [None]:
%%sh
# Install the libraries this notebook uses; -q keeps pip's output quiet.
# Only sacremoses is pinned (0.0.49); every other package is left unpinned.
pip -q install torch transformers datasets widgetsnbextension ipywidgets huggingface_hub sacremoses==0.0.49

In [None]:
import sagemaker
import transformers
import datasets

# Report the installed version of each library, one per line.
print(
    sagemaker.__version__,
    transformers.__version__,
    datasets.__version__,
    sep='\n',
)

# 2 - Preprocessing

In [None]:
from datasets import load_dataset

# Request the 'train' and 'validation' splits in a single call and unpack
# them into one variable per requested split.
train_dataset, valid_dataset = load_dataset('generated_reviews_enth', split=['train', 'validation'])

# Print the shape of each split as a quick sanity check.
print(train_dataset.shape)
print(valid_dataset.shape)

In [None]:
# Show the first training example as loaded, before any preprocessing.
train_dataset[0]

In [None]:
def map_stars_to_sentiment(row):
    """Turn a review's star rating into a binary sentiment label.

    Args:
        row: A mapping that contains a ``review_star`` entry.

    Returns:
        A dict with a single ``labels`` key: 1 when ``review_star`` is
        4 or higher, 0 otherwise.
    """
    if row['review_star'] >= 4:
        return {'labels': 1}
    return {'labels': 0}

In [None]:
# Run map_stars_to_sentiment over both splits so that each row gets the
# binary `labels` value derived from its star rating.
train_dataset = train_dataset.map(map_stars_to_sentiment)
valid_dataset = valid_dataset.map(map_stars_to_sentiment)

In [None]:
# Show the first training example again, after the label mapping.
train_dataset[0]

In [None]:
# Flatten both splits; the following cells address the resulting dotted
# column names ('translation.en', 'translation.th').
train_dataset = train_dataset.flatten()
valid_dataset = valid_dataset.flatten()

In [None]:
# Show the first training example after flattening.
train_dataset[0]

In [None]:
# Drop 'correct', 'translation.th' and 'review_star' from both splits.
# 'review_star' has already been converted into `labels`, and the rest of
# the notebook only works with the 'translation.en' column.
train_dataset = train_dataset.remove_columns(['correct', 'translation.th', 'review_star'])
valid_dataset = valid_dataset.remove_columns(['correct', 'translation.th', 'review_star'])

In [None]:
# Rename 'translation.en' to 'text', the column the tokenize function reads.
train_dataset = train_dataset.rename_column('translation.en', 'text')
valid_dataset = valid_dataset.rename_column('translation.en', 'text')

In [None]:
# Show the first training example after the column cleanup.
train_dataset[0]

## Tokenize

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for 'distilbert-base-uncased', the same model name that
# is passed to the training job through the hyperparameters.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize the ``text`` entries of a batch with the notebook's tokenizer.

    The tokenizer is called with ``padding='max_length'`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = batch['text']
    return tokenizer(texts, padding='max_length', truncation=True)

In [None]:
# Tokenize the training split. batch_size=len(train_dataset) passes the whole
# split to `tokenize` as a single batch.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))

In [None]:
# Tokenize the validation split. batch_size=len(valid_dataset) passes the
# whole split to `tokenize` as a single batch.
valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))

In [None]:
import json

# Serialize the first tokenized training example to a JSON string so it can
# be inspected in the cell output.
json.dumps(train_dataset[0])

In [None]:
# Drop the raw 'text' column from both splits now that they are tokenized.
train_dataset = train_dataset.remove_columns(['text'])
valid_dataset = valid_dataset.remove_columns(['text'])

# 3 - Upload data to S3

In [None]:
# NOTE(review): `datasets` is installed unpinned in the setup cell. Newer
# releases are reported to deprecate `datasets.filesystems.S3FileSystem` and
# the `fs=` argument of save_to_disk in favour of `storage_options` --
# confirm against the installed version.
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem() 

# Both splits are written under this prefix in the SageMaker session's
# default bucket.
s3_prefix = 'hugging-face/sentiment-analysis'
bucket = sagemaker.Session().default_bucket()

# Training split -> s3://<bucket>/<prefix>/training
train_input_path = 's3://{}/{}/training'.format(bucket, s3_prefix)
train_dataset.save_to_disk(train_input_path, fs=s3)

# Validation split -> s3://<bucket>/<prefix>/validation
valid_input_path = 's3://{}/{}/validation'.format(bucket, s3_prefix)
valid_dataset.save_to_disk(valid_input_path, fs=s3)

In [None]:
# Show the S3 locations of both splits, one per line.
print(train_input_path, valid_input_path, sep='\n')

# 4 - Fine-tune a Hugging Face model on SageMaker

In [None]:
# Display train.py, the script used as the training job's entry point.
!pygmentize train.py

In [None]:
# Hyperparameters handed to the training job through the estimator's
# `hyperparameters` argument.
# NOTE(review): 'train-batch_size' mixes a hyphen and an underscore, unlike
# 'model-name' -- confirm the spelling matches the argument train.py parses.
hyperparameters={
 'epochs': 1,
 'train-batch_size': 32,
 'model-name':'distilbert-base-uncased'
}

In [None]:
from sagemaker.huggingface import HuggingFace

# Configure the SageMaker training job: it runs train.py with the
# `hyperparameters` dict on a single ml.p3.2xlarge instance.
huggingface_estimator = HuggingFace(
 role=sagemaker.get_execution_role(),
 # Fine-tuning script and the hyperparameters passed to it
 entry_point='train.py',
 hyperparameters=hyperparameters,
 # Infrastructure
 transformers_version='4.10',
 pytorch_version='1.9',
 py_version='py38',
 instance_type='ml.p3.2xlarge', # 1 GPU, $4.131/hour in eu-west-1
 instance_count=1,
 # Uncomment the three settings below to train on spot instances
 #use_spot_instances=True, # 70% discount is typical
 #max_run = 3600,
 #max_wait = 7200
)

In [None]:
# Start the training job. The 'train' and 'valid' channels point at the S3
# locations the two splits were saved to.
huggingface_estimator.fit({'train': train_input_path, 'valid': valid_input_path})

# 5 - Deploy the model on SageMaker

In [None]:
# Deploy the fine-tuned model to an endpoint backed by one ml.m5.xlarge
# instance; the returned predictor is used to query it.
huggingface_predictor = huggingface_estimator.deploy(
 initial_instance_count=1,
 instance_type='ml.m5.xlarge')

In [None]:
# Request payload holding a positive review.
test_data = {
 "inputs": "This is a very nice camera, I'm super happy with it."
}

In [None]:
# Send the payload to the endpoint and print the response.
prediction = huggingface_predictor.predict(test_data)
print(prediction)

In [None]:
# Request payload holding a negative review.
test_data = {
 "inputs": "Terrible purchase, I want my money back!"
}

In [None]:
# Send the second payload to the endpoint and print the response.
prediction = huggingface_predictor.predict(test_data)
print(prediction)

In [None]:
# Delete the endpoint now that the test predictions are done.
huggingface_predictor.delete_endpoint()

# 6 - Push our model to the Hugging Face hub

In [None]:
# In a terminal, log in to the Hub with 'huggingface-cli login' and your Hub credentials

## Create a new repo on the Hugging Face hub

In [None]:
# Name of the Hub repository; reused by the shell cells and the model IDs
# in the rest of this section.
repo_name='reviews-sentiment-analysis'

In [None]:
%%sh -s $repo_name
# Argument 1 is the repository name supplied on the magic line.
# NOTE: the clone URL hard-codes the 'juliensimon' namespace; change it to
# match the Hub account the repository is created under.
huggingface-cli repo create -y $1
git clone https://huggingface.co/juliensimon/$1

## Extract our model and push files to our hub repo

In [None]:
%%sh -s $huggingface_estimator.model_data $repo_name
# Argument 1 is the S3 URI of the trained model archive, argument 2 is the
# local repository directory to extract it into.
# Stop at the first failure so that a failed download is never followed by
# extracting a stale model.tar.gz left over in the working directory.
set -e
aws s3 cp "$1" .
tar xvz -C "$2" -f model.tar.gz

In [None]:
%%sh -s $repo_name
# Argument 1 is the local clone of the Hub repository.
# Abort if the directory cannot be entered: otherwise the git commands below
# would run against whatever repository contains the current directory.
cd "$1" || exit 1
git add .
git commit -m 'Initial version'
git push

## Grab our model from the hub and work locally

In [None]:
# Option 1: load the tokenizer and the model explicitly with the Auto* API.
# NOTE: this rebinds the notebook-level `tokenizer` name used earlier for
# preprocessing. The 'juliensimon/' prefix is a hard-coded Hub namespace;
# adjust it when the repository lives under another account.
from transformers import AutoTokenizer, AutoModelForSequenceClassification 

tokenizer = AutoTokenizer.from_pretrained('juliensimon/'+repo_name)
model = AutoModelForSequenceClassification.from_pretrained('juliensimon/'+repo_name)

# Option 2: build a ready-to-use classifier with the pipeline API.
from transformers import pipeline

classifier = pipeline('sentiment-analysis', model='juliensimon/'+repo_name)

In [None]:
# Classify a positive review with the local pipeline.
classifier("This is a very nice camera, I'm super happy with it.")

In [None]:
# Classify a negative review with the local pipeline.
classifier("Terrible purchase, I want my money back!")

## Grab our model from the hub and deploy it on a SageMaker endpoint

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# Settings handed to the model through `env`: the Hub model ID to load and
# the task name. The 'juliensimon/' prefix is a hard-coded Hub namespace.
hub = {
 'HF_MODEL_ID':'juliensimon/'+repo_name, 
 'HF_TASK':'sentiment-analysis'
}

# Model definition that uses the same framework versions as the training job.
huggingface_model = HuggingFaceModel(
 env=hub, 
 role=sagemaker.get_execution_role(), 
 transformers_version='4.10', 
 pytorch_version='1.9', 
 py_version='py38' 
)

In [None]:
# Deploy the Hub-hosted model to an endpoint backed by one ml.m5.xlarge
# instance; this rebinds `huggingface_predictor` to the new endpoint.
huggingface_predictor = huggingface_model.deploy(
 initial_instance_count=1,
 instance_type='ml.m5.xlarge'
)

In [None]:
# Send a sample review to the new endpoint and print the response.
test_data = {
 'inputs': "This is a very nice camera, I'm super happy with it."
}

prediction = huggingface_predictor.predict(test_data)
print(prediction)

In [None]:
# Delete the second endpoint now that the test prediction is done.
huggingface_predictor.delete_endpoint()