# %% Importing the dependencies we need
import numpy as np
import torch
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix, 
                            ConfusionMatrixDisplay, classification_report)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from skops import card, hub_utils
from skorch import NeuralNetClassifier
from skorch.callbacks import LRScheduler, ProgressBar
from skorch.hf import HuggingfacePretrainedTokenizer
from torch import nn
from torch.optim.lr_scheduler import LambdaLR
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
# for model hosting and requirements
from pathlib import Path
import transformers
import skorch
import sklearn

# %%
# Choose a tokenizer and BERT model that work together
TOKENIZER = "distilbert-base-uncased"
PRETRAINED_MODEL = "distilbert-base-uncased"

# model hyper-parameters
OPTIMIZER = torch.optim.AdamW
LR = 5e-5
MAX_EPOCHS = 3
CRITERION = nn.CrossEntropyLoss
BATCH_SIZE = 8

# device
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# %% Load the dataset, define features & labels and split
dataset = fetch_20newsgroups()

print(dataset.DESCR.split('Usage')[0])

print(dataset.target_names)

X = dataset.data
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
num_training_steps = MAX_EPOCHS * (len(X_train) // BATCH_SIZE + 1)

# %%
# Define a linear learning rate schedule and wrap the pretrained BERT model in an nn.Module

def lr_schedule(current_step):
    factor = float(num_training_steps - current_step) / float(max(1, num_training_steps))
    assert factor > 0
    return factor

class BertModule(nn.Module):
    def __init__(self, name, num_labels):
        super().__init__()
        self.name = name
        self.num_labels = num_labels
        
        self.reset_weights()
        
    def reset_weights(self):
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            self.name, num_labels=self.num_labels
        )
        
    def forward(self, **kwargs):
        pred = self.bert(**kwargs)
        return pred.logits
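
# %% (Optional) Sanity-check the linear schedule defined above. This is only a
# quick sketch: lr_schedule should return 1.0 at step 0 and decay linearly
# towards 0 over the course of training.
for step in (0, num_training_steps // 2, num_training_steps - 1):
    print(f"step {step}: lr factor = {lr_schedule(step):.4f}")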

# %% Chaining tokenizer and BERT in one pipeline
pipeline = Pipeline([
    ('tokenizer', HuggingfacePretrainedTokenizer(TOKENIZER)),
    ('net', NeuralNetClassifier(
        BertModule,
        module__name=PRETRAINED_MODEL,
        module__num_labels=len(set(y_train)),
        optimizer=OPTIMIZER,
        lr=LR,
        max_epochs=MAX_EPOCHS,
        criterion=CRITERION,
        batch_size=BATCH_SIZE,
        iterator_train__shuffle=True,
        device=DEVICE,
        callbacks=[
            LRScheduler(LambdaLR, lr_lambda=lr_schedule, step_every='batch'),
            ProgressBar(),
        ],
    )),
])

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)

# %% Training
%time pipeline.fit(X_train, y_train)

# %% Evaluate the model
%%time
with torch.inference_mode():
    y_pred = pipeline.predict(X_test)

accuracy_score(y_test, y_pred)
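
# %% (Optional) Quick inference sketch on a single made-up sentence (not from
# the dataset); it shows how to map a predicted class index back to a
# human-readable newsgroup name.
sample = ["The new graphics card renders 3D scenes much faster than the old one."]
predicted_idx = pipeline.predict(sample)[0]
print(dataset.target_names[predicted_idx])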

# %% Save the model
import pickle
with open("model.pkl", mode="wb") as f:
    pickle.dump(pipeline, file=f)
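
# %% (Optional) A minimal round-trip check, assuming "model.pkl" was written in
# the previous cell: reload the pickled pipeline and confirm it still predicts.
with open("model.pkl", mode="rb") as f:
    loaded_pipeline = pickle.load(f)
print(loaded_pipeline.predict(X_test[:2]))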

# %% Initialize the repository for Hub
local_repo = "model_repo"
hub_utils.init(
    model="model.pkl",
    requirements=[f"scikit-learn={sklearn.__version__}", f"transformers={transformers.__version__}",
                  f"torch={torch.__version__}", f"skorch={skorch.__version__}"],
    dst=local_repo,
    task="text-classification",
    data=X_test,
)
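
# %% (Optional) A quick look at what hub_utils.init created in the local
# repository folder; this simply lists the generated files.
print(sorted(p.name for p in Path(local_repo).iterdir()))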

# %% Create model card
model_card = card.Card(pipeline, metadata=card.metadata_from_config(Path("model_repo")))

# %% We will add information related to the model
model_description = (
    "This is a DistilBERT model fine-tuned with a skorch NeuralNetClassifier inside an"
    " sklearn Pipeline, trained on the 20 newsgroups dataset."
)
limitations = "This model is trained for a tutorial and is not ready to be used in production."
model_card.add(
    model_description=model_description,
    limitations=limitations
)

# %% We can add plots, evaluation results and more!
eval_descr = (
    "The model is evaluated on a held-out split of the 20 newsgroups training data,"
    " using accuracy and F1-score with micro average."
)
model_card.add(eval_method=eval_descr)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="micro")
model_card.add_metrics(**{"accuracy": accuracy, "f1 score": f1})


cm = confusion_matrix(y_test, y_pred, labels=pipeline.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipeline.classes_)
disp.plot()

disp.figure_.savefig(Path(local_repo) / "confusion_matrix.png")
model_card.add_plot(**{"Confusion matrix": "confusion_matrix.png"})

clf_report = classification_report(
    y_test, y_pred, output_dict=True, target_names=dataset.target_names
)
# %% We can add the classification report as a table
# We first convert the report to a DataFrame so it can be added as a table
import pandas as pd
del clf_report["accuracy"]
clf_report = pd.DataFrame(clf_report).T.reset_index()
model_card.add_table(
    folded=True,
    **{
        "Classification Report": clf_report,
    },
)

# %% We will save our model card
model_card.save(Path(local_repo) / "README.md")

# %% We will add the training script to our repository
hub_utils.add_files(__file__, dst=local_repo) 

# %% Push to Hub! This requires us to authenticate ourselves first.
from huggingface_hub import notebook_login
notebook_login()

hub_utils.push(
    repo_id="scikit-learn/skorch-text-classification",
    source=local_repo,
    create_remote=True,
)
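
# %% (Optional) Sketch of consuming the pushed model later, assuming the push
# above succeeded: hf_hub_download fetches the pickled pipeline from the Hub,
# which can then be loaded exactly like the local file.
from huggingface_hub import hf_hub_download

downloaded_path = hf_hub_download(
    repo_id="scikit-learn/skorch-text-classification", filename="model.pkl"
)
with open(downloaded_path, mode="rb") as f:
    remote_pipeline = pickle.load(f)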