Edit model card

Model Card for AC/MiniLM-L12-H384-uncased_Nvidia-Aegis-AI-Safety

A microsoft/MiniLM-L12-H384-uncased model fine-tuned on the nvidia/Aegis-AI-Content-Safety-Dataset-1.0 dataset. A total of 3099 examples are in the training set.

This is a multi-label text classifier that has 14 categories:

  • "0": "Controlled/Regulated Substances"
  • "1": "Criminal Planning/Confessions"
  • "2": "Deception/Fraud"
  • "3": "Guns and Illegal Weapons"
  • "4": "Harassment"
  • "5": "Hate/Identity Hate"
  • "6": "Needs Caution"
  • "7": "PII/Privacy"
  • "8": "Profanity"
  • "9": "Sexual"
  • "10": "Sexual (minor)"
  • "11": "Suicide and Self Harm"
  • "12": "Threat"
  • "13": "Violence"

How to Get Started with the Model

from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import torch

# Let Accelerate choose the device; `device` is handed to load_model() and
# predict() in the "USING GPU" example further down.
accelerator = Accelerator()
device = accelerator.device

def load_model(model_path, accelerator_device=None):
    """Load the multi-label classifier and its tokenizer from ``model_path``.

    Args:
        model_path: Model identifier or path passed to ``from_pretrained``.
        accelerator_device: Optional device. When truthy, the model is moved
            to it; otherwise the model stays where ``from_pretrained`` put it.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_path, problem_type="multi_label_classification"
    )
    if accelerator_device:
        classifier.to(accelerator_device)

    return classifier, AutoTokenizer.from_pretrained(model_path)

def predict(model, tokenizer, text, accelerator_device=None, threshold=0.5):
    """Return the names of every category the model assigns to ``text``.

    Args:
        model: Sequence-classification model whose ``config.id2label`` maps
            class indices to category names.
        tokenizer: Tokenizer matching ``model``.
        text: A single input string.
        accelerator_device: Optional device. When truthy, the tokenized inputs
            are moved to it (the model must already live on the same device).
        threshold: Minimum sigmoid probability for a category to be reported.

    Returns:
        List of category names whose probability is ``>= threshold``; empty
        when no category reaches the threshold.
    """
    # Truncate to the same 512-token limit used during fine-tuning so that
    # over-long inputs are clipped instead of overrunning the model.
    inputs = tokenizer([text], return_tensors="pt", truncation=True, max_length=512)
    if accelerator_device:
        inputs = inputs.to(accelerator_device)

    # Inference only: no need to record the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Exactly one text was submitted, so take batch row 0 explicitly rather
    # than relying on squeeze() to drop the right dimension.
    probs = torch.sigmoid(outputs.logits[0]).cpu()
    selected = (probs >= threshold).tolist()
    return [model.config.id2label[idx] for idx, keep in enumerate(selected) if keep]

# USING CPU
# No device argument is given, so load_model() and predict() skip their
# .to(...) calls.
hf_model, tokenizer = load_model("AC/MiniLM-L12-H384-uncased_Nvidia-Aegis-AI-Safety")
# predict() returns the list of category names whose probability is >= the
# threshold (default 0.5); the return value is not stored or printed here.
predict(hf_model, tokenizer, "How to make a bomb?")

# USING GPU
# `device` is the Accelerator device created above; it is passed to both
# helpers so the model and the tokenized inputs end up on the same device.
hf_model, tokenizer = load_model("AC/MiniLM-L12-H384-uncased_Nvidia-Aegis-AI-Safety", device)
predict(hf_model, tokenizer, "How to make a bomb?", device)

Evaluation

Evaluation is conducted on the test set of the nvidia/Aegis-AI-Content-Safety-Dataset-1.0 dataset, which contains 359 examples.

For an AI safety use case, false negatives (text that was actually toxic but was predicted as safe) are worse than false positives (text that was actually safe but was predicted as unsafe).

Precision: out of all text predicted as toxic, how much was actually toxic?

Recall: out of all text that was actually toxic, how much was predicted as toxic?

As we want to reduce false negatives, we will focus on recall.

Metric Value
accuracy 0.9514524472741743
f1 0.5325670498084292
precision 0.668269230769
recall 0.442675159235668
TP 139
TN 4643
FP 69
FN 175

Finetuning

from accelerate import Accelerator
from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, EvalPrediction, DataCollatorWithPadding
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, coverage_error

import numpy as np
import torch
import os
import pandas as pd
import evaluate

# Let Accelerate choose the device. `device` is read as a module-level global
# by predict() and is passed explicitly to the load_model() calls below.
accelerator = Accelerator()
device = accelerator.device

def load_model(model_path, accelerator_device):
    """Load a multi-label classifier and tokenizer, then move the model.

    The classification head is configured from the module-level
    ``all_labels``, ``id2label`` and ``label2id``, which must be defined
    before this function is called.

    Args:
        model_path: Model identifier or path passed to ``from_pretrained``.
        accelerator_device: Device the model is moved to.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    label_config = {
        "num_labels": len(all_labels),
        "id2label": id2label,
        "label2id": label2id,
    }
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        problem_type="multi_label_classification",
        **label_config,
    )
    classifier.to(accelerator_device)

    return classifier, AutoTokenizer.from_pretrained(model_path)

def predict(model, tokenizer, text, threshold=0.5):
    """Return the names of every category the model assigns to ``text``.

    Relies on the module-level ``device`` (where the inputs are sent) and
    ``id2label`` (class index -> category name).

    Args:
        model: Sequence-classification model already placed on ``device``.
        tokenizer: Tokenizer matching ``model``.
        text: A single input string.
        threshold: Minimum sigmoid probability for a category to be reported.

    Returns:
        List of category names whose probability is ``>= threshold``; empty
        when no category reaches the threshold.
    """
    # Same truncation settings as tokenize_text(), so inference sees inputs
    # clipped exactly like the training examples were.
    inputs = tokenizer(
        [text], return_tensors="pt", truncation=True, max_length=512
    ).to(device)

    # Inference only: no need to record the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Exactly one text was submitted, so take batch row 0 explicitly rather
    # than relying on squeeze() to drop the right dimension.
    probs = torch.sigmoid(outputs.logits[0]).cpu()
    selected = (probs >= threshold).tolist()
    return [id2label[idx] for idx, keep in enumerate(selected) if keep]

def tokenize_text(examples):
    """Attach a multi-hot ``labels`` vector to a record and tokenize its text.

    Reads one column per entry of the module-level ``all_labels`` and stores
    the values, in that order, as a float array under ``examples["labels"]``.
    Uses the module-level ``tokenizer``.

    Args:
        examples: Mapping holding a ``"text"`` field plus one field per label
            name (presumably a single dataset row, since the caller uses
            ``dataset.map`` without batching).

    Returns:
        The tokenizer output for ``examples["text"]``, truncated to 512 tokens.
    """
    examples["labels"] = np.array(
        [examples[name] for name in all_labels], dtype=np.float64
    )
    return tokenizer(examples["text"], truncation=True, max_length=512)


### Data Preprocessing

# Category names; a name's position in this list is its class id.
all_labels = [
    "Controlled/Regulated Substances",
    "Criminal Planning/Confessions",
    "Deception/Fraud",
    "Guns and Illegal Weapons",
    "Harassment",
    "Hate/Identity Hate",
    "Needs Caution",
    "PII/Privacy",
    "Profanity",
    "Sexual",
    "Sexual (minor)",
    "Suicide and Self Harm",
    "Threat",
    "Violence",
]

# Both directions of the id <-> name mapping, as consumed by load_model().
id2label = dict(enumerate(all_labels))
label2id = {name: idx for idx, name in id2label.items()}

# Base checkpoint to fine-tune, plus the tokenizer that tokenize_text() uses.
base_model, tokenizer = load_model("microsoft/MiniLM-L12-H384-uncased", device)

train_df = pd.read_csv("nvidia_train.csv")
test_df = pd.read_csv("nvidia_test.csv")

dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(train_df),
        "test": Dataset.from_pandas(test_df),
    }
)

# Adds the "labels" vector and the tokenizer outputs to every record.
preprocessed_dataset = dataset.map(tokenize_text)


### Metrics for multi-label classification

# One combined metric object so that compute_metrics() gets accuracy, F1,
# precision and recall from a single compute() call.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

    Computed in a numerically stable way: the naive formula evaluates
    ``exp(-x)``, which overflows (emitting a RuntimeWarning) for large
    negative inputs.

    Args:
        x: Scalar or array-like of logits.

    Returns:
        Values in ``[0, 1]`` with the same shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1]: it may underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function,
    # written so that only the non-overflowing exponential is needed.
    # [()] turns a 0-d result back into a NumPy scalar; arrays are unaffected.
    return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))[()]

def compute_metrics(eval_pred):
    """Score thresholded predictions against the reference labels.

    Logits are passed through ``sigmoid`` and binarized at 0.5 (strictly
    greater than). Predictions and labels are then flattened, so every
    (example, category) pair counts as one binary decision in the metrics.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of arrays.

    Returns:
        Whatever ``clf_metrics.compute`` returns for the flattened arrays.
    """
    logits, labels = eval_pred
    flat_predictions = (sigmoid(logits) > 0.5).astype(int).reshape(-1)
    flat_references = labels.astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=flat_predictions, references=flat_references
    )


### Finetuning

# tokenize_text() does not pad, so padding is left to the collator at batch
# time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

output_dir = f'./minilm_finetuned/minilm-{datetime.now().strftime("%d-%m-%Y_%H-%M")}' # Output directory where the training checkpoints will be stored

final_output_dir = './minilm_finetuned' # Best model from trainer will be saved here

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=20,
    weight_decay=0.01,
    # Request mixed precision only when Accelerate picked a non-CPU device:
    # an unconditional fp16=True is rejected by Transformers on CPU-only
    # machines before training even starts.
    fp16=device.type != "cpu",
    # NOTE(review): newer Transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    # Must match the evaluation schedule for load_best_model_at_end to work.
    save_strategy="epoch",
    save_total_limit=2,
    # No metric_for_best_model is set, so the Trainer's default selection
    # criterion applies (presumably eval loss -- confirm in the docs).
    load_best_model_at_end=True,
)

# NOTE(review): eval_dataset is the "test" split, so checkpoint selection
# uses the same data that the reported evaluation is computed on.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=preprocessed_dataset["train"],
    eval_dataset=preprocessed_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

print("Saving model...")
trainer.save_model(final_output_dir)


### Evaluate model
# Reload the model saved by trainer.save_model() from final_output_dir
# (rebinding base_model and tokenizer) and run one sample prediction.
base_model, tokenizer = load_model(final_output_dir, device)
predict(base_model, tokenizer, "How to make a bomb?")
Downloads last month
27,590
Safetensors
Model size
33.4M params
Tensor type
F32
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Dataset used to train AC/MiniLM-L12-H384-uncased_Nvidia-Aegis-AI-Safety