In [1]:
# Install the notebook's dependencies into the running kernel's environment
# (`%pip` targets the kernel's interpreter). Restart the kernel afterwards so
# that freshly installed versions are the ones that get imported.
%pip install torch intel_extension_for_pytorch
%pip install transformers datasets accelerate peft
# NOTE(review): bitsandbytes and bigdl-llm are not imported by any cell visible
# in this notebook - confirm they are still required.
%pip install bitsandbytes bigdl-llm 
%pip install -U pandas
# Pin setuptools to a fixed version.
# NOTE(review): the issue this pin works around is not recorded here.
%pip install setuptools==69.5.1

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import sys
import logging
import warnings
from pathlib import Path

import torch
import intel_extension_for_pytorch as ipex
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)
from peft import LoraConfig, get_peft_model

# Constants
BASE_MODEL = "mistralai/Mistral-7B-v0.1"
DATA_PATH = "b-mc2/sql-create-context"
MODEL_PATH = "./final_model"
ADAPTER_PATH = "./lora_adapters"
DEVICE = torch.device("xpu" if torch.xpu.is_available() else "cpu")
MODEL_CACHE_PATH = "/home/common/data/Big_Data/GenAI/llm_models"

# Weights & Biases Configuration
ENABLE_WANDB = False

if ENABLE_WANDB:
    print("installing wandb...")
    !{sys.executable} -m pip install -U --force "wandb==0.15.12" > /dev/null 2>&1
    print("installation complete...")

    import wandb
    os.environ["WANDB_NOTEBOOK_NAME"] = os.path.abspath('')
    os.environ["WANDB_PROJECT"] = f"finetune-model-name_{BASE_MODEL.replace('/', '_')}"
    os.environ["WANDB_LOG_MODEL"] = "checkpoint"
    wandb.login()

# Configuration
warnings.filterwarnings("ignore", category=UserWarning)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["NUMEXPR_MAX_THREADS"] = "28"
os.environ["ENABLE_SDP_FUSION"] = "true"
os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"] = "1"

logging.getLogger("transformers").setLevel(logging.ERROR)

2024-06-23 08:23:18,051 - datasets - INFO - PyTorch version 2.1.0.post0+cxx11.abi available.


In [3]:
def setup_model_and_tokenizer(base_model_id: str):
    """Load the base model and tokenizer for training.

    A copy under ``MODEL_CACHE_PATH`` (directory name is the model id with
    "/" replaced by "--") is tried first. If that raises ``OSError`` or
    ``PermissionError``, the model id itself is handed to ``from_pretrained``.

    Parameters:
        base_model_id (str): Model identifier, e.g. "mistralai/Mistral-7B-v0.1".

    Returns:
        tuple: ``(model, tokenizer)``. The model is loaded in bfloat16; the
        tokenizer pads on the left with token id 0.
    """
    local_model_path = os.path.join(MODEL_CACHE_PATH, base_model_id.replace("/", "--"))

    def _load(source):
        # Same loading options for both the local copy and the fallback.
        loaded_model = AutoModelForCausalLM.from_pretrained(
            source,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
        )
        return loaded_model, AutoTokenizer.from_pretrained(source)

    try:
        print(f"Attempting to load model and tokenizer from: {local_model_path}")
        model, tokenizer = _load(local_model_path)
    except (OSError, PermissionError) as e:
        # Report the cause so a permissions problem can be told apart from a
        # missing directory.
        print(f"Failed to load from {local_model_path} due to {e}. Attempting to download...")
        model, tokenizer = _load(base_model_id)

    # NOTE(review): id 0 is hard-coded as the padding token; confirm it is a
    # suitable padding id for the chosen tokenizer.
    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"
    return model, tokenizer

def generate_prompt_911(messages):
    """Render a call transcript as a text prompt ending with an operator cue.

    Parameters:
        messages: Iterable of dicts with 'role' and 'content' keys. Messages
            whose role is 'assistant' are labelled "Operator"; every other
            role is labelled "Caller".

    Returns:
        str: Instruction header, one "<label>: <content>" line per message,
        then a trailing "Operator:".
    """
    header = "You are a 911 operator. Your job is to handle emergency calls professionally and efficiently.\n\n"
    turns = []
    for message in messages:
        speaker = "Operator" if message['role'] == 'assistant' else "Caller"
        turns.append(f"{speaker}: {message['content']}\n")
    return header + "".join(turns) + "Operator:"

class FineTuner:
    """Drives LoRA fine-tuning of a causal language model on call transcripts.

    The base model and tokenizer are loaded on construction via
    ``setup_model_and_tokenizer``; ``finetune`` then loads the data, tokenizes
    it, trains LoRA adapters with the Hugging Face ``Trainer`` and saves the
    result to ``model_path``.
    """

    # File loaded when `finetune` is not given the path of an existing file.
    DEFAULT_DATA_FILE = "calls.jsonl"

    def __init__(self, base_model_id: str, model_path: str, device: torch.device):
        """Store the settings and load the base model and tokenizer.

        Parameters:
            base_model_id (str): Identifier of the base model to fine-tune.
            model_path (str): Directory the trained model is saved to.
            device (torch.device): Device the model is moved to for training.
        """
        self.base_model_id = base_model_id
        self.model_path = model_path
        self.device = device
        self.model, self.tokenizer = setup_model_and_tokenizer(base_model_id)

    def tokenize_data(self, data_point, add_eos_token=True, cutoff_len=512):
        """Turn one dataset record into tokenized training features.

        Parameters:
            data_point: Record with a "messages" list of role/content dicts.
            add_eos_token (bool): Append the EOS token when it is missing and
                the sequence is still shorter than ``cutoff_len``.
            cutoff_len (int): Maximum number of tokens kept (truncation limit).

        Returns:
            The tokenizer output with an added "labels" entry that is a copy
            of "input_ids" (the loss is computed over the whole sequence).
        """
        # NOTE(review): generate_prompt_911 ends the text with a bare
        # "Operator:" cue, so every training example ends with that cue
        # followed by EOS - confirm this is the intended training target.
        prompt = generate_prompt_911(data_point["messages"])
        tokenized = self.tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )

        input_ids = tokenized["input_ids"]
        # `not input_ids` guards the [-1] lookup against an empty sequence.
        missing_eos = not input_ids or input_ids[-1] != self.tokenizer.eos_token_id
        if add_eos_token and missing_eos and len(input_ids) < cutoff_len:
            input_ids.append(self.tokenizer.eos_token_id)
            tokenized["attention_mask"].append(1)

        tokenized["labels"] = input_ids.copy()
        return tokenized

    def prepare_data(self, data, val_set_size=100, seed=42):
        """Split the "train" split into train/validation sets and tokenize both.

        Parameters:
            data: Mapping with a "train" dataset.
            val_set_size (int): Size of the held-out validation set, passed
                as ``test_size`` to ``train_test_split``.
            seed (int): Seed for the split and for both shuffles, so repeated
                runs see the same data order.

        Returns:
            tuple: ``(train_data, val_data)`` after tokenization.
        """
        train_val_split = data["train"].train_test_split(
            test_size=val_set_size, shuffle=True, seed=seed
        )
        train_data = train_val_split["train"].shuffle(seed=seed).map(self.tokenize_data)
        val_data = train_val_split["test"].shuffle(seed=seed).map(self.tokenize_data)
        return train_data, val_data

    def train_model(self, train_data, val_data, training_args):
        """Attach LoRA adapters, train with ``Trainer`` and save the model.

        Parameters:
            train_data: Tokenized training dataset.
            val_data: Tokenized evaluation dataset.
            training_args: ``TrainingArguments`` controlling the run.
        """
        self.model = self.model.to(self.device)
        self.model.gradient_checkpointing_enable()

        # Adapters are attached to the query/key/value projections only.
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

        self.model = get_peft_model(self.model, lora_config)

        trainer = Trainer(
            model=self.model,
            train_dataset=train_data,
            eval_dataset=val_data,
            args=training_args,
            data_collator=DataCollatorForSeq2Seq(
                self.tokenizer,
                pad_to_multiple_of=8,
                return_tensors="pt",
                padding=True,
            ),
        )

        # Disable the generation cache for the duration of training.
        self.model.config.use_cache = False
        trainer.train()
        self.model.save_pretrained(self.model_path)

    @classmethod
    def _resolve_data_file(cls, data_path):
        """Return ``data_path`` if it is an existing file, else the default file."""
        if data_path and os.path.isfile(data_path):
            return data_path
        return cls.DEFAULT_DATA_FILE

    def finetune(self, data_path, training_args):
        """Load the JSON dataset, prepare it and run training.

        Parameters:
            data_path: Path of a local JSON/JSON-lines file. When it does not
                point to an existing file, ``DEFAULT_DATA_FILE`` is loaded
                instead (the behaviour before this parameter was honoured).
            training_args: ``TrainingArguments`` forwarded to ``train_model``.
        """
        data_file = self._resolve_data_file(data_path)
        if data_file != data_path:
            print(f"'{data_path}' is not a local file; loading '{data_file}' instead")

        print("⋄ LOADING DATASET")
        data = load_dataset("json", data_files=data_file)
        print("⋄ DONE LOADING DATASET")
        train_data, val_data = self.prepare_data(data)
        self.train_model(train_data, val_data, training_args)

def lets_finetune(
    device=DEVICE,
    model=BASE_MODEL,
    per_device_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=20,
    learning_rate=2e-5,
    max_steps=200,
):
    """Print the run configuration and fine-tune the model with LoRA.

    Parameters:
        device: Device to train on (a ``torch.device`` or a device string).
        model (str): Identifier of the base model to fine-tune.
        per_device_batch_size (int): Training batch size per device.
        gradient_accumulation_steps (int): Batches accumulated per optimizer step.
        warmup_steps (int): Learning-rate warmup steps.
        learning_rate (float): Peak learning rate.
        max_steps (int): Total number of optimizer steps.

    Checkpoints are written to ``ADAPTER_PATH`` and the final model to
    ``MODEL_PATH``.
    """
    # Accept a device string as well as a torch.device.
    device = torch.device(device)

    # Report the values actually used for this run (the arguments, not the
    # module-level defaults), so the banner stays truthful when overridden.
    print(f"\n{'='*60}")
    print("Training Parameters:")
    print(f"Foundation model: {model}")
    print(f"Model save path: {MODEL_PATH}")
    print(f"Device used: {device}")
    if device.type.startswith("xpu"):
        print(f"Intel GPU: {torch.xpu.get_device_name()}")
    print(f"Batch size per device: {per_device_batch_size}")
    print(f"Gradient accum. steps: {gradient_accumulation_steps}")
    print(f"Warmup steps: {warmup_steps}")
    print(f"Max steps: {max_steps}")
    print(f"Learning rate: {learning_rate}")
    print(f"{'='*60}\n")

    finetuner = FineTuner(base_model_id=model, model_path=MODEL_PATH, device=device)

    # Log, evaluate and checkpoint every 20 steps; keep at most 3 checkpoints
    # and reload the best one when training finishes.
    training_args = TrainingArguments(
        per_device_train_batch_size=per_device_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup_steps,
        max_steps=max_steps,
        learning_rate=learning_rate,
        bf16=True,
        use_ipex=True,
        logging_steps=20,
        save_strategy="steps",
        save_steps=20,
        evaluation_strategy="steps",
        eval_steps=20,
        optim="adamw_hf",
        output_dir=ADAPTER_PATH,
        save_total_limit=3,
        load_best_model_at_end=True,
        ddp_find_unused_parameters=False,
        group_by_length=True,
        report_to="wandb" if ENABLE_WANDB else [],
    )

    finetuner.finetune(DATA_PATH, training_args)

In [4]:
# Start fine-tuning with the default hyperparameters of lets_finetune().
# The recorded output of this cell shows the guard is satisfied when the cell
# is executed in the notebook.
if __name__ == "__main__":
    lets_finetune()


Training Parameters:
Foundation model: mistralai/Mistral-7B-v0.1
Model save path: ./final_model
Device used: xpu
Intel GPU: Intel(R) Data Center GPU Max 1100
Batch size per device: 4
Gradient accum. steps: 4
Warmup steps: 20
Max steps: 200
Learning rate: 2e-05

Attempting to load model and tokenizer from: /home/common/data/Big_Data/GenAI/llm_models/mistralai--Mistral-7B-v0.1


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

⋄ LOADING DATASET




⋄ DONE LOADING DATASET


Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


[2024-06-23 08:23:29,740] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to xpu (auto detect)
{'loss': 1.4396, 'grad_norm': 0.828125, 'learning_rate': 2e-05, 'epoch': 0.76}
{'eval_loss': 1.4161535501480103, 'eval_runtime': 7.1353, 'eval_samples_per_second': 14.015, 'eval_steps_per_second': 1.822, 'epoch': 0.76}
{'loss': 1.384, 'grad_norm': 0.515625, 'learning_rate': 1.7777777777777777e-05, 'epoch': 1.52}
{'eval_loss': 1.3492103815078735, 'eval_runtime': 7.1398, 'eval_samples_per_second': 14.006, 'eval_steps_per_second': 1.821, 'epoch': 1.52}
{'loss': 1.3029, 'grad_norm': 0.51953125, 'learning_rate': 1.555555555555556e-05, 'epoch': 2.29}
{'eval_loss': 1.3188127279281616, 'eval_runtime': 7.1273, 'eval_samples_per_second': 14.031, 'eval_steps_per_second': 1.824, 'epoch': 2.29}
{'loss': 1.2973, 'grad_norm': 0.458984375, 'learning_rate': 1.3333333333333333e-05, 'epoch': 3.05}
{'eval_loss': 1.3028552532196045, 'eval_runtime': 7.138, 'eval_samples_per_second': 14.009,

# Run Inference!

In [5]:
# Add these imports at the top of your script if not already present
from peft import PeftModel
import logging
import json

# Turn off Weights & Biases reporting for the inference cells.
os.environ["WANDB_DISABLED"] = "true"
# Run inference on the Intel XPU when one is available, otherwise on the CPU.
INFERENCE_DEVICE = torch.device("xpu" if torch.xpu.is_available() else "cpu")
print("INFERENCE_DEVICE = " + INFERENCE_DEVICE.type)

def generate_prompt_911(messages):
    """
    Builds the text prompt for a 911 call transcript.

    This redefines the function of the same name from the training cell with
    the same output format.

    Parameters:
        messages (list): Message dictionaries containing 'role' and 'content'.
            The 'assistant' role is rendered as "Operator", any other role as
            "Caller".

    Returns:
        str: The instruction header, one line per message, and a final
        "Operator:" cue for the model to continue from.
    """
    intro = "You are a 911 operator. Your job is to handle emergency calls professionally and efficiently.\n\n"
    transcript = "".join(
        f"{'Operator' if entry['role'] == 'assistant' else 'Caller'}: {entry['content']}\n"
        for entry in messages
    )
    return f"{intro}{transcript}Operator:"

def setup_model_and_tokenizer(base_model_id: str):
    """Load the base model and tokenizer for inference.

    A copy under ``MODEL_CACHE_PATH`` (directory name is the model id with
    "/" replaced by "--") is tried first. If that raises ``OSError`` or
    ``PermissionError``, the model id itself is handed to ``from_pretrained``.
    No quantization is applied. This definition replaces the training-cell
    function of the same name once the cell has run.

    Parameters:
        base_model_id (str): Model identifier, e.g. "mistralai/Mistral-7B-v0.1".

    Returns:
        tuple: ``(model, tokenizer)`` with the model moved to
        ``INFERENCE_DEVICE`` and the tokenizer padding on the left with
        token id 0.
    """
    local_model_path = os.path.join(MODEL_CACHE_PATH, base_model_id.replace("/", "--"))

    # AutoTokenizer is used for every model family. The previous special case
    # referenced LlamaTokenizer, which this notebook never imports, so any
    # model id containing "llama" failed with NameError.
    try:
        print(f"Attempting to load model and tokenizer from: {local_model_path}")
        model = AutoModelForCausalLM.from_pretrained(local_model_path)
        tokenizer = AutoTokenizer.from_pretrained(local_model_path)
    except (OSError, PermissionError) as e:
        print(f"Failed to load from {local_model_path} due to {e}. Attempting to download...")
        model = AutoModelForCausalLM.from_pretrained(base_model_id)
        tokenizer = AutoTokenizer.from_pretrained(base_model_id)

    # NOTE(review): id 0 is hard-coded as the padding token; confirm it is a
    # suitable padding id for the chosen tokenizer.
    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"
    return model.to(INFERENCE_DEVICE), tokenizer

class NineOneOneOperatorModel:
    """Handles 911 operator response generation for given call transcripts."""

    # Speaker labels produced by generate_prompt_911. Text after one of these
    # in the generated continuation belongs to a later, invented turn.
    _TURN_MARKERS = ("\nCaller:", "\nOperator:")

    def __init__(
        self, base_model_id=BASE_MODEL, use_adapter=False, lora_checkpoint=None, loaded_base_model=None
    ):
        """Load (or reuse) the base model and optionally attach LoRA adapters.

        Parameters:
            base_model_id (str): Model to load when ``loaded_base_model`` is
                not given.
            use_adapter (bool): Wrap the model with the adapters stored at
                ``lora_checkpoint``.
            lora_checkpoint (str): Adapter directory; required when
                ``use_adapter`` is true, ignored otherwise.
            loaded_base_model: An existing instance of this class whose model
                and tokenizer are reused instead of loading them again.

        Raises:
            ValueError: If ``use_adapter`` is true but no checkpoint is given.
            Exception: Any loading error is logged and re-raised.
        """
        try:
            if loaded_base_model:
                self.model = loaded_base_model.model
                self.tokenizer = loaded_base_model.tokenizer
            else:
                self.model, self.tokenizer = setup_model_and_tokenizer(base_model_id)
            if use_adapter:
                if not lora_checkpoint:
                    raise ValueError("use_adapter=True requires a lora_checkpoint path")
                # NOTE(review): PEFT is understood to attach adapter layers to
                # the wrapped module itself, so a `loaded_base_model` sharing
                # this model may be affected as well - confirm before reusing
                # the base instance afterwards.
                self.model = PeftModel.from_pretrained(self.model, lora_checkpoint)
        except Exception as e:
            logging.error(f"Exception occurred during model initialization: {e}")
            raise

        self.model.to(INFERENCE_DEVICE)
        self.max_length = 512

    @classmethod
    def _first_turn(cls, continuation):
        """Return the text before the next speaker label, stripped."""
        cut = len(continuation)
        for marker in cls._TURN_MARKERS:
            position = continuation.find(marker)
            if position != -1:
                cut = min(cut, position)
        return continuation[:cut].strip()

    def generate(self, messages, **kwargs):
        """Generates a 911 operator response based on the given call transcript.

        Parameters:
            messages (list): List of message dictionaries containing 'role' and 'content'.
            **kwargs: Extra arguments forwarded to ``model.generate``; they
                override the defaults (``do_sample=True``, ``max_length``,
                ``temperature=0.3``, ``repetition_penalty=1.2``).

        Returns:
            str: The generated 911 operator response (only the operator's
            next turn, without the prompt).

        Raises:
            Exception: Any generation error is logged and re-raised.
        """
        try:
            prompt = generate_prompt_911(messages)
            encoded_prompt = self.tokenizer(
                prompt,
                truncation=True,
                max_length=self.max_length,
                padding=False,
                return_tensors="pt",
            ).to(INFERENCE_DEVICE)
            input_ids = encoded_prompt.input_ids

            generation_args = {
                "do_sample": True,
                "max_length": self.max_length,
                "temperature": 0.3,
                "repetition_penalty": 1.2,
            }
            # A caller-supplied max_new_tokens replaces the default total
            # length limit instead of competing with it.
            if "max_new_tokens" in kwargs and "max_length" not in kwargs:
                del generation_args["max_length"]
            generation_args.update(kwargs)

            # Autocast for the device actually in use; the CUDA-specific
            # context manager has no effect on XPU or CPU.
            with torch.no_grad(), torch.autocast(device_type=INFERENCE_DEVICE.type):
                outputs = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=encoded_prompt.attention_mask,
                    **generation_args,
                )

            # Decode only the newly generated tokens, then keep the first
            # turn so an invented follow-up dialogue is not returned.
            new_tokens = outputs[0][input_ids.shape[-1]:]
            continuation = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            return self._first_turn(continuation)
        except Exception as e:
            logging.error(f"Exception occurred during response generation: {e}")
            raise

INFERENCE_DEVICE = xpu


In [6]:
from IPython.display import display, HTML

# Initialize models
# The base model is loaded without adapters; `lora_checkpoint` is not used
# when `use_adapter` is False.
base_model = NineOneOneOperatorModel(
    use_adapter=False,
    lora_checkpoint="",
)
# Placeholder; the fine-tuned wrapper is created further down in this cell
# once the LoRA checkpoint directory is found.
finetuned_model = None

# Sample 911 call data
# JSON text (parsed later with json.loads): a list of records, each with a
# "messages" list of {"role", "content"} dicts. Every sample ends on a caller
# message, so the model is asked for the operator's next reply.
samples = """
[
  {
    "messages": [
      {"role": "assistant", "content": "9-1-1, what's your emergency?"},
      {"role": "user", "content": "I hear strange noises coming from my neighbor's house."},
      {"role": "assistant", "content": "Can you tell me your location?"},
      {"role": "user", "content": "I'm at 123 Main Street."}
    ]
  },
  {
    "messages": [
      {"role": "assistant", "content": "9-1-1, what's your emergency?"},
      {"role": "user", "content": "There's a fire in my kitchen!"},
      {"role": "assistant", "content": "Are you in a safe location?"},
      {"role": "user", "content": "Yes, I'm outside the house now."}
    ]
  },
  {
    "messages": [
      {"role": "assistant", "content": "9-1-1, what's your emergency?"},
      {"role": "user", "content": "I think someone's breaking into my car."},
      {"role": "assistant", "content": "What's your current location?"},
      {"role": "user", "content": "I'm at the shopping mall on 5th Avenue."}
    ]
  }
]
"""

def run_inference(sample_data, model, finetuned=False):
    """Generate and display an operator response for every sample call.

    Parameters:
        sample_data (list): Records with a "messages" list of role/content dicts.
        model: Object with a ``generate(messages)`` method returning text.
        finetuned (bool): Selects the label and colour used in the output.

    A failure on one sample is logged and the remaining samples are still
    processed.
    """
    import html  # standard library; used to escape text placed into HTML

    # Release cached accelerator memory before generating, on whichever
    # backend is in use.
    device_type = INFERENCE_DEVICE.type
    if device_type.startswith("cuda"):
        torch.cuda.empty_cache()
    elif device_type.startswith("xpu") and hasattr(torch, "xpu"):
        torch.xpu.empty_cache()

    color = "#4CAF52" if finetuned else "#2196F4"
    model_type = "finetuned" if finetuned else "base"
    display(HTML(f"<div style='color:{color};'>Processing 911 calls on {INFERENCE_DEVICE} please wait...</div>"))

    for index, sample in enumerate(sample_data):
        try:
            messages = sample["messages"]
            output = model.generate(messages)

            # Escape transcript and model text so characters such as "<" are
            # shown literally instead of being interpreted as markup.
            transcript_html = "<br>".join(
                f"{html.escape(m['role'].capitalize())}: {html.escape(str(m['content']))}"
                for m in messages
            )
            response_html = html.escape(str(output))

            tabbed_output = f"""
            <details>
                <summary style='color: {color};'><b>{model_type} model - Sample {index+1}</b> (Click to expand)</summary>
                <div style='padding-left: 20px;'>
                    <p><b>Call Transcript 📞:</b><br>{transcript_html}</p>
                    <p><b>Generated response 💡:</b><br>{response_html}</p>
                </div>
            </details>
            <hr style='border-top: 1px solid #bbb;'>"""
            display(HTML(tabbed_output))
        except Exception as e:
            logging.error(f"Exception occurred during sample processing: {e}")

# Training checkpoints are written to ADAPTER_PATH (`./lora_adapters`) as
# `checkpoint-<step>` directories.
# Update USING_CHECKPOINT to the step you want to use.
USING_CHECKPOINT = 200
# The model saved at the end of training lives in `./final_model` and can be
# used as LORA_CHECKPOINT instead.
# NOTE(review): an earlier comment mentioned `./final_model_interrupted/` for
# interrupted runs, but no code in this notebook writes that directory.
LORA_CHECKPOINT = f"{ADAPTER_PATH}/checkpoint-{USING_CHECKPOINT}/"

if os.path.exists(LORA_CHECKPOINT):
    sample_data = json.loads(samples)
    # Run the base model first, before the adapters are attached to it.
    run_inference(sample_data, model=base_model)
    if finetuned_model is None:
        finetuned_model = NineOneOneOperatorModel(
            use_adapter=True,
            lora_checkpoint=LORA_CHECKPOINT,
            loaded_base_model=base_model
        )
    run_inference(sample_data, model=finetuned_model, finetuned=True)
else:
    # Say why nothing was displayed instead of finishing silently.
    print(
        f"LoRA checkpoint not found at {LORA_CHECKPOINT}; skipping inference. "
        "Set USING_CHECKPOINT to a step that exists under the adapter directory."
    )

Attempting to load model and tokenizer from: /home/common/data/Big_Data/GenAI/llm_models/mistralai--Mistral-7B-v0.1


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]