{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install torch transformers scikit-learn wandb accelerate tqdm\n", "from IPython.display import clear_output\n", "clear_output(wait=True)\n", "print(\".\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!apt-get update\n", "!apt-get install zstd\n", "!tar --use-compress-program=unzstd -xvf bert_streamed_dataset.tar.zst\n", "clear_output(wait=True)\n", "print(\".\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments\n", "from sklearn.model_selection import train_test_split\n", "from tqdm import tqdm\n", "import wandb\n", "import json\n", "\n", "# Initialize W&B\n", "wandb.init(project=\"distilbert-ai-text-classification\")\n", "\n", "# Check if MPS is available and set the device\n", "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", "print(device)\n", "\n", "# Load pre-trained DistilBERT tokenizer and model\n", "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)\n", "model.to(device)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load the JSONL dataset\n", "data = []\n", "total_num_of_lines = 0\n", "with open('bert_reddit_vs_synth_writing_prompts.jsonl', 'r') as infile:\n", " for line in tqdm(infile, desc=\"Checking dataset size\"):\n", " total_num_of_lines += 1\n", "\n", "with open('bert_reddit_vs_synth_writing_prompts.jsonl', 'r') as infile:\n", " for line in tqdm(infile, desc=\"Loading dataset\", total=total_num_of_lines):\n", " data.append(json.loads(line))\n", "\n", "# Extract texts and labels\n", "print(\"Extracting texts and labels\")\n", "texts = [entry['text'] for entry in data]\n", "labels = [entry['label'] for entry in data]\n", "\n", "# Tokenize the text\n", "print(\"Tokenizing text\")\n", "inputs = tokenizer(texts, padding=True, truncation=True, return_tensors=\"pt\")\n", "\n", "# Move input tensors to the device\n", "print(\"Moving input tensors\")\n", "inputs = {key: val for key, val in inputs.items()}\n", "\n", "# Split the data into training and validation sets\n", "print(\"Splitting data into train and validation\")\n", "train_inputs, val_inputs, train_labels, val_labels = train_test_split(\n", " inputs['input_ids'], labels, test_size=0.2, random_state=42)\n", "\n", "train_attention_masks, val_attention_masks, _, _ = train_test_split(\n", " inputs['attention_mask'], labels, test_size=0.2, random_state=42)\n", "\n", "# Create a PyTorch dataset\n", "class TextDataset(torch.utils.data.Dataset):\n", " def __init__(self, input_ids, attention_masks, labels):\n", " self.input_ids = input_ids\n", " self.attention_masks = attention_masks\n", " self.labels = labels\n", "\n", " def __len__(self):\n", " return len(self.labels)\n", "\n", " def __getitem__(self, idx):\n", " return {\n", " 'input_ids': self.input_ids[idx],\n", " 'attention_mask': self.attention_masks[idx],\n", " 'labels': torch.tensor(self.labels[idx])\n", " }\n", "\n", "print(\"Creating pytorch datasets\")\n", "train_dataset = TextDataset(train_inputs, train_attention_masks, train_labels)\n", "val_dataset = TextDataset(val_inputs, val_attention_masks, val_labels)" ] }, 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Reduce eval set to X examples to speed up training\n", "NUM_OF_EVAL_EXAMPLES = 1000\n", "val_inputs_subset = val_inputs[:NUM_OF_EVAL_EXAMPLES]\n", "val_attention_masks_subset = val_attention_masks[:NUM_OF_EVAL_EXAMPLES]\n", "val_labels_subset = val_labels[:NUM_OF_EVAL_EXAMPLES]\n", "\n", "# Create a TextDataset with only X examples\n", "val_dataset = Textdataset(val_inputs_subset, val_attention_masks_subset, val_labels_subset)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define the training arguments\n", "training_args = TrainingArguments(\n", " output_dir='./distil-bert-train-results', \n", " num_train_epochs=3, \n", " per_device_train_batch_size=16, \n", " per_device_eval_batch_size=16, \n", " warmup_steps=500, # number of warmup steps for learning rate scheduler\n", " weight_decay=0.01, \n", " logging_dir='./logs', \n", " logging_steps=10, \n", " report_to=\"wandb\", \n", " evaluation_strategy=\"steps\", # Evaluate every logging step\n", " eval_steps=100, # Evaluate every 10 steps\n", " fp16=True,\n", ")\n", "\n", "# Create the Trainer\n", "trainer = Trainer(\n", " model=model, # the instantiated 🤗 Transformers model to be trained\n", " args=training_args, # training arguments, defined above\n", " train_dataset=train_dataset, # training dataset\n", " eval_dataset=val_dataset # evaluation dataset\n", ")\n", "\n", "# Train the model\n", "trainer.train()\n", "\n", "# Save the model\n", "model.save_pretrained('./distil-bert-train-final-result')\n", "\n", "# Finish the W&B run\n", "wandb.finish()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 4 }