File size: 12,836 Bytes

bcffb9c

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cf86fed9cae54700b31a616cd82b7180",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "# load dataset from data.jsonl file:\n",
    "eli5 = load_dataset(\"json\", data_files=\"data3.jsonl\", split=\"train[:80%]\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "eli5 = eli5.train_test_split(test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': \"Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. Here is the text: : Board meeting next Tuesday at 10 AM.\\nThe Details are as follows: {'datetime': '2024-03-19T10:00:00', 'description': 'Board meeting', 'location': ''}\\n\"}"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eli5[\"train\"][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"distilbert/distilgpt2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "eli5 = eli5.flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': \"Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. Here is the text: : Board meeting next Tuesday at 10 AM.\\nThe Details are as follows: {'datetime': '2024-03-19T10:00:00', 'description': 'Board meeting', 'location': ''}\\n\"}"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eli5[\"train\"][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_function(examples):\n",
    "    return tokenizer([\" \".join(x) for x in examples])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7d326a1d4117454f98bfd6c7f575120c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map (num_proc=4):   0%|          | 0/49 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "13f25ef46a43486ea69fec77f62f7c9a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map (num_proc=4):   0%|          | 0/13 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "tokenized_eli5 = eli5.map(\n",
    "    preprocess_function,\n",
    "    batched=True,\n",
    "    num_proc=4,\n",
    "    remove_columns=eli5[\"train\"].column_names,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "block_size = 128\n",
    "\n",
    "\n",
    "def group_texts(examples):\n",
    "    # Concatenate all texts.\n",
    "    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n",
    "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
    "    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
    "    # customize this part to your needs.\n",
    "    if total_length >= block_size:\n",
    "        total_length = (total_length // block_size) * block_size\n",
    "    # Split by chunks of block_size.\n",
    "    result = {\n",
    "        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
    "        for k, t in concatenated_examples.items()\n",
    "    }\n",
    "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a571ed26269640278514bfb2b02b1e03",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3385544255ec4af79d74e2d131845e07",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import DataCollatorForLanguageModeling\n",
    "\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForCausalLM, TrainingArguments, Trainer\n",
    "\n",
    "model = AutoModelForCausalLM.from_pretrained(\"distilbert/distilgpt2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model moved to MPS device\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "# Check that MPS is available\n",
    "if not torch.backends.mps.is_available():\n",
    "    if not torch.backends.mps.is_built():\n",
    "        print(\"MPS not available because the current PyTorch install was not \"\n",
    "              \"built with MPS enabled.\")\n",
    "    else:\n",
    "        print(\"MPS not available because the current MacOS version is not 12.3+ \"\n",
    "              \"and/or you do not have an MPS-enabled device on this machine.\")\n",
    "\n",
    "else:\n",
    "    mps_device = torch.device(\"mps\")\n",
    "    model.to(mps_device)\n",
    "    print(\"Model moved to MPS device\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a71332654a414bfe87d416ce502c9cdc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eab9e9bcc2814d3e8ba55806dc9d4a4f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 6.667893886566162, 'eval_runtime': 0.0262, 'eval_samples_per_second': 152.548, 'eval_steps_per_second': 38.137, 'epoch': 1.0}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b9634b37ea2d436c9700ce311651fdae",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 6.2145514488220215, 'eval_runtime': 0.1232, 'eval_samples_per_second': 32.47, 'eval_steps_per_second': 8.118, 'epoch': 2.0}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3af25ad29ad04c319677ec04dc22d3d1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 5.993268966674805, 'eval_runtime': 0.0204, 'eval_samples_per_second': 196.346, 'eval_steps_per_second': 49.087, 'epoch': 3.0}\n",
      "{'train_runtime': 1.588, 'train_samples_per_second': 7.556, 'train_steps_per_second': 1.889, 'train_loss': 6.412024815877278, 'epoch': 3.0}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=3, training_loss=6.412024815877278, metrics={'train_runtime': 1.588, 'train_samples_per_second': 7.556, 'train_steps_per_second': 1.889, 'train_loss': 6.412024815877278, 'epoch': 3.0})"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_args = TrainingArguments(\n",
    "    output_dir=\"my_awesome_eli5_clm-model\",\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    weight_decay=0.01,\n",
    "    push_to_hub=True,\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=lm_dataset[\"train\"],\n",
    "    eval_dataset=lm_dataset[\"test\"],\n",
    "    data_collator=data_collator,\n",
    ")\n",
    "\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'generated_text': \"Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. Here is the text: : Let's meet for lunch tomorrow at 12 PM at the Italian restaurant on Main Street.\\nThe Details are as follows: { if (is_empty(the_time()) : return next_day_long(_.length(this); } } }\\nThe Time is: 12 PM on Sunday 12th at the Italian restaurant on Main Street.\\nTaste: 12 PM on Sunday 8th at the Italian restaurant on Main Street.\\nThe Time is: 11 PM on Monday 9th at the Italian restaurant on Main Street.\\nThe Time is: 11 AM on Monday 9th at the Italian restaurant on Main Street.\\nTaste: 11 AM on Sunday 8th at the Italian restaurant on Main Street.\\nThe Time is: 11 AM on Monday 9th at the Italian restaurant on Main Street.\\nThe Time is: 11 AM\"}]"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prompt =  \"Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. Here is the text: : Let's meet for lunch tomorrow at 12 PM at the Italian restaurant on Main Street.\\nThe Details are as follows: {\"\n",
    "from transformers import pipeline\n",
    "\n",
    "generator = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, max_length=200)\n",
    "generator(prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}