{ "cells": [ { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cf86fed9cae54700b31a616cd82b7180", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0 examples [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from datasets import load_dataset\n", "\n", "# Read the JSON-lines file data3.jsonl and keep only the first 80% of its records.\n", "eli5 = load_dataset(\"json\", data_files=\"data3.jsonl\", split=\"train[:80%]\")" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "# Hold out 20% of the loaded records as the test split. The split shuffles, so the seed is\n", "# pinned to get the same train/test partition on every run.\n", "eli5 = eli5.train_test_split(test_size=0.2, seed=42)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'text': \"Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. Here is the text: : Board meeting next Tuesday at 10 AM.\\nThe Details are as follows: {'datetime': '2024-03-19T10:00:00', 'description': 'Board meeting', 'location': ''}\\n\"}" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eli5[\"train\"][0]" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert/distilgpt2\")" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): flatten() only expands nested columns; the sample record shown above has a\n", "# single flat 'text' field, so this is expected to leave the schema unchanged - confirm.\n", "eli5 = eli5.flatten()" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'text': \"Extract the calendar events from the following text, the text will contain a place, time , land possibly a location. 
Here is the text: : Board meeting next Tuesday at 10 AM.\\nThe Details are as follows: {'datetime': '2024-03-19T10:00:00', 'description': 'Board meeting', 'location': ''}\\n\"}" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eli5[\"train\"][0]" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "def preprocess_function(examples):\n", "    \"\"\"Tokenize the ``text`` column of a batch of records.\n", "\n", "    Args:\n", "        examples: Mapping from column name to the list of values in that column\n", "            (assumes the function is applied with ``Dataset.map(..., batched=True)`` - confirm).\n", "\n", "    Returns:\n", "        The output of ``tokenizer`` for the batch, one entry per input record.\n", "    \"\"\"\n", "    # Index the column explicitly: iterating over the batch mapping yields column *names*,\n", "    # so the earlier `\" \".join(x) for x in examples` tokenized the string \"t e x t\"\n", "    # instead of the records themselves.\n", "    return tokenizer(examples[\"text\"])" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7d326a1d4117454f98bfd6c7f575120c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map (num_proc=4): 0%| | 0/49 [00:00= block_size:\n", " total_length = (total_length // block_size) * block_size\n", " # Split by chunks of block_size.\n", " result = {\n", " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n", " for k, t in concatenated_examples.items()\n", " }\n", " result[\"labels\"] = result[\"input_ids\"].copy()\n", " return result" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a571ed26269640278514bfb2b02b1e03", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map (num_proc=4): 0%| | 0/4 [00:00