# ============================================================================
# Notebook: fine-tune GPT-2 on news headlines ("manchetes") from titulos.txt.
# Cells below are shown in notebook order; Restart-&-Run-All reproduces them.
# ============================================================================

# --- Cell 1: imports ---------------------------------------------------------
# Deduplicated: the original imported GPT2Tokenizer / GPT2LMHeadModel three
# times across repeated `from transformers import ...` lines. No import was
# removed, only merged (several are unused here but may be used in cells
# beyond this view, e.g. pandas / numpy / AdamW).
import csv
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
from transformers import (
    AdamW,
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup,
)

# --- Cell 2: load the headline corpus ---------------------------------------
# One headline per line; rstrip() drops the trailing newline. Explicit UTF-8
# instead of the platform default — the corpus is Portuguese text, and the
# default encoding on Windows (the recorded env) is not UTF-8.
# NOTE(review): assumes titulos.txt is UTF-8 encoded — confirm against the file.
with open("./titulos.txt", encoding="utf-8") as file:
    manchetes = [line.rstrip() for line in file]
print(len(manchetes))  # recorded output: 100000


# --- Cell 3: training helpers ------------------------------------------------
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a line-level language-modeling dataset from a text file.

    NOTE: TextDataset is deprecated (the recorded run shows its
    FutureWarning); the 🤗 Datasets library is the recommended replacement.
    Kept here to preserve behavior.

    Args:
        file_path: path to the plain-text training file.
        tokenizer: tokenizer used to encode the text.
        block_size: token length of each training chunk.

    Returns:
        A TextDataset of fixed-size token blocks.
    """
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm=False):
    """Create a language-modeling collator.

    Args:
        tokenizer: tokenizer whose pad/special tokens the collator uses.
        mlm: False (default) selects causal LM batching, matching GPT-2.

    Returns:
        A DataCollatorForLanguageModeling instance.
    """
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator


def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 model on a text file and save the result.

    Args:
        train_file_path: plain-text corpus, one sample per line.
        model_name: Hugging Face model id (e.g. 'gpt2').
        output_dir: directory for tokenizer, checkpoints, and final model.
        overwrite_output_dir: allow reuse of an existing output_dir.
        per_device_train_batch_size: batch size per device.
        num_train_epochs: number of training epochs (float accepted).
        save_steps: checkpoint interval in optimizer steps.

    Bug fix: `save_steps` was accepted by this function but never forwarded
    to TrainingArguments, so the caller's value was silently ignored; it is
    now passed through.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Save tokenizer (and the base model below) up-front so output_dir is a
    # loadable checkpoint even if training is interrupted.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,  # was missing — parameter was silently dropped
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()  # final weights into output_dir


# --- Cell 4: configuration ----------------------------------------------------
train_file_path = "./titulos.txt"
model_name = 'gpt2'
output_dir = './result'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500