In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [2]:
# Read the training file into a list, one entry per line, and report how many
# entries were loaded. The encoding is given explicitly so that accented
# characters decode the same way on every platform instead of depending on
# the locale's default encoding.
with open("./titulos.txt", encoding="utf-8") as file:
    manchetes = [line.rstrip() for line in file]
print(len(manchetes))

100000


In [3]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` for the given text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm=False):
    """Build a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded to the collator's ``mlm`` argument (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a GPT-2 language-model head on a text file and save the result.

    Args:
        train_file_path: Path of the training text file, passed to ``load_dataset``.
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory where the tokenizer, the initial model and the
            trainer outputs are written.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``. Previously this
            argument was accepted but silently ignored.

    Returns:
        None. The trained model is written through ``trainer.save_model()``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so that output_dir can later be
    # used as the single path for loading both.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # The not-yet-fine-tuned weights are written first; kept from the original
    # flow so output_dir holds a loadable model even before training finishes.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,  # honour the caller's setting
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [4]:
# Settings for the fine-tuning run; each name matches the train() parameter
# it is passed to.

# Paths
train_file_path = "./titulos.txt"
output_dir = "./result"
overwrite_output_dir = False

# Base checkpoint identifier
model_name = "gpt2"

# Training schedule
num_train_epochs = 5.0
per_device_train_batch_size = 8
save_steps = 500

In [5]:
# Run the fine-tuning with the settings defined in the configuration cell.
train(
    # data and base checkpoint
    model_name=model_name,
    train_file_path=train_file_path,
    # output location
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    # schedule
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
)



  0%|          | 0/11455 [00:00<?, ?it/s]

{'loss': 4.0109, 'learning_rate': 4.7817546922741165e-05, 'epoch': 0.22}
{'loss': 3.3335, 'learning_rate': 4.563509384548232e-05, 'epoch': 0.44}
{'loss': 3.0906, 'learning_rate': 4.3452640768223485e-05, 'epoch': 0.65}
{'loss': 2.9653, 'learning_rate': 4.127018769096465e-05, 'epoch': 0.87}
{'loss': 2.855, 'learning_rate': 3.9087734613705804e-05, 'epoch': 1.09}
{'loss': 2.764, 'learning_rate': 3.690528153644697e-05, 'epoch': 1.31}
{'loss': 2.7204, 'learning_rate': 3.472282845918813e-05, 'epoch': 1.53}
{'loss': 2.6883, 'learning_rate': 3.2540375381929286e-05, 'epoch': 1.75}
{'loss': 2.6503, 'learning_rate': 3.035792230467045e-05, 'epoch': 1.96}
{'loss': 2.5871, 'learning_rate': 2.817546922741161e-05, 'epoch': 2.18}
{'loss': 2.5675, 'learning_rate': 2.5993016150152772e-05, 'epoch': 2.4}
{'loss': 2.5442, 'learning_rate': 2.3810563072893935e-05, 'epoch': 2.62}
{'loss': 2.5271, 'learning_rate': 2.1628109995635095e-05, 'epoch': 2.84}
{'loss': 2.4952, 'learning_rate': 1.9445656918376258e-05, 'e

In [8]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="./result"):
    """Sample a continuation of ``sequence`` from a saved model and print it.

    Args:
        sequence: Prompt; converted with ``str()`` before being encoded.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        model_path: Directory passed to both ``load_model`` and
            ``load_tokenizer``. Defaults to "./result", the location that
            was previously hard-coded inside the function.

    Returns:
        None. The decoded text of the first generated sequence is printed.
    """
    # NOTE: model and tokenizer are reloaded from disk on every call; load
    # them once outside if this is called repeatedly.
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    ids = tokenizer.encode(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        # The end-of-sequence id is reused as the padding id.
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [None]:
# Sample one continuation for the prompt below.
max_len = 19
sequence = "Juliette"
generate_text(sequence=sequence, max_length=max_len)

Juliette rebate críticas à mostra em clique sexy em evento

