gpthack / main.py
shivam001's picture
Upload 20 files
f494faf
raw
history blame
3.44 kB
import pandas as pd
import numpy as np
import os
def read_txt(file_path):
text = ""
try:
with open(file_path, "r") as file:
text = file.read()
except:
text = ""
return text
with open("train.txt", "w") as f:
f.write('')
data = ""
for filename in os.listdir("./"):
file_path = os.path.join("./", filename)
if file_path.endswith(".txt") and (file_path != "train.txt"):
data += read_txt(file_path)
data = ' '.join(data.split('\n'))
with open("train.txt", "a") as f:
f.write(data)
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
def load_dataset(file_path, tokenizer, block_size = 128):
dataset = TextDataset(
tokenizer = tokenizer,
file_path = file_path,
block_size = block_size,
)
return dataset
def load_data_collator(tokenizer, mlm = False):
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=mlm,
)
return data_collator
def train(train_file_path,model_name,
output_dir,
overwrite_output_dir,
per_device_train_batch_size,
num_train_epochs,
save_steps):
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
train_dataset = load_dataset(train_file_path, tokenizer)
data_collator = load_data_collator(tokenizer)
tokenizer.save_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.save_pretrained(output_dir)
training_args = TrainingArguments(
output_dir=output_dir,
overwrite_output_dir=overwrite_output_dir,
per_device_train_batch_size=per_device_train_batch_size,
num_train_epochs=num_train_epochs,
)
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset,
)
trainer.train()
trainer.save_model()
train_file_path = "train.txt"
model_name = 'gpt2'
output_dir = 'model'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000
train(
train_file_path=train_file_path,
model_name=model_name,
output_dir=output_dir,
overwrite_output_dir=overwrite_output_dir,
per_device_train_batch_size=per_device_train_batch_size,
num_train_epochs=num_train_epochs,
save_steps=save_steps
)
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
def load_model(model_path):
model = GPT2LMHeadModel.from_pretrained(model_path)
return model
def load_tokenizer(tokenizer_path):
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
return tokenizer
def generate_text(model_path, sequence, max_length):
model = load_model(model_path)
tokenizer = load_tokenizer(model_path)
ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
final_outputs = model.generate(
ids,
do_sample=True,
max_length=max_length,
pad_token_id=model.config.eos_token_id,
top_k=50,
top_p=0.95,
)
print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))
model_path = "/model/"
sequence = "Hello!"
max_len = 50
generate_text(model_path, sequence, max_len)