import os

# Collect every .txt file in the current directory into a single
# training file, normalising newlines to spaces along the way.
def read_txt(file_path):
    try:
        with open(file_path, "r") as file:
            return file.read()
    except (OSError, UnicodeDecodeError):
        return ""

data = ""
for filename in os.listdir("./"):
    # Skip the output file itself so it is never folded into the corpus.
    if filename.endswith(".txt") and filename != "train.txt":
        data += read_txt(os.path.join("./", filename))

# Flatten the corpus onto one line: TextDataset chunks raw text by
# block_size, so newlines carry no structure here.
data = " ".join(data.split("\n"))

with open("train.txt", "w") as f:
    f.write(data)

from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

def load_dataset(file_path, tokenizer, block_size=128):
    # TextDataset splits the file into contiguous token blocks of
    # block_size. (It is deprecated in recent transformers releases in
    # favour of the datasets library, but still works for this script.)
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def load_data_collator(tokenizer, mlm=False):
    # mlm=False gives causal language modelling: labels are the inputs
    # shifted by one position, which is how GPT-2 is trained.
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )

def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Save the tokenizer (and the base weights, below) into output_dir so
    # the directory is a complete, loadable checkpoint from the start.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

train_file_path = "train.txt"
model_name = "gpt2"
output_dir = "model"
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000

train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

def load_model(model_path):
    return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length):
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(sequence, return_tensors="pt")
    # Sampled decoding: top_k/top_p restrict the candidate pool at each
    # step; pad_token_id is set to eos because GPT-2 has no pad token.
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

# Load from the same relative directory that train() wrote to.
model_path = "model"
sequence = "Hello!"
max_len = 50
generate_text(model_path, sequence, max_len)
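
# As an alternative to the manual generate_text() helper above, the same
# fine-tuned checkpoint can be loaded through transformers' pipeline API,
# which bundles tokenisation, generation, and decoding into one call.
# A minimal sketch, assuming the "model" directory produced by train()
# above; the prompt string is illustrative.
from transformers import pipeline

generator = pipeline("text-generation", model="model", tokenizer="model")
result = generator("Hello!", max_length=50, do_sample=True, top_k=50, top_p=0.95)
print(result[0]["generated_text"])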