|
import os |
|
import time |
|
import datetime |
|
|
|
import pandas as pd |
|
import seaborn as sns |
|
import numpy as np |
|
import random |
|
|
|
import matplotlib.pyplot as plt |
|
|
|
import torch |
|
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler |
|
|
|
|
|
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
|
from transformers import AdamW, get_linear_schedule_with_warmup |
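# Note: the AdamW imported above from transformers is deprecated in newer
# releases; torch.optim.AdamW is the equivalent replacement if this import fails.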
|
|
|
import nltk |
|
nltk.download('punkt') |
|
|
|
import sys |
|
|
|
import pytz |
|
IST = pytz.timezone('Asia/Kolkata') |
|
stamp = datetime.datetime.now(IST).strftime("%c") |
|
|
|
print('\n') |
|
print('='*100) |
|
print('='*100) |
|
print('\t\t=Experiment6=',stamp) |
|
print('='*100) |
|
print('='*100) |
|
|
|
out_path = '/media/data_dump/Ritwik/ggpt/' |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hyper_params = {'rseed': 123} |
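
# Seed every source of randomness (Python hash seed, random, NumPy, PyTorch
# CPU/GPU, transformers) so the data split and training run are reproducible.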
|
|
|
import transformers, psutil  # torch, numpy, random and time are already imported above
|
os.environ['PYTHONHASHSEED'] = str(hyper_params['rseed']) |
|
|
|
torch.manual_seed(hyper_params['rseed']) |
|
torch.cuda.manual_seed(hyper_params['rseed']) |
|
torch.cuda.manual_seed_all(hyper_params['rseed']) |
|
|
|
np.random.seed(hyper_params['rseed']) |
|
random.seed(hyper_params['rseed']) |
|
transformers.set_seed(hyper_params['rseed']) |
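
# GPT-2's pretrained vocabulary only defines '<|endoftext|>', so '<|startoftext|>'
# and '<|pad|>' are registered as extra special tokens below; the model's embedding
# matrix is resized to match via model.resize_token_embeddings(len(tokenizer)).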
|
|
|
|
|
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') |
|
|
|
sfile = '/media/nas_mount/Ritwik/Ai4Bharat_text_corpora/data/en/en_clean.txt' |
|
print(sfile) |
|
file = open(sfile,'r') |
|
lines = file.readlines() |
|
file.close() |
|
lines = [[x.strip()] for x in lines] |
|
|
|
df = pd.DataFrame(lines, columns=['bio_main']) |
|
|
|
print('Dataframe created') |
|
df.dropna(inplace=True) |
|
bios = df.bio_main.copy() |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length)) |
|
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id)) |
|
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id)) |
|
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id)) |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
batch_size = 8 |
|
|
|
class GPT2Dataset(Dataset): |
|
|
|
def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768): |
|
|
|
self.tokenizer = tokenizer |
|
self.max_length = max_length |
|
|
|
|
|
self.sents = list(txt_list) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def __len__(self): |
|
|
|
return len(self.sents) |
|
|
|
def __getitem__(self, idx): |
|
|
|
txt = self.sents[idx] |
|
encodings_dict = self.tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=self.max_length, padding="max_length") |
|
input_ids = torch.tensor(encodings_dict['input_ids']) |
|
attn_masks = torch.tensor(encodings_dict['attention_mask']) |
|
return input_ids, attn_masks |
|
|
|
dataset = GPT2Dataset(bios, tokenizer, max_length=500) |
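# This first Dataset tokenizes lazily in __getitem__, so the large corpus never
# has to be held in encoded form in memory (each example is re-tokenized per epoch).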
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_size = int(0.9 * len(dataset)) |
|
val_size = len(dataset) - train_size |
|
|
|
train_dataset, val_dataset = random_split(dataset, [train_size, val_size]) |
|
|
|
print('{:>5,} training samples'.format(train_size)) |
|
print('{:>5,} validation samples'.format(val_size)) |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
|
|
train_dataloader = DataLoader( |
|
train_dataset, |
|
sampler = RandomSampler(train_dataset), |
|
batch_size = batch_size |
|
) |
|
|
|
|
|
validation_dataloader = DataLoader( |
|
val_dataset, |
|
sampler = SequentialSampler(val_dataset), |
|
batch_size = batch_size |
|
) |
|
|
|
|
|
|
|
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False) |
|
|
|
|
|
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration) |
|
|
|
|
|
|
|
model.resize_token_embeddings(len(tokenizer)) |
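# Required because extra special tokens were added to the tokenizer; without the
# resize their ids would fall outside the pretrained embedding matrix.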
|
|
|
|
|
device = torch.device("cuda") |
|
|
|
model = model.to(device) |
|
|
|
print('Model loaded to GPU') |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
epochs = 1 |
|
learning_rate = 5e-4 |
|
warmup_steps = 1e2 |
|
epsilon = 1e-8 |
|
|
|
|
|
sample_every = 1000 |
|
|
|
|
|
optimizer = AdamW(model.parameters(), |
|
lr = learning_rate, |
|
eps = epsilon |
|
) |
|
|
|
|
|
|
|
total_steps = len(train_dataloader) * epochs |
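
# The scheduler below ramps the learning rate up linearly for `warmup_steps`
# optimizer updates, then decays it linearly to zero over the remaining steps.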
|
|
|
|
|
|
|
scheduler = get_linear_schedule_with_warmup(optimizer, |
|
num_warmup_steps = warmup_steps, |
|
num_training_steps = total_steps) |
|
|
|
|
|
|
|
|
|
def format_time(elapsed): |
|
return str(datetime.timedelta(seconds=int(round((elapsed))))) |
|
|
|
output_dir = '/media/data_dump/Ritwik/ggpt/model_save/' |
|
|
|
|
|
if not os.path.exists(output_dir): |
|
os.makedirs(output_dir) |
|
|
|
total_t0 = time.time() |
|
|
|
training_stats = [] |
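
# Resume support: checkpoint_state_pretraining.txt stores 'step:<N>|epoch:<M>'.
# When present, the saved model and tokenizer are reloaded and all batches up to
# that step of that epoch are skipped on the next run.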
|
|
|
last_epoch, last_step = -1, -1 |
|
try:
    with open(out_path+'model_save/checkpoint_state_pretraining.txt', 'r') as file:
        content = [x.split(':') for x in file.read().split('|')]
except OSError:
    # No saved checkpoint state yet: start this stage from scratch.
    content = []
|
|
|
if len(content) == 2: |
|
last_epoch = int(content[1][1]) |
|
last_step = int(content[0][1]) |
|
|
|
checkpoint = torch.load(out_path+'model_save/model_checkpoint_pretraining.pth.tar') |
|
print(model.load_state_dict(checkpoint['state_dict'])) |
|
tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint_pretraining.pth.tar') |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
|
|
|
|
|
|
for epoch_i in range(0, epochs): |
|
|
|
|
|
|
|
|
|
|
|
print("") |
|
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) |
|
print('Training...') |
|
|
|
if last_epoch!=-1: |
|
if epoch_i < last_epoch: |
|
continue |
|
|
|
t0 = time.time() |
|
|
|
total_train_loss = 0 |
|
|
|
model.train() |
|
|
|
for step, batch in enumerate(train_dataloader): |
|
|
|
if last_step != -1: |
|
if step <= last_step: |
|
continue |
|
|
|
b_input_ids = batch[0].to(device) |
|
b_labels = batch[0].to(device) |
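# Labels are the input ids themselves; GPT2LMHeadModel shifts them internally
# when computing the causal language-modelling loss.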
|
b_masks = batch[1].to(device) |
|
|
|
model.zero_grad() |
|
|
|
outputs = model( b_input_ids, |
|
labels=b_labels, |
|
attention_mask = b_masks, |
|
token_type_ids=None |
|
) |
|
|
|
loss = outputs[0] |
|
|
|
batch_loss = loss.item() |
|
total_train_loss += batch_loss |
|
|
|
|
|
if step % sample_every == 0 and not step == 0: |
|
|
|
elapsed = format_time(time.time() - t0) |
|
print(' Batch {:>5,} of {:>5,}. Loss: {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed)) |
|
|
|
model.eval() |
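# Periodic sanity check: generate a short sample, starting from a randomly
# chosen token id so the printed continuations vary between checkpoints.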
|
|
|
sample_outputs = model.generate( |
|
bos_token_id=random.randint(1,30000), |
|
do_sample=True, |
|
top_k=50, |
|
max_length = 200, |
|
top_p=0.95, |
|
num_return_sequences=1 |
|
) |
|
for i, sample_output in enumerate(sample_outputs): |
|
print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True))) |
|
|
|
model.train() |
|
|
|
# Save a rolling mid-epoch checkpoint; retry once if the save hits a
# transient I/O error on the output path.
for attempt in range(2):
    try:
        torch.save({'state_dict': model.state_dict()}, out_path+'model_save/model_checkpoint_pretraining.pth.tar')
        torch.save(tokenizer, out_path+'model_save/tokenizer_checkpoint_pretraining.pth.tar')
        with open(out_path+'model_save/checkpoint_state_pretraining.txt', 'w') as file:
            file.write('step:'+str(step)+'|epoch:'+str(epoch_i))
        break
    except Exception as e:
        print('Checkpoint save failed (attempt {}): {}'.format(attempt+1, e))
|
|
|
loss.backward() |
|
|
|
optimizer.step() |
|
|
|
scheduler.step() |
|
|
|
last_epoch, last_step = -1, -1 |
|
|
|
avg_train_loss = total_train_loss / len(train_dataloader) |
|
|
|
|
|
training_time = format_time(time.time() - t0) |
|
|
|
print("") |
|
print(" Average training loss: {0:.2f}".format(avg_train_loss)) |
|
print(" Training epoch took: {:}".format(training_time)) |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
|
|
|
|
|
|
print("") |
|
print("Running Validation...") |
|
|
|
t0 = time.time() |
|
|
|
model.eval() |
|
|
|
total_eval_loss = 0 |
|
nb_eval_steps = 0 |
|
|
|
|
|
for batch in validation_dataloader: |
|
|
|
b_input_ids = batch[0].to(device) |
|
b_labels = batch[0].to(device) |
|
b_masks = batch[1].to(device) |
|
|
|
with torch.no_grad(): |
|
|
|
outputs = model(b_input_ids, |
|
|
|
attention_mask = b_masks, |
|
labels=b_labels) |
|
|
|
loss = outputs[0] |
|
|
|
batch_loss = loss.item() |
|
total_eval_loss += batch_loss |
|
|
|
avg_val_loss = total_eval_loss / len(validation_dataloader) |
|
|
|
validation_time = format_time(time.time() - t0) |
|
|
|
print(" Validation Loss: {0:.2f}".format(avg_val_loss)) |
|
print(" Validation took: {:}".format(validation_time)) |
|
|
|
|
|
training_stats.append( |
|
{ |
|
'epoch': epoch_i + 1, |
|
'Training Loss': avg_train_loss, |
|
'Valid. Loss': avg_val_loss, |
|
'Training Time': training_time, |
|
'Validation Time': validation_time |
|
} |
|
) |
|
|
|
print("") |
|
print("Training complete!") |
|
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0))) |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
try: |
|
|
|
pd.set_option('display.precision', 2)
|
|
|
|
|
df_stats = pd.DataFrame(data=training_stats) |
|
|
|
|
|
df_stats = df_stats.set_index('epoch') |
|
|
|
|
|
|
|
|
|
|
|
print(df_stats) |
|
|
|
|
|
sns.set(style='darkgrid') |
|
|
|
|
|
sns.set(font_scale=1.5) |
|
plt.rcParams["figure.figsize"] = (12,6) |
|
|
|
|
|
plt.plot(df_stats['Training Loss'], 'b-o', label="Training") |
|
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation") |
|
|
|
|
|
plt.title("Training & Validation Loss") |
|
plt.xlabel("Epoch") |
|
plt.ylabel("Loss") |
|
plt.legend() |
|
plt.xticks(range(1, epochs + 1))
|
|
|
|
|
plt.savefig(out_path+"training.png") |
|
|
|
|
|
params = list(model.named_parameters()) |
|
|
|
print('The GPT-2 model has {:} different named parameters.\n'.format(len(params))) |
|
|
|
print('==== Embedding Layer ====\n') |
|
|
|
for p in params[0:2]: |
|
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) |
|
|
|
print('\n==== First Transformer ====\n') |
|
|
|
for p in params[2:14]: |
|
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) |
|
|
|
print('\n==== Output Layer ====\n') |
|
|
|
for p in params[-2:]: |
|
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) |
|
|
|
|
|
|
|
print("Saving model to %s" % output_dir) |
|
|
|
|
|
|
|
|
|
|
|
model.save_pretrained(output_dir) |
|
tokenizer.save_pretrained(output_dir) |
|
|
|
|
|
|
|
|
|
except Exception as e: |
|
print(e) |
|
print('Waiting for 10 seconds') |
|
time.sleep(10) |
|
|
|
|
|
|
|
sfile = 'all_tc_sents_768.txt' |
|
print(sfile) |
|
file = open(sfile,'r') |
|
lines = file.readlines() |
|
file.close() |
|
lines = [[x.strip()] for x in lines] |
|
|
|
df = pd.DataFrame(lines, columns=['bio_main']) |
|
|
|
print('Dataframe created') |
|
df.dropna(inplace=True) |
|
bios = df.bio_main.copy() |
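
# Rough length statistics for the second-stage corpus: NLTK word counts only
# approximate GPT-2 BPE token counts, but indicate how many lines would be
# truncated at the 768-token limit used below.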
|
|
|
doc_lengths = [] |
|
for bio in bios: |
|
|
|
tokens = nltk.word_tokenize(bio) |
|
doc_lengths.append(len(tokens)) |
|
doc_lengths = np.array(doc_lengths) |
|
a = sns.distplot(doc_lengths)  # note: distplot is deprecated in newer seaborn; histplot/displot are the replacements
|
a.get_figure().savefig(out_path+"out.png") |
|
print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)',len(doc_lengths[doc_lengths > 768])/len(doc_lengths)) |
|
print('np.average(doc_lengths)',np.average(doc_lengths)) |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length)) |
|
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id)) |
|
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id)) |
|
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id)) |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
batch_size = 4 |
|
|
|
class GPT2Dataset(Dataset): |
|
|
|
def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768): |
|
|
|
self.tokenizer = tokenizer |
|
self.input_ids = [] |
|
self.attn_masks = [] |
|
|
|
for txt in txt_list: |
|
|
|
encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length") |
|
|
|
self.input_ids.append(torch.tensor(encodings_dict['input_ids'])) |
|
self.attn_masks.append(torch.tensor(encodings_dict['attention_mask'])) |
|
|
|
def __len__(self): |
|
return len(self.input_ids) |
|
|
|
def __getitem__(self, idx): |
|
return self.input_ids[idx], self.attn_masks[idx] |
|
|
|
dataset = GPT2Dataset(bios, tokenizer, max_length=768) |
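# Unlike the first stage, this Dataset pre-tokenizes every line in __init__, so
# epochs skip re-tokenization at the cost of holding all encodings in memory.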
|
|
|
|
|
train_size = int(0.9 * len(dataset)) |
|
val_size = len(dataset) - train_size |
|
|
|
train_dataset, val_dataset = random_split(dataset, [train_size, val_size]) |
|
|
|
print('{:>5,} training samples'.format(train_size)) |
|
print('{:>5,} validation samples'.format(val_size)) |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
|
|
train_dataloader = DataLoader( |
|
train_dataset, |
|
sampler = RandomSampler(train_dataset), |
|
batch_size = batch_size |
|
) |
|
|
|
|
|
validation_dataloader = DataLoader( |
|
val_dataset, |
|
sampler = SequentialSampler(val_dataset), |
|
batch_size = batch_size |
|
) |
|
|
|
|
|
''' |
|
# I'm not really doing anything with the config here
|
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False) |
|
|
|
# instantiate the model |
|
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration) |
|
|
|
# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings |
|
# otherwise the tokenizer and model tensors won't match up |
|
model.resize_token_embeddings(len(tokenizer)) |
|
|
|
# Tell pytorch to run this model on the GPU. |
|
device = torch.device("cuda") |
|
|
|
model = model.to(device) |
|
''' |
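
# The block above is intentionally left disabled: this stage keeps training the
# model (and tokenizer) already in memory from the first stage rather than
# re-initialising from the 'gpt2' checkpoint.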
|
|
|
print('Model already on GPU (continuing from the previous stage)')
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
epochs = 3 |
|
learning_rate = 5e-4 |
|
warmup_steps = 1e2 |
|
epsilon = 1e-8 |
|
|
|
|
|
sample_every = 1000 |
|
|
|
|
|
optimizer = AdamW(model.parameters(), |
|
lr = learning_rate, |
|
eps = epsilon |
|
) |
|
|
|
|
|
|
|
total_steps = len(train_dataloader) * epochs |
|
|
|
|
|
|
|
scheduler = get_linear_schedule_with_warmup(optimizer, |
|
num_warmup_steps = warmup_steps, |
|
num_training_steps = total_steps) |
|
|
|
|
|
|
|
|
|
def format_time(elapsed): |
|
return str(datetime.timedelta(seconds=int(round((elapsed))))) |
|
|
|
output_dir = '/media/data_dump/Ritwik/ggpt/model_save/' |
|
|
|
|
|
if not os.path.exists(output_dir): |
|
os.makedirs(output_dir) |
|
|
|
total_t0 = time.time() |
|
|
|
training_stats = [] |
|
|
|
last_epoch, last_step = -1, -1 |
|
try:
    with open(out_path+'model_save/checkpoint_state.txt', 'r') as file:
        content = [x.split(':') for x in file.read().split('|')]
except OSError:
    # No saved checkpoint state yet: start this stage from scratch.
    content = []
|
|
|
if len(content) == 2: |
|
last_epoch = int(content[1][1]) |
|
last_step = int(content[0][1]) |
|
|
|
checkpoint = torch.load(out_path+'model_save/model_checkpoint.pth.tar') |
|
print(model.load_state_dict(checkpoint['state_dict'])) |
|
tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint.pth.tar') |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
|
|
|
|
|
|
for epoch_i in range(0, epochs): |
|
|
|
|
|
|
|
|
|
|
|
print("") |
|
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) |
|
print('Training...') |
|
|
|
if last_epoch!=-1: |
|
if epoch_i < last_epoch: |
|
continue |
|
|
|
t0 = time.time() |
|
|
|
total_train_loss = 0 |
|
|
|
model.train() |
|
|
|
for step, batch in enumerate(train_dataloader): |
|
|
|
if last_step != -1: |
|
if step <= last_step: |
|
continue |
|
|
|
b_input_ids = batch[0].to(device) |
|
b_labels = batch[0].to(device) |
|
b_masks = batch[1].to(device) |
|
|
|
model.zero_grad() |
|
|
|
outputs = model( b_input_ids, |
|
labels=b_labels, |
|
attention_mask = b_masks, |
|
token_type_ids=None |
|
) |
|
|
|
loss = outputs[0] |
|
|
|
batch_loss = loss.item() |
|
total_train_loss += batch_loss |
|
|
|
|
|
if step % sample_every == 0 and not step == 0: |
|
|
|
elapsed = format_time(time.time() - t0) |
|
print(' Batch {:>5,} of {:>5,}. Loss: {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed)) |
|
|
|
model.eval() |
|
|
|
sample_outputs = model.generate( |
|
bos_token_id=random.randint(1,30000), |
|
do_sample=True, |
|
top_k=50, |
|
max_length = 200, |
|
top_p=0.95, |
|
num_return_sequences=1 |
|
) |
|
for i, sample_output in enumerate(sample_outputs): |
|
print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True))) |
|
|
|
model.train() |
|
|
|
torch.save({'state_dict': model.state_dict()}, out_path+'model_save/model_checkpoint.pth.tar') |
|
torch.save(tokenizer, out_path+'model_save/tokenizer_checkpoint.pth.tar') |
|
file = open(out_path+'model_save/checkpoint_state.txt','w') |
|
file.write('step:'+str(step)+'|epoch:'+str(epoch_i)) |
|
file.close() |
|
|
|
loss.backward() |
|
|
|
optimizer.step() |
|
|
|
scheduler.step() |
|
|
|
last_epoch, last_step = -1, -1 |
|
|
|
avg_train_loss = total_train_loss / len(train_dataloader) |
|
|
|
|
|
training_time = format_time(time.time() - t0) |
|
|
|
print("") |
|
print(" Average training loss: {0:.2f}".format(avg_train_loss)) |
|
print(" Training epoch took: {:}".format(training_time)) |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
|
|
|
|
|
|
print("") |
|
print("Running Validation...") |
|
|
|
t0 = time.time() |
|
|
|
model.eval() |
|
|
|
total_eval_loss = 0 |
|
nb_eval_steps = 0 |
|
|
|
|
|
for batch in validation_dataloader: |
|
|
|
b_input_ids = batch[0].to(device) |
|
b_labels = batch[0].to(device) |
|
b_masks = batch[1].to(device) |
|
|
|
with torch.no_grad(): |
|
|
|
outputs = model(b_input_ids, |
|
|
|
attention_mask = b_masks, |
|
labels=b_labels) |
|
|
|
loss = outputs[0] |
|
|
|
batch_loss = loss.item() |
|
total_eval_loss += batch_loss |
|
|
|
avg_val_loss = total_eval_loss / len(validation_dataloader) |
|
|
|
validation_time = format_time(time.time() - t0) |
|
|
|
print(" Validation Loss: {0:.2f}".format(avg_val_loss)) |
|
print(" Validation took: {:}".format(validation_time)) |
|
|
|
|
|
training_stats.append( |
|
{ |
|
'epoch': epoch_i + 1, |
|
'Training Loss': avg_train_loss, |
|
'Valid. Loss': avg_val_loss, |
|
'Training Time': training_time, |
|
'Validation Time': validation_time |
|
} |
|
) |
|
|
|
print("") |
|
print("Training complete!") |
|
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0))) |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
|
|
pd.set_option('display.precision', 2)
|
|
|
|
|
df_stats = pd.DataFrame(data=training_stats) |
|
|
|
|
|
df_stats = df_stats.set_index('epoch') |
|
|
|
|
|
|
|
|
|
|
|
print(df_stats) |
|
|
|
|
|
sns.set(style='darkgrid') |
|
|
|
|
|
sns.set(font_scale=1.5) |
|
plt.rcParams["figure.figsize"] = (12,6) |
|
|
|
|
|
plt.plot(df_stats['Training Loss'], 'b-o', label="Training") |
|
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation") |
|
|
|
|
|
plt.title("Training & Validation Loss") |
|
plt.xlabel("Epoch") |
|
plt.ylabel("Loss") |
|
plt.legend() |
|
plt.xticks(range(1, epochs + 1))
|
|
|
|
|
plt.savefig(out_path+"training.png") |
|
|
|
|
|
params = list(model.named_parameters()) |
|
|
|
print('The GPT-2 model has {:} different named parameters.\n'.format(len(params))) |
|
|
|
print('==== Embedding Layer ====\n') |
|
|
|
for p in params[0:2]: |
|
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) |
|
|
|
print('\n==== First Transformer ====\n') |
|
|
|
for p in params[2:14]: |
|
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) |
|
|
|
print('\n==== Output Layer ====\n') |
|
|
|
for p in params[-2:]: |
|
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) |
|
|
|
|
|
|
|
print("Saving model to %s" % output_dir) |
|
|
|
|
|
|
|
|
|
|
|
model.save_pretrained(output_dir) |
|
tokenizer.save_pretrained(output_dir) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print('Model and tokenizer loaded!') |
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|
|
model.eval() |
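
# Final qualitative check: sample three continuations of a fixed prompt using
# top-k / nucleus (top-p) sampling.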
|
|
|
prompt = "<|startoftext|> I wish to say that" |
|
|
|
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0) |
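# unsqueeze(0) adds the batch dimension that model.generate() expects.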
|
generated = generated.to(device) |
|
|
|
print(generated) |
|
|
|
sample_outputs = model.generate( |
|
generated, |
|
|
|
do_sample=True, |
|
top_k=50, |
|
max_length = 500, |
|
top_p=0.95, |
|
num_return_sequences=3 |
|
) |
|
|
|
for i, sample_output in enumerate(sample_outputs): |
|
print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True))) |
|
|
|
print(datetime.datetime.now(IST).strftime("%c")) |
|
|