#!/usr/bin/env python
# coding: utf-8
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import pandas as pd
from tqdm import tqdm
import time
from .Utilities import LanguageDataset
class Seq2Seq():
    """
    Base class for Seq2Seq (text-generation) models. It is inherited by wrappers around
    transformers such as GPT2 and FlanT5.

    Attributes:
        device (torch.device): device the model runs on (CUDA, MPS, or CPU).
        gpu (int): index of the GPU assigned to the model.
        model: underlying Hugging Face model, set by subclasses or loaded from model_path.
        model_name (str): name of the pretrained checkpoint.
        tokenizer: tokenizer matching the model, set by subclasses.
        max_length (int): maximum sequence length used for generation.
        train_loader, valid_loader (DataLoader): loaders built by load_data().
        results (pd.DataFrame): per-epoch training metrics.

    Methods:
        load_data(df, batch_size, train_ratio): split a DataFrame and build the DataLoaders.
        summary(): return the training results DataFrame.
        to_pt(path): save the model to disk.
    """
    def __init__(self, gpu=0, max_length=0, model_path=None):
        # Select the device based on available hardware
        # (torch.device('mps') never raises, so check availability explicitly)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        elif torch.backends.mps.is_available():
            self.device = torch.device('mps')  # Apple Silicon
        else:
            self.device = torch.device('cpu')
# GPU that model will run on
self.gpu = gpu
# Model specs
        if model_path:
            self.model = torch.load(model_path).to(self.device)
        else:
            self.model = None
self.model_name = ""
self.tokenizer = None
self.max_length = max_length
# Training specs
self.train_loader = None
self.valid_loader = None
        self.results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu',
                                             'training_loss', 'validation_loss', 'epoch_duration_sec'])
    def load_data(self, df, batch_size, train_ratio=0.8):
        """Split a DataFrame into train/validation sets and build the DataLoaders."""
        self.batch_size = batch_size
        dataset = LanguageDataset(df, self.tokenizer)
        train_size = int(train_ratio * len(dataset))
        valid_size = len(dataset) - train_size
        train_data, valid_data = random_split(dataset, [train_size, valid_size])
        self.max_length = dataset.max_length
        self.train_loader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True)
        self.valid_loader = DataLoader(valid_data, batch_size=self.batch_size)
""" Return training results """
def summary(self):
return self.results
""" Save model to path """
def to_pt(self, path):
torch.save(self.model, path)
class GPT2(Seq2Seq):
"""
This is the GPT2 implementation of Seq2Seq.
"""
    def __init__(self, gpu, model_name, batch_size=16):
        super().__init__(gpu, max_length=0)
        self.batch_size = batch_size  # default; overridden when load_data() is called
from transformers import GPT2Tokenizer, GPT2LMHeadModel
self.model_name = model_name
self.model = GPT2LMHeadModel.from_pretrained(self.model_name).to(self.device)
self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
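        # GPT-2's tokenizer has no pad token by default, so reuse EOS for padding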
self.tokenizer.pad_token = self.tokenizer.eos_token
    def train(self, num_epochs=3, train_ratio=0.8):
        # The model computes its own cross-entropy loss when labels are passed,
        # so no separate criterion is needed here.
        optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
        # Reset the results dataframe for this training run
        self.results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu',
                                             'training_loss', 'validation_loss', 'epoch_duration_sec'])
# The training loop
for epoch in range(num_epochs):
start_time = time.time() # Start the timer for the epoch
# Training
            ## Put the model in training ('learning') mode
self.model.train()
epoch_training_loss = 0
train_iterator = tqdm(self.train_loader,
desc=f"Training Epoch {epoch + 1}/{num_epochs} Batch Size: {self.batch_size}, Transformer: {self.model_name}")
for batch in train_iterator:
optimizer.zero_grad()
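                # Items from LanguageDataset are assumed to carry a leading batch dim of 1
                # (tokenizer called with return_tensors='pt'), hence the squeeze(1) below.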
inputs = batch['input_ids'].squeeze(1).to(self.device)
targets = inputs.clone()
outputs = self.model(input_ids=inputs, labels=targets)
loss = outputs.loss
loss.backward()
optimizer.step()
train_iterator.set_postfix({'Training Loss': loss.item()})
epoch_training_loss += loss.item()
avg_epoch_training_loss = epoch_training_loss / len(train_iterator)
# Validation
            ## Switch the model to evaluation mode (no weight updates)
self.model.eval()
epoch_validation_loss = 0
total_loss = 0
valid_iterator = tqdm(self.valid_loader, desc=f"Validation Epoch {epoch + 1}/{num_epochs}")
with torch.no_grad():
for batch in valid_iterator:
inputs = batch['input_ids'].squeeze(1).to(self.device)
targets = inputs.clone()
outputs = self.model(input_ids=inputs, labels=targets)
loss = outputs.loss
                    total_loss += loss.item()
valid_iterator.set_postfix({'Validation Loss': loss.item()})
epoch_validation_loss += loss.item()
avg_epoch_validation_loss = epoch_validation_loss / len(self.valid_loader)
end_time = time.time() # End the timer for the epoch
epoch_duration_sec = end_time - start_time # Calculate the duration in seconds
new_row = {'transformer': self.model_name,
'batch_size': self.batch_size,
'gpu': self.gpu,
'epoch': epoch + 1,
'training_loss': avg_epoch_training_loss,
'validation_loss': avg_epoch_validation_loss,
'epoch_duration_sec': epoch_duration_sec} # Add epoch_duration to the dataframe
self.results.loc[len(self.results)] = new_row
print(f"Epoch: {epoch + 1}, Validation Loss: {total_loss / len(self.valid_loader)}")
def generate_text(self, input_str, top_k=16, top_p=0.95, temperature=1.0, repetition_penalty=1.2):
# Encode string to tokens
        input_ids = self.tokenizer.encode(input_str, return_tensors='pt').to(self.device)
# Feed tokens to model and get outcome tokens
output = self.model.generate(
input_ids,
max_length=self.max_length,
num_return_sequences=1,
do_sample=True,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty
)
# Decode tokens to string
decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
return decoded_output
class FlanT5(Seq2Seq):
    """
    T5/Flan-T5 implementation of Seq2Seq, designed to support T5 models of various sizes.
    Unlike GPT2, which reuses its inputs as labels, this wrapper expects the dataset to
    provide separate 'labels' alongside each 'input_ids' batch.
    """
    def __init__(self, gpu, model_name, batch_size=16):
        super().__init__(gpu, max_length=0)
        from transformers import T5ForConditionalGeneration, T5Tokenizer
        self.batch_size = batch_size  # default; overridden when load_data() is called
        self.model_name = model_name
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)
        self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
        # T5 tokenizers already define a dedicated <pad> token, so unlike GPT-2 there is
        # no need to repurpose the EOS token for padding.
    def train(self, num_epochs=3, train_ratio=0.8):
        # The model computes its own cross-entropy loss when labels are passed,
        # so no separate criterion is needed here.
        optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
        # Reset the results dataframe for this training run
        self.results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu',
                                             'training_loss', 'validation_loss', 'epoch_duration_sec'])
# The training loop
for epoch in range(num_epochs):
start_time = time.time() # Start the timer for the epoch
# Training
            ## Put the model in training ('learning') mode
self.model.train()
epoch_training_loss = 0
train_iterator = tqdm(self.train_loader,
desc=f"Training Epoch {epoch + 1}/{num_epochs} Batch Size: {self.batch_size}, Transformer: {self.model_name}")
for batch in train_iterator:
optimizer.zero_grad()
inputs = batch['input_ids'].squeeze(1).to(self.device)
targets = batch['labels'].squeeze(1).to(self.device)
outputs = self.model(input_ids=inputs, labels=targets)
loss = outputs.loss
loss.backward()
optimizer.step()
train_iterator.set_postfix({'Training Loss': loss.item()})
epoch_training_loss += loss.item()
avg_epoch_training_loss = epoch_training_loss / len(train_iterator)
# Validation
            ## Switch the model to evaluation mode (no weight updates)
self.model.eval()
epoch_validation_loss = 0
total_loss = 0
valid_iterator = tqdm(self.valid_loader, desc=f"Validation Epoch {epoch + 1}/{num_epochs}")
with torch.no_grad():
for batch in valid_iterator:
inputs = batch['input_ids'].squeeze(1).to(self.device)
targets = batch['labels'].squeeze(1).to(self.device)
outputs = self.model(input_ids=inputs, labels=targets)
loss = outputs.loss
                    total_loss += loss.item()
valid_iterator.set_postfix({'Validation Loss': loss.item()})
epoch_validation_loss += loss.item()
avg_epoch_validation_loss = epoch_validation_loss / len(self.valid_loader)
end_time = time.time() # End the timer for the epoch
epoch_duration_sec = end_time - start_time # Calculate the duration in seconds
new_row = {'transformer': self.model_name,
'batch_size': self.batch_size,
'gpu': self.gpu,
'epoch': epoch + 1,
'training_loss': avg_epoch_training_loss,
'validation_loss': avg_epoch_validation_loss,
'epoch_duration_sec': epoch_duration_sec} # Add epoch_duration to the dataframe
self.results.loc[len(self.results)] = new_row
print(f"Epoch: {epoch + 1}, Validation Loss: {total_loss / len(self.valid_loader)}")
def generate_text(self, input_str, top_k=16, top_p=0.95, temperature=1.0, repetition_penalty=1.2):
# Encode input string into tensors via the FlanT5 tokenizer
input_ids = self.tokenizer.encode(input_str, return_tensors='pt', max_length=self.max_length, truncation=True).to(self.device)
# Run tensors through model to get output tensor values
output_ids = self.model.generate(input_ids,
max_length=self.max_length,
do_sample=True,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty)
        # Decode output tokens back into a string via the tokenizer
output_str = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return output_str
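
# -----------------------------------------------------------------------------
# Minimal usage sketch (only runs when the module is executed directly).
# Assumptions: the DataFrame schema expected by Utilities.LanguageDataset is not
# documented here, so a single 'text' column is assumed purely for illustration,
# and pretrained weights are downloaded on first use. Adjust to your data.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Hypothetical toy dataset; replace with real data in the expected schema
    sample_df = pd.DataFrame({'text': ["hello world", "the quick brown fox", "lorem ipsum"]})

    # Fine-tune a small GPT-2 and generate from it
    gpt2 = GPT2(gpu=0, model_name='gpt2')
    gpt2.load_data(sample_df, batch_size=2)
    gpt2.train(num_epochs=1)
    print(gpt2.summary())
    print(gpt2.generate_text("hello"))

    # The FlanT5 wrapper is used the same way, e.g.
    # FlanT5(gpu=0, model_name='google/flan-t5-small'), provided the dataset
    # supplies the 'labels' batches its training loop expects.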