YAML Metadata Warning:empty or missing yaml metadata in repo card
Check out the documentation for more information.
- Load English-Estonian dataset from HuggingFace
- Initialize tokenizer
- Tokenize datasets with batched processing
- DataLoader creation
- Extract a single sample from the training data
- Decode the input and target
- Print the formatted sample
- Load BLEU and ChrF evaluation metrics
- Training and evaluation function
- Compute BLEU and ChrF scores
- Load dataset
- Tokenization
- Model, optimizer, and criterion
- Train model
- Save Model State
- Define the exact same LSTMModel class for loading
- Reinitialize model with the exact same parameters
- Load saved state dict
- Set to evaluation mode for inference
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
from torch.utils.data import DataLoader

# Load English-Estonian dataset from HuggingFace
dataset = load_dataset("opus100", "en-et")

# Initialize tokenizer (the same tokenizer is used for both the English and Estonian side).
# NOTE(review): an uncased English BERT vocabulary presumably lowercases/strips accents,
# which would be lossy for Estonian text (õ, ä, ö, ü) — confirm before relying on scores.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Adjust if needed for your language model
Tokenization function with batched input handling
def tokenize_data(batch):
    """Tokenize a batch of English-Estonian translation pairs.

    Written for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping whose ``"translation"`` entry is a sequence of dicts
            with ``"en"`` and ``"et"`` strings.

    Returns:
        Dict with ``"input_ids"`` (English token ids) and ``"labels"``
        (Estonian token ids); every row is padded/truncated to 128 tokens.
    """
    # Extract English and Estonian translations
    pairs = batch["translation"]
    en_texts = [pair["en"] for pair in pairs]
    et_texts = [pair["et"] for pair in pairs]

    # Tokenize the extracted texts
    inputs = tokenizer(en_texts, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    targets = tokenizer(et_texts, truncation=True, padding="max_length", max_length=128, return_tensors="pt")

    # Deliberately no .squeeze(): for a batch holding a single example it would
    # drop the batch axis and return one 128-long row as 128 separate values.
    return {"input_ids": inputs["input_ids"], "labels": targets["input_ids"]}
Tokenize datasets with batched processing
# Each split is tokenized in batches; tokenize_data receives a dict of lists.
train_data = dataset["train"].map(tokenize_data, batched=True)
val_data = dataset["validation"].map(tokenize_data, batched=True)
test_data = dataset["test"].map(tokenize_data, batched=True)
DataLoader creation
def create_dataloader(data, batch_size=32, shuffle=True):
    """Wrap a dataset in a PyTorch ``DataLoader``.

    Args:
        data: Object exposing ``with_format("torch")`` (e.g. a HuggingFace
            ``Dataset``); the formatted result is what the loader iterates.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data. Defaults to ``True``,
            which was previously hard-coded; pass ``False`` for a
            deterministic order (e.g. validation/test evaluation).

    Returns:
        A ``DataLoader`` over the torch-formatted data.
    """
    return DataLoader(data.with_format("torch"), batch_size=batch_size, shuffle=shuffle)
# One loader per split, all with the same batch size.
train_dataloader = create_dataloader(train_data, batch_size=32)
val_dataloader = create_dataloader(val_data, batch_size=32)
test_dataloader = create_dataloader(test_data, batch_size=32)
# Extract a single sample from the training data
sample = train_data[0]  # Get the first example from train_data

# Decode the input and target back to text (special tokens such as padding are dropped)
input_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
label_text = tokenizer.decode(sample["labels"], skip_special_tokens=True)

# Print the formatted sample, first in the EN -> ET direction, then reversed
print("EN to ET\n")
print("")
print(f" {input_text} ")
print(f" {label_text} ")
print("\n")
print("ET to EN\n")
print("")
print(f" {label_text} ")
print(f" {input_text} ")
print("")
import torch.nn as nn
class LSTMTranslator(nn.Module):
    """Position-wise LSTM translator: embedding -> stacked LSTM -> linear.

    Every source position is mapped to a score vector over the output
    vocabulary, so the output sequence has the same length as the input.

    Args:
        input_dim: Size of the source vocabulary (number of embeddings).
        hidden_dim: Width of both the embeddings and the LSTM hidden state.
        output_dim: Size of the target vocabulary (width of the output layer).
        n_layers: Number of stacked LSTM layers.
    """

    # The constructor must be named __init__ (and call nn.Module.__init__);
    # a plain ``init`` method is never invoked on construction, leaving the
    # module without its sub-layers.
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers=2):  # Default 2 layers
        super().__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, src):
        """Return per-position output scores for ``src``.

        Args:
            src: Integer token ids; assumed shape (batch, seq_len) since the
                LSTM is built with ``batch_first=True``.

        Returns:
            Tensor whose last dimension is ``output_dim``.
        """
        embedded = self.embedding(src)
        outputs, _ = self.lstm(embedded)  # final hidden/cell state is not needed
        predictions = self.fc(outputs)
        return predictions
import time
import math

import torch
import pandas as pd
import evaluate
from torch import nn, optim
from torch.utils.data import DataLoader

# Load BLEU and ChrF evaluation metrics
bleu_metric = evaluate.load("bleu")
chrf_metric = evaluate.load("chrf")
class LSTMModel(nn.Module):
    """Single-layer LSTM that scores an output token for every input position.

    Args:
        input_dim: Size of the input vocabulary (number of embeddings).
        embed_dim: Embedding width fed into the LSTM.
        hidden_dim: LSTM hidden-state width.
        output_dim: Size of the output vocabulary (width of the final layer).
    """

    # Must be __init__ so that constructing the model actually registers the
    # layers; a method called ``init`` is never run by ``LSTMModel(...)``.
    def __init__(self, input_dim, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return per-position output scores for the token ids in ``x``.

        ``x`` is assumed to be (batch, seq_len) because the LSTM uses
        ``batch_first=True``; the result's last dimension is ``output_dim``.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)  # discard the final (h, c) state
        output = self.fc(lstm_out)
        return output
Training and evaluation function
def _batch_loss(model, batch, criterion, device):
    """Run ``model`` on one batch and return the flattened token-level loss."""
    src = batch["input_ids"].to(device)
    tgt = batch["labels"].to(device)

    # Forward pass
    outputs = model(src)
    tgt = tgt[:, :outputs.size(1)]  # Align target length with output length

    # The criterion gets (tokens, vocab) scores against (tokens,) targets.
    return criterion(outputs.reshape(-1, outputs.size(-1)), tgt.reshape(-1))


def train_lstm(model, train_dataloader, val_dataloader, criterion, optimizer, test_dataloader, tokenizer, device, n_epochs=5):
    """Train ``model`` and evaluate it after every epoch.

    Each epoch runs one pass over ``train_dataloader``, computes the mean
    validation loss, and scores BLEU/ChrF on ``test_dataloader`` via
    ``compute_bleu_chrf_scores``. After the last epoch the per-epoch scores
    are written to ``lstm_bleu_scores.csv`` and ``lstm_chrf_scores.csv``.

    Args:
        model: Module mapping ``input_ids`` to per-position scores.
        train_dataloader: Batches with ``"input_ids"`` and ``"labels"`` tensors.
        val_dataloader: Batches of the same form, used for validation loss.
        criterion: Loss taking (scores, targets).
        optimizer: Optimizer over ``model``'s parameters.
        test_dataloader: Batches used for the BLEU/ChrF evaluation.
        tokenizer: Tokenizer used to decode ids for the metrics.
        device: Device the model and batches are moved to.
        n_epochs: Number of training epochs.

    Returns:
        Tuple ``(train_losses, val_losses)`` with one mean loss per epoch.

    Raises:
        ZeroDivisionError: If a training or validation loader has length 0
            while ``n_epochs`` is positive (the mean divides by its length).
    """
    model.to(device)
    train_losses, val_losses = [], []
    bleu_scores, chrf_scores = [], []

    for epoch in range(n_epochs):
        start_time = time.time()

        # Training phase
        model.train()
        train_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            loss = _batch_loss(model, batch, criterion, device)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_dataloader)
        train_losses.append(train_loss)

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                val_loss += _batch_loss(model, batch, criterion, device).item()
        val_loss /= len(val_dataloader)
        val_losses.append(val_loss)

        # Compute BLEU and ChrF scores on test set
        bleu_score, chrf_score = compute_bleu_chrf_scores(model, test_dataloader, tokenizer, device)
        bleu_scores.append({"Epoch": epoch + 1, "BLEU Score": bleu_score})
        chrf_scores.append({"Epoch": epoch + 1, "ChrF Score": chrf_score})

        # The reported time covers training, validation and metric evaluation.
        epoch_time = time.time() - start_time
        print(f"Epoch {epoch+1}/{n_epochs} completed in {epoch_time:.2f}s")
        print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        print(f"BLEU Score: {bleu_score:.4f}, ChrF Score: {chrf_score:.4f}")

    # Save BLEU and ChrF scores to CSV
    bleu_df = pd.DataFrame(bleu_scores)
    bleu_df.to_csv("lstm_bleu_scores.csv", index=False)
    chrf_df = pd.DataFrame(chrf_scores)
    chrf_df.to_csv("lstm_chrf_scores.csv", index=False)
    print("Training completed. BLEU and ChrF scores saved to CSV files.")

    return train_losses, val_losses
Compute BLEU and ChrF scores
def compute_bleu_chrf_scores(model, dataloader, tokenizer, device):
    """Score greedy per-position predictions of ``model`` with BLEU and ChrF.

    The model is put in eval mode, every batch is decoded by taking the
    argmax over the last dimension, and the decoded strings are compared with
    the decoded ``labels`` using the module-level ``bleu_metric`` and
    ``chrf_metric``.

    Args:
        model: Module mapping ``input_ids`` to per-position scores.
        dataloader: Batches with ``"input_ids"`` and ``"labels"`` tensors.
        tokenizer: Tokenizer providing ``batch_decode``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(bleu_score, chrf_score)``; ``(0.0, 0.0)`` when nothing was
        collected (e.g. an empty dataloader).
    """
    model.eval()
    predictions, references = [], []

    with torch.no_grad():
        for batch in dataloader:
            src = batch["input_ids"].to(device)
            tgt = batch["labels"].to(device)

            outputs = model(src)
            predicted_ids = outputs.argmax(dim=-1)

            predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
            target_texts = tokenizer.batch_decode(tgt, skip_special_tokens=True)

            # Extend predictions and references only if valid
            if predicted_texts and target_texts:
                predictions.extend(predicted_texts)
                # One reference per prediction, each wrapped in its own list.
                references.extend([[text] for text in target_texts])

    # Handle cases where predictions or references are empty
    if not predictions or not references:
        print("Warning: Empty predictions or references. Returning BLEU and ChrF scores as 0.")
        return 0.0, 0.0

    bleu_score = bleu_metric.compute(predictions=predictions, references=references)['bleu']
    chrf_score = chrf_metric.compute(predictions=predictions, references=references)['score']
    return bleu_score, chrf_score
Load dataset
from datasets import load_dataset
from transformers import AutoTokenizer

# English-Estonian corpus and the tokenizer used for both languages.
dataset = load_dataset("opus100", "en-et")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
Tokenization
def tokenize_data(batch):
    """Tokenize a batch of English-Estonian translation pairs.

    Written for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping whose ``"translation"`` entry is a sequence of dicts
            with ``"en"`` and ``"et"`` strings.

    Returns:
        Dict with ``"input_ids"`` (English token ids) and ``"labels"``
        (Estonian token ids); every row is padded/truncated to 128 tokens.
    """
    pairs = batch["translation"]
    en_texts = [pair["en"] for pair in pairs]
    et_texts = [pair["et"] for pair in pairs]
    inputs = tokenizer(en_texts, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    targets = tokenizer(et_texts, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    return {"input_ids": inputs["input_ids"], "labels": targets["input_ids"]}
# Each split is tokenized in batches; tokenize_data receives a dict of lists.
train_data = dataset["train"].map(tokenize_data, batched=True)
val_data = dataset["validation"].map(tokenize_data, batched=True)
test_data = dataset["test"].map(tokenize_data, batched=True)
def create_dataloader(data, batch_size=32, shuffle=True):
    """Wrap a dataset in a PyTorch ``DataLoader``.

    Args:
        data: Object exposing ``with_format("torch")`` (e.g. a HuggingFace
            ``Dataset``); the formatted result is what the loader iterates.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data. Defaults to ``True``,
            which was previously hard-coded; pass ``False`` for a
            deterministic order (e.g. validation/test evaluation).

    Returns:
        A ``DataLoader`` over the torch-formatted data.
    """
    return DataLoader(data.with_format("torch"), batch_size=batch_size, shuffle=shuffle)
# One loader per split, using create_dataloader's default batch size.
train_dataloader = create_dataloader(train_data)
val_dataloader = create_dataloader(val_data)
test_dataloader = create_dataloader(test_data)
# Model, optimizer, and criterion
# Source and target share one tokenizer, so both vocabularies have the same size.
input_dim = tokenizer.vocab_size
embed_dim = 128
hidden_dim = 256
output_dim = tokenizer.vocab_size

lstm_model = LSTMModel(input_dim, embed_dim, hidden_dim, output_dim)
# Padding positions in the labels do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Train model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_losses, val_losses = train_lstm(
    lstm_model,
    train_dataloader,
    val_dataloader,
    criterion,
    optimizer,
    test_dataloader,
    tokenizer,
    device,
    n_epochs=5,
)

# Save Model State
torch.save(lstm_model.state_dict(), "lstm_model_state_dict.pth")
Define the exact same LSTMModel class for loading
class LSTMModel(nn.Module):
    """Single-layer LSTM that scores an output token for every input position.

    The layer names (``embedding``, ``lstm``, ``fc``) and sizes determine the
    state-dict keys and shapes, so they must match the model that was saved.

    Args:
        input_dim: Size of the input vocabulary (number of embeddings).
        embed_dim: Embedding width fed into the LSTM.
        hidden_dim: LSTM hidden-state width.
        output_dim: Size of the output vocabulary (width of the final layer).
    """

    # Must be __init__ so that constructing the model actually registers the
    # layers; a method called ``init`` is never run by ``LSTMModel(...)``.
    def __init__(self, input_dim, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return per-position output scores for the token ids in ``x``.

        ``x`` is assumed to be (batch, seq_len) because the LSTM uses
        ``batch_first=True``; the result's last dimension is ``output_dim``.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)  # discard the final (h, c) state
        output = self.fc(lstm_out)
        return output
# Reinitialize model with the exact same parameters
loaded_model = LSTMModel(input_dim, embed_dim, hidden_dim, output_dim)

# Load saved state dict
# map_location="cpu" lets a checkpoint saved from a CUDA model be restored on a
# machine without a GPU; the freshly built model lives on the CPU anyway.
loaded_model.load_state_dict(torch.load("lstm_model_state_dict.pth", map_location="cpu"))

# Set to evaluation mode for inference
loaded_model.eval()
print("Model successfully loaded and set to evaluation mode.")