YAML Metadata Warning: empty or missing YAML metadata in repo card
Check out the documentation for more information.
- Load English-Estonian dataset from HuggingFace
- Initialize tokenizer
- Tokenization function with batched input handling
- Tokenize datasets with batched processing
- DataLoader creation
- Load your dataset
- Define a function to format a sample data point
- Extract a sample data point
- Show the sample data in the specified format
- Load BLEU and ChrF evaluation metrics
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
# Load the English-Estonian parallel corpus from the HuggingFace Hub.
dataset = load_dataset(path="opus100", name="en-et")

# Initialize the tokenizer shared by the source and target sides.
# NOTE(review): "bert-base-uncased" is an English, lower-casing checkpoint;
# it is presumably lossy for Estonian text (case, diacritics) -- confirm
# whether a multilingual cased checkpoint should be used instead.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Tokenization function with batched input handling
def tokenize_data(batch):
    """Tokenize one batch of translation pairs for ``Dataset.map(batched=True)``.

    Args:
        batch: Mapping whose ``"translation"`` entry is a sequence of dicts,
            each holding an ``"en"`` and an ``"et"`` string.

    Returns:
        A dict with ``"input_ids"`` (English token ids) and ``"labels"``
        (Estonian token ids). Each value is a list with one fixed-length
        (128) id list per example in the batch.
    """
    pairs = batch["translation"]
    en_texts = [pair["en"] for pair in pairs]
    et_texts = [pair["et"] for pair in pairs]

    # Plain Python lists are returned on purpose: the previous
    # ``return_tensors="pt"`` + ``.squeeze()`` dropped the batch axis whenever
    # a batch held exactly one example, handing ``map`` a flat list of ids
    # instead of one row.
    inputs = tokenizer(en_texts, truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(et_texts, truncation=True, padding="max_length", max_length=128)

    return {"input_ids": inputs["input_ids"], "labels": targets["input_ids"]}
# Tokenize every split with batched processing (same function, same order:
# train, validation, test).
train_data, val_data, test_data = (
    dataset[split_name].map(tokenize_data, batched=True)
    for split_name in ("train", "validation", "test")
)
# DataLoader creation
def create_dataloader(data, batch_size=32, shuffle=True):
    """Wrap a tokenized dataset in a PyTorch ``DataLoader``.

    Args:
        data: Object exposing ``with_format("torch")``; the formatted result
            is what the loader iterates over.
        batch_size: Number of examples per batch.
        shuffle: Whether to reshuffle the data every epoch. Defaults to
            ``True`` (the previous hard-coded behaviour); pass ``False`` for
            validation/test loaders when a stable order is wanted.

    Returns:
        A ``torch.utils.data.DataLoader`` over the torch-formatted data.
    """
    return DataLoader(data.with_format("torch"), batch_size=batch_size, shuffle=shuffle)
# One loader per split, built in train / validation / test order.
train_dataloader, val_dataloader, test_dataloader = (
    create_dataloader(split_data, batch_size=32)
    for split_data in (train_data, val_data, test_data)
)

from datasets import load_dataset

# Load your dataset
dataset = load_dataset("opus100", "en-et")
# Define a function to format a sample data point
def show_sample_data(sample):
    """Print one translation pair in both directions, wrapped in tags.

    Args:
        sample: Mapping whose ``"translation"`` entry holds the ``"en"``
            (English) and ``"et"`` (Estonian) strings.

    The EN->ET section is printed first, followed by the ET->EN section;
    each section is enclosed in ``<s>`` ... ``</s>``.
    """
    translation = sample["translation"]
    en_text = translation["en"]
    et_text = translation["et"]

    en_line = f"<en> {en_text} </en>"
    et_line = f"<et> {et_text} </et>"

    # (section title, first tagged line, second tagged line)
    sections = (
        ("EN to ET", en_line, et_line),
        ("\nET to EN", et_line, en_line),
    )
    for title, first, second in sections:
        for line in (title, "<s>", first, second, "</s>"):
            print(line)
# Extract a sample data point: the first row of the training split.
sample_data = dataset["train"][0]

# Show the sample data in the specified format
show_sample_data(sample_data)

# Needed by the model definition that follows.
import torch.nn as nn
class Seq2SeqModel(nn.Module):
    """LSTM encoder-decoder that maps source token ids to target-vocabulary logits.

    A single embedding table is used for both the source and the target
    tokens, so target ids must also be valid indices below ``input_dim``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers=2):
        """Build the embedding, encoder, decoder and output projection.

        Args:
            input_dim: Number of rows in the embedding table (vocabulary size).
            hidden_dim: Embedding width and LSTM hidden size.
            output_dim: Size of the output projection (target vocabulary size).
            n_layers: Number of stacked LSTM layers in encoder and decoder.
        """
        # The constructor must be ``__init__`` and must initialise nn.Module
        # before any sub-module is assigned; the previous ``init`` spelling
        # meant it was never run on instantiation.
        super().__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.encoder = nn.LSTM(hidden_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, tgt):
        """Encode ``src`` and decode ``tgt`` conditioned on the encoder state.

        Args:
            src: Integer tensor of source token ids, batch-first.
            tgt: Integer tensor of decoder-input token ids, batch-first.

        Returns:
            Logits with one ``output_dim``-sized vector per position of ``tgt``.
        """
        embedded_src = self.embedding(src)
        # Only the final (hidden, cell) state is kept; it seeds the decoder.
        _, (hidden, cell) = self.encoder(embedded_src)
        embedded_tgt = self.embedding(tgt)
        output, _ = self.decoder(embedded_tgt, (hidden, cell))
        return self.fc_out(output)
import time

import evaluate
import pandas as pd

# The tokenizer vocabulary sizes both the embedding table and the output layer.
input_dim = output_dim = tokenizer.vocab_size
hidden_dim = 256

seq2seq_model = Seq2SeqModel(input_dim, hidden_dim, output_dim)

# Load BLEU and ChrF evaluation metrics
bleu_metric = evaluate.load("bleu")
chrf_metric = evaluate.load("chrf")
def _teacher_forced_loss(model, batch, criterion, device):
    """Return the criterion loss for one batch using shifted teacher forcing."""
    src = batch["input_ids"].to(device)
    tgt = batch["labels"].to(device)

    # The decoder sees tokens [0, T-1) and is scored against tokens [1, T),
    # i.e. every position must predict the *next* token. Feeding and scoring
    # the same unshifted sequence lets the model simply copy its input.
    logits = model(src, tgt[:, :-1])
    expected = tgt[:, 1:]
    # Keep the target no longer than what the model actually produced.
    expected = expected[:, :logits.size(1)]

    return criterion(logits.reshape(-1, logits.shape[-1]), expected.reshape(-1))


def train_seq2seq(model, train_dataloader, val_dataloader, criterion, optimizer,
                  test_dataloader, tokenizer, device, n_epochs=10):
    """Train ``model`` and record losses plus per-epoch BLEU / ChrF scores.

    Args:
        model: Module called as ``model(src, tgt)`` that returns per-position
            logits.
        train_dataloader: Iterable of batches with ``"input_ids"`` and
            ``"labels"`` tensors, used for optimisation.
        val_dataloader: Same batch layout, used for the validation loss.
        criterion: Loss called as ``criterion(flat_logits, flat_targets)``.
        optimizer: Optimizer stepping the parameters of ``model``.
        test_dataloader: Batches handed to ``compute_bleu_chrf_scores`` after
            every epoch.
        tokenizer: Tokenizer forwarded to ``compute_bleu_chrf_scores``.
        device: Device the model and batches are moved to.
        n_epochs: Number of passes over ``train_dataloader``.

    Returns:
        ``(train_losses, val_losses)``: the average loss per epoch for each.

    Side effects:
        Prints progress and writes ``seq2seq_bleu_scores.csv`` and
        ``seq2seq_chrf_scores.csv`` to the working directory.
    """
    model = model.to(device)
    train_losses, val_losses = [], []
    bleu_scores, chrf_scores = [], []

    for epoch in range(n_epochs):
        start_time = time.time()

        model.train()
        epoch_train_loss = 0.0
        for batch in train_dataloader:
            optimizer.zero_grad()
            loss = _teacher_forced_loss(model, batch, criterion, device)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_train_loss = epoch_train_loss / max(len(train_dataloader), 1)
        train_losses.append(avg_train_loss)

        # Validation loop
        model.eval()
        epoch_val_loss = 0.0
        with torch.no_grad():
            for batch in val_dataloader:
                loss = _teacher_forced_loss(model, batch, criterion, device)
                epoch_val_loss += loss.item()
        avg_val_loss = epoch_val_loss / max(len(val_dataloader), 1)
        val_losses.append(avg_val_loss)

        epoch_time = time.time() - start_time
        print(f"Epoch {epoch+1}/{n_epochs} completed in {epoch_time:.2f}s")
        print(f"Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

        # Compute BLEU and ChrF scores
        bleu_score, chrf_score = compute_bleu_chrf_scores(model, test_dataloader, tokenizer, device)
        bleu_scores.append({"Epoch": epoch + 1, "BLEU Score": bleu_score})
        chrf_scores.append({"Epoch": epoch + 1, "ChrF Score": chrf_score})
        print(f"Epoch {epoch+1} BLEU Score: {bleu_score:.4f}, ChrF Score: {chrf_score:.4f}")

    # Save BLEU and ChrF scores to CSV files
    bleu_df = pd.DataFrame(bleu_scores)
    bleu_df.to_csv("seq2seq_bleu_scores.csv", index=False)
    chrf_df = pd.DataFrame(chrf_scores)
    chrf_df.to_csv("seq2seq_chrf_scores.csv", index=False)
    print("All BLEU and ChrF scores have been saved in seq2seq_bleu_scores.csv and seq2seq_chrf_scores.csv")

    return train_losses, val_losses
def _greedy_decode(model, src, start_ids, max_len, end_id=None):
    """Generate up to ``max_len`` token ids per row by repeated argmax.

    The model is re-run on the growing prefix at every step, using only its
    ``model(src, prefix)`` call, so no access to its internals is required.
    Rows that have emitted ``end_id`` keep emitting ``end_id``.
    """
    generated = start_ids
    finished = torch.zeros(src.size(0), dtype=torch.bool, device=src.device)

    for _ in range(max_len - 1):
        logits = model(src, generated)
        next_ids = logits[:, -1].argmax(dim=-1)
        if end_id is not None:
            # Freeze rows that are already done so that nothing decodable
            # is appended after their end marker.
            next_ids = torch.where(finished, torch.full_like(next_ids, end_id), next_ids)
            finished = finished | (next_ids == end_id)
        generated = torch.cat([generated, next_ids.unsqueeze(1)], dim=1)
        if end_id is not None and bool(finished.all()):
            break

    return generated


def compute_bleu_chrf_scores(model, dataloader, tokenizer, device):
    """Score greedy translations of ``dataloader`` with BLEU and ChrF.

    Args:
        model: Module called as ``model(src, tgt_prefix)`` that returns
            per-position logits.
        dataloader: Iterable of batches with ``"input_ids"`` and ``"labels"``
            tensors.
        tokenizer: Object providing ``batch_decode``; its ``sep_token_id`` or
            ``eos_token_id`` (when present) is used as the end marker.
        device: Device the batches are moved to.

    Returns:
        ``(bleu_score, chrf_score)``; both are ``0.0`` when the loader yields
        no examples, and BLEU is ``0.0`` when every prediction is blank.
    """
    model.eval()

    end_id = getattr(tokenizer, "sep_token_id", None)
    if end_id is None:
        end_id = getattr(tokenizer, "eos_token_id", None)

    predictions, references = [], []
    with torch.no_grad():
        for batch in dataloader:
            src = batch["input_ids"].to(device)
            tgt = batch["labels"].to(device)

            # Decode from the first label token only. Passing the full
            # reference to the decoder (as before) leaks the answer into the
            # prediction and inflates both metrics.
            # NOTE(review): assumes the first label token is a start marker
            # (e.g. [CLS]) -- confirm for the tokenizer in use.
            predicted_ids = _greedy_decode(model, src, tgt[:, :1], tgt.size(1), end_id)

            predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
            target_texts = tokenizer.batch_decode(tgt, skip_special_tokens=True)
            predictions.extend(predicted_texts)
            references.extend([[text] for text in target_texts])

    if not predictions:
        return 0.0, 0.0

    # BLEU divides by the total hypothesis length; skip it when every
    # prediction is blank instead of failing with ZeroDivisionError.
    if any(text.strip() for text in predictions):
        bleu_score = bleu_metric.compute(predictions=predictions, references=references)['bleu']
    else:
        bleu_score = 0.0
    chrf_score = chrf_metric.compute(predictions=predictions, references=references)['score']
    return bleu_score, chrf_score