YAML Metadata Warning: empty or missing YAML metadata in repo card

Check out the documentation for more information.

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Load the English-Estonian dataset from Hugging Face
dataset = load_dataset("opus100", "en-et")

# Initialize the tokenizer (adjust the checkpoint if needed for your language model).
# NOTE(review): "bert-base-uncased" is an English, lower-casing checkpoint; it
# presumably normalizes away Estonian diacritics (õ, ä, ö, ü) — confirm this is
# acceptable for the Estonian side before relying on decoded text.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function with batched input handling

def tokenize_data(batch, max_length=128):
    """Tokenize one batch of English-Estonian translation pairs.

    Written for ``Dataset.map(..., batched=True)``: ``batch["translation"]`` is
    iterated as a sequence of dicts, each holding the English text under
    ``"en"`` and the Estonian text under ``"et"``.

    Args:
        batch: Mapping with a ``"translation"`` entry as described above.
        max_length: Length every sequence is truncated/padded to. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        Dict with ``"input_ids"`` (tokenized English) and ``"labels"``
        (tokenized Estonian), each a tensor with one row of ``max_length``
        token ids per example.
    """
    pairs = batch["translation"]
    en_texts = [pair["en"] for pair in pairs]
    et_texts = [pair["et"] for pair in pairs]

    # Tokenize the extracted texts
    inputs = tokenizer(
        en_texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    targets = tokenizer(
        et_texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

    # Deliberately no .squeeze(): with a single-example batch it would drop the
    # batch dimension and return a 1-D tensor instead of one row per example.
    return {"input_ids": inputs["input_ids"], "labels": targets["input_ids"]}

# Tokenize datasets with batched processing
train_data = dataset["train"].map(tokenize_data, batched=True)
val_data = dataset["validation"].map(tokenize_data, batched=True)
test_data = dataset["test"].map(tokenize_data, batched=True)

# DataLoader creation

def create_dataloader(data, batch_size=32, shuffle=True):
    """Wrap a dataset in a PyTorch ``DataLoader``.

    Args:
        data: Dataset object exposing ``with_format("torch")``; the formatted
            view is what the loader iterates over.
        batch_size: Number of examples per batch.
        shuffle: Whether to reshuffle the data every epoch. Defaults to
            ``True`` (the previous hard-coded behavior); pass ``False`` for
            validation/test loaders where a stable order is preferable.

    Returns:
        A ``DataLoader`` over the torch-formatted dataset.
    """
    return DataLoader(data.with_format("torch"), batch_size=batch_size, shuffle=shuffle)

train_dataloader = create_dataloader(train_data, batch_size=32)
val_dataloader = create_dataloader(val_data, batch_size=32)
test_dataloader = create_dataloader(test_data, batch_size=32)

from datasets import load_dataset

# Load your dataset
dataset = load_dataset("opus100", "en-et")

# Define a function to format a sample data point

def show_sample_data(sample):
    """Print one translation pair in the tagged format, in both directions.

    The pair is printed first as English-to-Estonian and then as
    Estonian-to-English, each wrapped in ``<s>`` ... ``</s>`` with the texts
    enclosed in ``<en>`` / ``<et>`` tags.

    Args:
        sample: Mapping whose ``sample["translation"]`` holds the English text
            under ``"en"`` and the Estonian text under ``"et"``.
    """
    en_text = sample["translation"]["en"]
    et_text = sample["translation"]["et"]

    # English -> Estonian
    print("EN to ET")
    print("<s>")
    print(f"<en> {en_text} </en>")
    print(f"<et> {et_text} </et>")
    print("</s>")

    # Estonian -> English (the leading newline separates the two directions)
    print("\nET to EN")
    print("<s>")
    print(f"<et> {et_text} </et>")
    print(f"<en> {en_text} </en>")
    print("</s>")

# Extract a sample data point
sample_data = dataset["train"][0]  # Get the first sample from the training set

# Show the sample data in the specified format
show_sample_data(sample_data)

import torch.nn as nn

class Seq2SeqModel(nn.Module):
    """LSTM encoder-decoder that shares one embedding table for source and target.

    The encoder consumes the embedded source sequence; its final hidden and
    cell states initialize the decoder, which consumes the embedded target
    sequence. A linear layer projects every decoder step to vocabulary logits.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        hidden_dim: Embedding size, also used as the LSTM hidden size.
        output_dim: Size of the output projection (number of logits per step).
        n_layers: Number of stacked LSTM layers in both encoder and decoder.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers=2):
        # Must be the real constructor (__init__) so nn.Module registers the
        # sub-modules below; a method merely named "init" would never be run.
        super().__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.encoder = nn.LSTM(hidden_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, tgt):
        """Return per-step logits for ``tgt`` conditioned on ``src``.

        Args:
            src: Source token ids, batch-first (both LSTMs use ``batch_first=True``).
            tgt: Target token ids fed to the decoder, batch-first. Ids are looked
                up in the same embedding table as ``src``.

        Returns:
            Tensor of logits with one ``output_dim``-sized vector per target step.
        """
        embedded_src = self.embedding(src)
        # Only the final (hidden, cell) state is passed on to the decoder.
        _, (hidden, cell) = self.encoder(embedded_src)

        embedded_tgt = self.embedding(tgt)
        output, _ = self.decoder(embedded_tgt, (hidden, cell))

        return self.fc_out(output)

input_dim = tokenizer.vocab_size
output_dim = tokenizer.vocab_size
hidden_dim = 256

seq2seq_model = Seq2SeqModel(input_dim, hidden_dim, output_dim)

import time

import pandas as pd
import evaluate

# Load BLEU and ChrF evaluation metrics
bleu_metric = evaluate.load("bleu")
chrf_metric = evaluate.load("chrf")

def _teacher_forced_loss(model, batch, criterion, device):
    """Return the loss for one batch, scoring each step against the next token."""
    src = batch["input_ids"].to(device)
    tgt = batch["labels"].to(device)

    # The decoder reads tgt[:, :-1] and is scored against tgt[:, 1:]. Feeding the
    # full target and scoring against the very same positions would let the
    # model minimize the loss by copying its own input.
    output = model(src, tgt[:, :-1])
    expected = tgt[:, 1:1 + output.size(1)]

    return criterion(output.reshape(-1, output.shape[-1]), expected.reshape(-1))


def train_seq2seq(model, train_dataloader, val_dataloader, criterion, optimizer, test_dataloader, tokenizer, device, n_epochs=10):
    """Train a sequence-to-sequence model and log BLEU/ChrF after every epoch.

    Each epoch runs one optimization pass over ``train_dataloader``, one
    gradient-free pass over ``val_dataloader``, and then scores
    ``test_dataloader`` with ``compute_bleu_chrf_scores``. After the final
    epoch the per-epoch scores are written to ``seq2seq_bleu_scores.csv`` and
    ``seq2seq_chrf_scores.csv`` in the current working directory.

    Args:
        model: Module called as ``model(src, tgt)`` that returns per-step logits.
        train_dataloader: Iterable of batches with "input_ids" and "labels".
        val_dataloader: Same batch layout, used for the validation loss.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``. Padding
            positions are not masked here; configure that on the criterion if
            they should be ignored.
        optimizer: Optimizer over ``model``'s parameters.
        test_dataloader: Batches scored with BLEU and ChrF each epoch.
        tokenizer: Tokenizer forwarded to ``compute_bleu_chrf_scores``.
        device: Device the model and every batch are moved to.
        n_epochs: Number of epochs to run.

    Returns:
        Tuple ``(train_losses, val_losses)`` with one average loss per epoch.
    """
    model = model.to(device)
    train_losses, val_losses = [], []
    bleu_scores, chrf_scores = [], []

    for epoch in range(n_epochs):
        model.train()
        epoch_train_loss = 0
        start_time = time.time()

        for batch in train_dataloader:
            optimizer.zero_grad()
            loss = _teacher_forced_loss(model, batch, criterion, device)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()

        avg_train_loss = epoch_train_loss / len(train_dataloader)
        train_losses.append(avg_train_loss)

        # Validation loop
        model.eval()
        epoch_val_loss = 0
        with torch.no_grad():
            for batch in val_dataloader:
                loss = _teacher_forced_loss(model, batch, criterion, device)
                epoch_val_loss += loss.item()

        avg_val_loss = epoch_val_loss / len(val_dataloader)
        val_losses.append(avg_val_loss)

        epoch_time = time.time() - start_time
        print(f"Epoch {epoch+1}/{n_epochs} completed in {epoch_time:.2f}s")
        print(f"Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

        # Compute BLEU and ChrF scores
        bleu_score, chrf_score = compute_bleu_chrf_scores(model, test_dataloader, tokenizer, device)
        bleu_scores.append({"Epoch": epoch + 1, "BLEU Score": bleu_score})
        chrf_scores.append({"Epoch": epoch + 1, "ChrF Score": chrf_score})
        print(f"Epoch {epoch+1} BLEU Score: {bleu_score:.4f}, ChrF Score: {chrf_score:.4f}")

    # Save BLEU and ChrF scores to CSV files
    bleu_df = pd.DataFrame(bleu_scores)
    bleu_df.to_csv("seq2seq_bleu_scores.csv", index=False)
    chrf_df = pd.DataFrame(chrf_scores)
    chrf_df.to_csv("seq2seq_chrf_scores.csv", index=False)

    print("All BLEU and ChrF scores have been saved in seq2seq_bleu_scores.csv and seq2seq_chrf_scores.csv")

    return train_losses, val_losses

def compute_bleu_chrf_scores(model, dataloader, tokenizer, device):
    """Score the model's teacher-forced predictions with BLEU and ChrF.

    For every batch the model is run on the source and the reference tokens,
    the arg-max token at each step is decoded to text, and the decoded
    predictions are compared with the decoded references using the
    module-level ``bleu_metric`` and ``chrf_metric``.

    Note that the decoder is conditioned on the reference (teacher forcing),
    so these scores are not the result of free-running generation.

    Args:
        model: Module called as ``model(src, tgt)`` that returns per-step logits.
        dataloader: Iterable of batches with "input_ids" and "labels".
        tokenizer: Tokenizer providing ``batch_decode``.
        device: Device every batch is moved to.

    Returns:
        Tuple ``(bleu_score, chrf_score)``.
    """
    model.eval()
    predictions, references = [], []

    with torch.no_grad():
        for batch in dataloader:
            src = batch["input_ids"].to(device)
            tgt = batch["labels"].to(device)

            # Feed tokens up to step t and compare against the token at t+1, so
            # the token being predicted is never the decoder's input at that step.
            outputs = model(src, tgt[:, :-1])
            predicted_ids = outputs.argmax(dim=-1)

            predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
            target_texts = tokenizer.batch_decode(tgt[:, 1:], skip_special_tokens=True)

            predictions.extend(predicted_texts)
            # Each prediction gets a list of references (here exactly one).
            references.extend([text] for text in target_texts)

    bleu_score = bleu_metric.compute(predictions=predictions, references=references)['bleu']
    chrf_score = chrf_metric.compute(predictions=predictions, references=references)['score']

    return bleu_score, chrf_score
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support