Baseline code using BERT and recording of code walkthrough

#3
by janbelke - opened

Hi everyone,

Here's the link to a recording (YouTube) of the baseline code walkthrough using BERT.

Below is the code for your reference:

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn import metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments


class ClassificationDataset:
    """Map-style dataset: tokenizes a movie synopsis and pairs it with its
    integer-encoded genre label, in the tensor format the HF Trainer expects.
    """

    def __init__(self, data, tokenizer):
        # `data` is any indexable collection of records exposing "synopsis"
        # and "genre" fields; `tokenizer` is a Hugging Face-style callable.
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        record = self.data[item]
        # Fixed-length encoding: pad/truncate every synopsis to 128 tokens.
        encoded = self.tokenizer(
            str(record["synopsis"]),
            max_length=128,
            padding="max_length",
            truncation=True,
        )
        label = int(record["genre"])
        return {
            "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long),
        }


def compute_metrics(eval_pred):
    """Accuracy metric callback for the Hugging Face `Trainer`.

    Parameters
    ----------
    eval_pred : tuple
        `(logits, labels)` as numpy arrays, where `logits` has shape
        `(n_samples, n_classes)` and `labels` has shape `(n_samples,)`.

    Returns
    -------
    dict
        `{"accuracy": float}` with a value in `[0, 1]`.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    # Plain numpy mean of exact matches — identical to
    # sklearn.metrics.accuracy_score, without the extra dependency.
    accuracy = float((predictions == np.asarray(labels)).mean())
    return {"accuracy": accuracy}


def train():
    """Fine-tune bert-base-uncased on the movie-genre-prediction dataset and
    write `submission.csv` with predicted genre names for the test split.

    Side effects: downloads the dataset and model, writes checkpoints to
    `./model`, and writes `submission.csv` in the working directory.
    """
    ds = load_dataset("datadrivenscience/movie-genre-prediction")
    # Encode the string "genre" column as a ClassLabel so we can stratify
    # the split and map predicted indices back to genre names later.
    ds = ds.class_encode_column("genre")

    ds_train = ds["train"]
    ds_test = ds["test"]

    # Hold out 20% of the training data as a stratified validation split.
    temp_ds = ds_train.train_test_split(test_size=0.2, stratify_by_column="genre")
    ds_train = temp_ds["train"]
    ds_val = temp_ds["test"]

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        # Use the public ClassLabel API instead of the private `_int2str`
        # attribute (same count, stable across datasets versions).
        num_labels=ds_train.features["genre"].num_classes,
    )

    train_dataset = ClassificationDataset(ds_train, tokenizer)
    valid_dataset = ClassificationDataset(ds_val, tokenizer)
    test_dataset = ClassificationDataset(ds_test, tokenizer)

    args = TrainingArguments(
        "model",  # output/checkpoint directory
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        # Reload the checkpoint with the best validation accuracy at the end.
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
        save_total_limit=1,
    )

    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    # Predict class indices for the test split.
    preds = trainer.predict(test_dataset).predictions
    preds = np.argmax(preds, axis=1)

    # Generate the submission file, mapping indices back to genre names.
    submission = pd.DataFrame({"id": ds_test["id"], "genre": preds})
    submission.loc[:, "genre"] = submission.genre.apply(lambda x: ds_train.features["genre"].int2str(x))
    submission.to_csv("submission.csv", index=False)


if __name__ == "__main__":
    train()
abhishek pinned discussion
Competitions org
This comment has been hidden

Sign up or log in to comment