import pandas as pd
from datasets import Dataset
from transformers import (
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# Load the train/test splits; each CSV is expected to provide "text" and
# "label" columns, with "label" holding integer class ids.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
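
# Optional (a sketch, not part of the original pipeline): if the CSVs store
# labels as strings rather than integer ids, datasets can encode them first:
#
#     train_dataset = train_dataset.class_encode_column("label")
#     test_dataset = test_dataset.class_encode_column("label")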

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    # Truncate to 128 tokens; padding is deferred to the data collator below,
    # which pads each training batch dynamically to its longest sequence.
    return tokenizer(batch["text"], truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# One output class per distinct label in the training set.
num_labels = len(train_df["label"].unique())

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)
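
# Optional (a sketch): attach human-readable label names so the saved model
# reports them instead of "LABEL_0", "LABEL_1", ...; the names below are
# placeholders, substitute your dataset's own:
#
#     model.config.id2label = {i: f"class_{i}" for i in range(num_labels)}
#     model.config.label2id = {label: i for i, label in model.config.id2label.items()}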

training_args = TrainingArguments(
    output_dir="models/distilbert",
    eval_strategy="epoch",  # "evaluation_strategy" on transformers < 4.41
    save_strategy="epoch",
    logging_dir="logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)
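
# Optional (a sketch): report accuracy at each evaluation by defining a
# metrics function and passing compute_metrics=compute_metrics to the Trainer.
import numpy as np

def compute_metrics(eval_pred):
    # The Trainer hands over (logits, labels) for the whole eval set.
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == labels).mean())}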

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Pads each batch on the fly to its longest sequence (see tokenize above).
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

trainer.train()
trainer.save_model("models/distilbert")
# Save the tokenizer alongside the model so the directory can be reloaded
# as a self-contained checkpoint.
tokenizer.save_pretrained("models/distilbert")

print("✅ Model trained and saved at models/distilbert")
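
# Quick smoke test of the saved checkpoint (a sketch; the sample sentence is
# made up, and the predicted label names depend on the dataset):
from transformers import pipeline

classifier = pipeline("text-classification", model="models/distilbert")
print(classifier("An example sentence to classify."))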