{ "cells": [ { "cell_type": "code", "execution_count": 5, "id": "c6866894", "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 16, "id": "7b293125", "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1ddac164d1df40438dddfddf1730f471", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/2312 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments\n", "from datasets import Dataset, DatasetDict\n", "import torch\n", "\n", "# Load dataset\n", "data = pd.read_csv('C:/Users/Administrator/Downloads/ds_2300_Sheet1.csv')\n", "\n", "# Remove 'id' column\n", "data = data.drop(columns=['id'])\n", "\n", "# Adding a dummy label column (ensure it's an integer type)\n", "data['label'] = 0\n", "\n", "# Convert label column to integer type\n", "data['label'] = data['label'].astype(float)\n", "\n", "# Convert to Hugging Face dataset\n", "dataset = Dataset.from_pandas(data)\n", "\n", "# Loading pre-trained uncased multilingual BERT model and tokenizer\n", "model_name = 'bert-base-multilingual-uncased'\n", "tokenizer = BertTokenizer.from_pretrained(model_name)\n", "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1) # Adjust num_labels if needed\n", "\n", "# Tokenization function\n", "def tokenize_function(examples):\n", " return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512) # Adjust max_length if needed\n", "\n", "# Tokenize the dataset\n", "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n", "\n", "# Split the dataset\n", "split_datasets = tokenized_datasets.train_test_split(test_size=0.1)\n", "train_dataset = split_datasets['train']\n", "eval_dataset = split_datasets['test']\n", "\n", "# Convert train and eval datasets to PyTorch tensors and ensure labels are Long tensors\n", "def format_dataset(dataset):\n", " return dataset.with_format('torch', columns=['input_ids', 'attention_mask', 'label'])\n", "\n", "train_dataset = format_dataset(train_dataset)\n", "eval_dataset = format_dataset(eval_dataset)\n", "\n", "# Define training arguments\n", "training_args = TrainingArguments(\n", " output_dir='./results',\n", " evaluation_strategy='epoch',\n", " learning_rate=2e-5,\n", " per_device_train_batch_size=8,\n", " per_device_eval_batch_size=8,\n", " num_train_epochs=5,\n", " weight_decay=0.01,\n", ")\n", "\n", "# Define Trainer\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " eval_dataset=eval_dataset,\n", " tokenizer=tokenizer,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "2d533c43", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", "
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d533c43",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "      <tr><th>Epoch</th><th>Training Loss</th><th>Validation Loss</th></tr>\n",
       "    </table>"
],
"text/plain": [
"