:228: RuntimeWarning: pyarrow._fs.FileSelector size changed, may indicate binary incompatibility. Expected 48 from C header, got 72 from PyObject\n",
"/tmp/ipykernel_209066/1338289112.py:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_eval.rename(columns ={\"output\":\"label\"}, inplace = True)\n"
]
}
],
"source": [
"import pandas as pd\n",
"import datasets\n",
"from datasets import Dataset, DatasetDict\n",
"#from sklearn import datasets\n",
"\n",
"# Store the selected columns in a NEW dataframe; .copy() breaks the link to\n",
"# the parent df and prevents the SettingWithCopyWarning seen in the output.\n",
"df_eval = df[[\"comments\", \"label\"]].copy()\n",
"# Rename output column to label (a no-op when the column is already 'label';\n",
"# kept because validation data uses output=label). Assigning the result\n",
"# instead of inplace=True keeps the cell idempotent on re-runs.\n",
"df_eval = df_eval.rename(columns={\"output\": \"label\"})\n",
"#convert dataframe into a dataset\n",
"df_eval = Dataset.from_pandas(df_eval)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "51af6cc3-dd96-41f9-8ee7-946b0ca0badc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['comments', 'label'],\n",
" num_rows: 5112\n",
"})"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_eval"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "9fa973b4-3308-4e24-b1bf-02a35710af03",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'comments': Value(dtype='string', id=None),\n",
" 'label': Value(dtype='int32', id=None)}"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_eval.features"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "6fd8cdce-ef84-4340-a04a-29711bf78719",
"metadata": {},
"outputs": [],
"source": [
"# Split into train and test from ONE shuffled copy so the two selections are\n",
"# disjoint. The original reshuffled with the same seed (identical order) and\n",
"# took range(4000) and range(1111), so every test row also appeared in train —\n",
"# the near-perfect eval metrics later in the notebook are explained by this leak.\n",
"df_shuffled = df_eval.shuffle(seed=42)\n",
"df_train = df_shuffled.select(range(4000))\n",
"df_test = df_shuffled.select(range(4000, 5111))"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "fe524cbf-d7d5-4f02-ac9b-bf58911caa88",
"metadata": {},
"outputs": [],
"source": [
"# Save train and test data into dataset dictionary\n",
"df_eval = DatasetDict({\n",
" 'train': df_train,\n",
" 'test': df_test\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "da268016-a709-4335-bb45-c55d24d145f3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['comments', 'label'],\n",
" num_rows: 4000\n",
" })\n",
" test: Dataset({\n",
" features: ['comments', 'label'],\n",
" num_rows: 1111\n",
" })\n",
"})"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# show the dictionary\n",
"df_eval"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "a04a8d20-380b-4a02-bcfa-e3061ee758e2",
"metadata": {},
"outputs": [],
"source": [
"df_train = df_eval[\"train\"]\n",
"df_test = df_eval[\"test\"]"
]
},
{
"cell_type": "markdown",
"id": "118ea056-b340-4a66-904d-88da11fec1e1",
"metadata": {
"tags": []
},
"source": [
"#### Evaluate Model"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "091a3101-499f-4d3f-8c71-397200aae22e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5112"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "11a6cdc0-e073-40ec-8a72-916ea6e497d7",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSequenceClassification\n",
"import numpy as np\n",
"from datasets import load_metric\n",
"from transformers import TrainingArguments\n",
"from transformers import Trainer\n",
"from transformers import AutoTokenizer\n",
"# tokenizer function\n",
"def tokenize_com(examples):\n",
" return tokenizer(examples[\"comments\"], padding=\"max_length\", truncation = True, max_length=512)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "9a131402-992f-41b1-9bb5-342f58109f34",
"metadata": {},
"outputs": [],
"source": [
"# Tokenize data\n",
"def tokenized_data(df):\n",
"    \"\"\"Tokenize a train/test DatasetDict and return the two tokenized splits.\n",
"\n",
"    Fixes vs. the original: the `df` parameter is now actually used (it\n",
"    previously closed over the global df_eval), and the stray line\n",
"    `ds = ds.train_test_split(...)` — which referenced an undefined local\n",
"    `ds` and raised on a fresh kernel — has been removed.\n",
"    \"\"\"\n",
"    # Map the tokenizer over every split (adds input_ids, token_type_ids,\n",
"    # attention_mask columns alongside comments/label).\n",
"    token_com = df.map(tokenize_com, batched=True)\n",
"\n",
"    # The DatasetDict is already split; select(range(4000))/select(range(1111))\n",
"    # were no-ops on splits of exactly those sizes, so only the shuffle remains.\n",
"    df_train_com = token_com[\"train\"].shuffle(seed=42)\n",
"    df_test_com = token_com[\"test\"].shuffle(seed=42)\n",
"\n",
"    return df_train_com, df_test_com"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "6d9db5ee-e7e4-4f25-b7d3-0ea0a823d6fe",
"metadata": {},
"outputs": [],
"source": [
"# Train model\n",
"def trainer(model_name, args, train_dataset, eval_dataset, compute_metrics):\n",
"    \"\"\"Construct a transformers Trainer from the given model, args and data.\n",
"\n",
"    Bug fix: the `args` parameter is now forwarded to Trainer; the original\n",
"    ignored it and silently used the global `training_args` instead, so\n",
"    callers could never supply different training arguments.\n",
"    \"\"\"\n",
"    return Trainer(\n",
"        model=model_name,\n",
"        args=args,\n",
"        train_dataset=train_dataset,\n",
"        eval_dataset=eval_dataset,\n",
"        compute_metrics=compute_metrics,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "320c6e5a-5f99-4c7c-b5ea-bbd4fa104c5e",
"metadata": {},
"outputs": [],
"source": [
"# Declare accuracy, precision, recall and F1 metrics\n",
"def compute_metrics(eval_pred):\n",
"    \"\"\"Compute accuracy/precision/recall/F1 for a (logits, labels) eval pair.\n",
"\n",
"    NOTE: datasets.load_metric is deprecated (see the FutureWarning in the\n",
"    training output); migrate to evaluate.load once the `evaluate` package\n",
"    is available in this environment.\n",
"    \"\"\"\n",
"    # Loading metrics inside the function re-fetches them on every evaluation\n",
"    # pass; kept for behavioural parity, but these could be loaded once at\n",
"    # module level for speed.\n",
"    accuracy_metric = load_metric(\"accuracy\")\n",
"    precision_metric = load_metric(\"precision\")\n",
"    recall_metric = load_metric(\"recall\")\n",
"    f1_metric = load_metric(\"f1\")\n",
"\n",
"    logits, labels = eval_pred\n",
"    predictions = np.argmax(logits, axis=-1)\n",
"    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)[\"accuracy\"]\n",
"    precision = precision_metric.compute(predictions=predictions, references=labels)[\"precision\"]\n",
"    recall = recall_metric.compute(predictions=predictions, references=labels)[\"recall\"]\n",
"    f1_score = f1_metric.compute(predictions=predictions, references=labels)[\"f1\"]\n",
"\n",
"    return {\"accuracy\": accuracy, \"precision\": precision, \"recall\": recall, \"f1\": f1_score}"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "48377523-0a95-4e3c-a900-c6b68a36df6c",
"metadata": {},
"outputs": [],
"source": [
"# Empty cache to avoid CUDA out of memory issue\n",
"#import torch\n",
"#torch.cuda.empty_cache()"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "0416ed33-9367-426b-8501-f506996a4f5f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['comments', 'label'],\n",
" num_rows: 4000\n",
" })\n",
" test: Dataset({\n",
" features: ['comments', 'label'],\n",
" num_rows: 1111\n",
" })\n",
"})"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_eval"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "c94098de-e497-4fa7-8c15-0c4d6f1908d9",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7c1a5a1c433b48b3b131b595882da773",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/4 [00:00, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "28651ddb3f6c4b48b90d23ea6b65872c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/2 [00:00, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import AutoModel, BertTokenizer\n",
"irony_name = \"csebuetnlp/banglabert\"\n",
"#tokenizer = AutoTokenizer.from_pretrained(irony_name, labels=None)\n",
"tokenizer = AutoTokenizer.from_pretrained(irony_name)\n",
"df_train, df_test = tokenized_data(df_eval)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "f778f670-6b8d-4904-a1fc-7bc833f4ca3f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['comments', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
" num_rows: 4000\n",
"})"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "0d9c317a-2aa3-4331-8db6-95db13a3410c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['comments', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
" num_rows: 1111\n",
"})"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "bfc7b3b0-9450-4051-b512-d9c329c0a889",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at csebuetnlp/banglabert were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']\n",
"- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"# Fresh 2-label classification head on top of BanglaBERT (the checkpoint's\n",
"# discriminator head is dropped, hence the 'newly initialized' warning).\n",
"model_irony = AutoModelForSequenceClassification.from_pretrained(irony_name, num_labels=2, ignore_mismatched_sizes=True).to(device)\n",
"# The second positional argument of TrainingArguments is overwrite_output_dir\n",
"# (a bool). The original passed {'reprocess_input_data': True} positionally —\n",
"# not a valid TrainingArguments option — which only 'worked' by being truthy.\n",
"training_args = TrainingArguments(\"test-trainer-banglaBERT\", overwrite_output_dir=True, evaluation_strategy=\"epoch\")"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "5ed25c33-0a32-4819-8c1d-46742517396d",
"metadata": {},
"outputs": [],
"source": [
"# Remove unnecessary column that are causing error\n",
"#df_train= df_train.remove_columns([\"comments\"])\n",
"#df_test= df_test.remove_columns([\"comments\"])"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "16072f2a-dd48-4c7e-b8b6-ad8459d70599",
"metadata": {},
"outputs": [],
"source": [
"# Remove unnecessary column that are causing error\n",
"#df_train= df_train.remove_columns([\"__index_level_0__\"])\n",
"#df_test= df_test.remove_columns([\"__index_level_0__\"])"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "24f73e44-d619-4340-bab4-fab26d516afe",
"metadata": {},
"outputs": [],
"source": [
"#df_train[\"label\"]"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "36a2758e-6dbb-473e-9a9e-0ed30e0d26c9",
"metadata": {},
"outputs": [],
"source": [
"# Train and evaluate title data on irony speech analysis model \n",
"trainer_irony = trainer(model_irony, training_args, df_train, df_test, compute_metrics)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "302f356b-23d1-4b8c-b1cd-181202864e11",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['comments', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
" num_rows: 4000\n",
"})"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "6fa87f4c-5c86-4d01-a5d2-0369766a063c",
"metadata": {},
"outputs": [],
"source": [
"#df_train[['input_ids','attention_mask','token_type_ids','label']]"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "fcd50a0e-e2de-4e92-958c-66c0f7d6bbc1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: comments. If comments are not expected by `ElectraForSequenceClassification.forward`, you can safely ignore this message.\n",
"/home/raquiba/anaconda3/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n",
"***** Running training *****\n",
" Num examples = 4000\n",
" Num Epochs = 3\n",
" Instantaneous batch size per device = 8\n",
" Total train batch size (w. parallel, distributed & accumulation) = 8\n",
" Gradient Accumulation steps = 1\n",
" Total optimization steps = 1500\n",
" Number of trainable parameters = 110618882\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
"
\n",
" [1500/1500 05:53, Epoch 3/3]\n",
"
\n",
" \n",
" \n",
" \n",
" Epoch | \n",
" Training Loss | \n",
" Validation Loss | \n",
" Accuracy | \n",
" Precision | \n",
" Recall | \n",
" F1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 0.238700 | \n",
" 0.160494 | \n",
" 0.954995 | \n",
" 0.997268 | \n",
" 0.881643 | \n",
" 0.935897 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.087200 | \n",
" 0.020623 | \n",
" 0.993699 | \n",
" 0.997555 | \n",
" 0.985507 | \n",
" 0.991495 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.022600 | \n",
" 0.004940 | \n",
" 0.999100 | \n",
" 0.997590 | \n",
" 1.000000 | \n",
" 0.998794 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Saving model checkpoint to test-trainer-banglaBERT/checkpoint-500\n",
"Configuration saved in test-trainer-banglaBERT/checkpoint-500/config.json\n",
"Model weights saved in test-trainer-banglaBERT/checkpoint-500/pytorch_model.bin\n",
"The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: comments. If comments are not expected by `ElectraForSequenceClassification.forward`, you can safely ignore this message.\n",
"***** Running Evaluation *****\n",
" Num examples = 1111\n",
" Batch size = 8\n",
"/tmp/ipykernel_209066/3231114443.py:3: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
" accuracy_metric = load_metric(\"accuracy\")\n",
"Saving model checkpoint to test-trainer-banglaBERT/checkpoint-1000\n",
"Configuration saved in test-trainer-banglaBERT/checkpoint-1000/config.json\n",
"Model weights saved in test-trainer-banglaBERT/checkpoint-1000/pytorch_model.bin\n",
"The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: comments. If comments are not expected by `ElectraForSequenceClassification.forward`, you can safely ignore this message.\n",
"***** Running Evaluation *****\n",
" Num examples = 1111\n",
" Batch size = 8\n",
"Saving model checkpoint to test-trainer-banglaBERT/checkpoint-1500\n",
"Configuration saved in test-trainer-banglaBERT/checkpoint-1500/config.json\n",
"Model weights saved in test-trainer-banglaBERT/checkpoint-1500/pytorch_model.bin\n",
"The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: comments. If comments are not expected by `ElectraForSequenceClassification.forward`, you can safely ignore this message.\n",
"***** Running Evaluation *****\n",
" Num examples = 1111\n",
" Batch size = 8\n",
"\n",
"\n",
"Training completed. Do not forget to share your model on huggingface.co/models =)\n",
"\n",
"\n"
]
},
{
"data": {
"text/plain": [
"TrainOutput(global_step=1500, training_loss=0.11616522216796875, metrics={'train_runtime': 354.0437, 'train_samples_per_second': 33.894, 'train_steps_per_second': 4.237, 'total_flos': 3157332664320000.0, 'train_loss': 0.11616522216796875, 'epoch': 3.0})"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainer_irony.train()"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "a7f46603-8f2f-412e-819f-2bae9250dc4a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: comments. If comments are not expected by `ElectraForSequenceClassification.forward`, you can safely ignore this message.\n",
"***** Running Evaluation *****\n",
" Num examples = 1111\n",
" Batch size = 8\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
"
\n",
" [139/139 00:09]\n",
"
\n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'eval_loss': 0.004940225277096033,\n",
" 'eval_accuracy': 0.9990999099909991,\n",
" 'eval_precision': 0.9975903614457832,\n",
" 'eval_recall': 1.0,\n",
" 'eval_f1': 0.9987937273823884,\n",
" 'eval_runtime': 19.135,\n",
" 'eval_samples_per_second': 58.061,\n",
" 'eval_steps_per_second': 7.264,\n",
" 'epoch': 3.0}"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainer_irony.evaluate()"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "d988e6d4-21c4-4423-9e48-8433d5e40e9e",
"metadata": {},
"outputs": [],
"source": [
"#df_train[\"label\"] = df_train[\"label\"].apply(lambda x: str(x))\n",
"#df_train[\"label\"] = df_train[\"label\"].replace(to_replace='None', value=np.nan).dropna()\n",
"#df_train = list(filter(lambda x: df_train[x] is not None, df_train))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}