{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os; os.chdir('..')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keywordintent
0social media groupsinformational
1social media groupsnavigational
2internet forumsnavigational
3virtual communitiesnavigational
4online discussion boardscommercial
\n", "
" ], "text/plain": [ " keyword intent\n", "0 social media groups informational\n", "1 social media groups navigational\n", "2 internet forums navigational\n", "3 virtual communities navigational\n", "4 online discussion boards commercial" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "original_df= pd.read_csv(\"data/data_for_seo_new_intent.csv\")\n", "original_df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def map_intent(intent:str):\n", " return intent.lower()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keywordintent
0citalopram vs prozaccommercial
1who is the oldest football playerinformational
2t mobile town eastnavigational
3starbucksnavigational
4tech crunchnavigational
\n", "
" ], "text/plain": [ " keyword intent\n", "0 citalopram vs prozac commercial\n", "1 who is the oldest football player informational\n", "2 t mobile town east navigational\n", "3 starbucks navigational\n", "4 tech crunch navigational" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp_df= pd.read_csv(\"data_intent/intent_data.csv\")\n", "temp_df.intent= temp_df.intent.map(map_intent)\n", "temp_df= temp_df[temp_df.intent!=\"local\"]\n", "temp_df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# original_df= temp_df.copy()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "original_df= pd.concat([original_df, temp_df])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False 1304\n", "True 196\n", "Name: count, dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "original_df.duplicated().value_counts()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# original_df.drop_duplicates(inplace=True)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False 1304\n", "True 196\n", "Name: count, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "original_df.duplicated().value_counts()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "original_df= original_df[original_df.intent!='Local']" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['commercial', 'informational', 'navigational', 'transactional']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intents= original_df.intent.unique().tolist()\n", "intents" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "id2label= {}\n", "label2id= {}\n", "for i in range(len(intents)):\n", " id2label[i]= intents[i]\n", " label2id[intents[i]]= i" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: 'commercial', 1: 'informational', 2: 'navigational', 3: 'transactional'}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "id2label" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'commercial': 0, 'informational': 1, 'navigational': 2, 'transactional': 3}" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "label2id" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def make_label2id(label):\n", " return label2id[label]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keywordintentid
0citalopram vs prozaccommercial0
1who is the oldest football playerinformational1
2t mobile town eastnavigational2
3starbucksnavigational2
4tech crunchnavigational2
............
1703How to make homemade pet accessories from recy...informational1
1704Top 10 science fiction book series that take r...informational1
1705How to start a car restoration and customizati...informational1
1706Ancient Mesopotamian architecture and its infl...informational1
1707Benefits of a flexitarian diet for those seeki...informational1
\n", "

1500 rows × 3 columns

\n", "
" ], "text/plain": [ " keyword intent id\n", "0 citalopram vs prozac commercial 0\n", "1 who is the oldest football player informational 1\n", "2 t mobile town east navigational 2\n", "3 starbucks navigational 2\n", "4 tech crunch navigational 2\n", "... ... ... ..\n", "1703 How to make homemade pet accessories from recy... informational 1\n", "1704 Top 10 science fiction book series that take r... informational 1\n", "1705 How to start a car restoration and customizati... informational 1\n", "1706 Ancient Mesopotamian architecture and its infl... informational 1\n", "1707 Benefits of a flexitarian diet for those seeki... informational 1\n", "\n", "[1500 rows x 3 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "original_df['id']= original_df.intent.map(make_label2id)\n", "original_df" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keywordid
0citalopram vs prozac0
1who is the oldest football player1
2t mobile town east2
3starbucks2
4tech crunch2
.........
1703How to make homemade pet accessories from recy...1
1704Top 10 science fiction book series that take r...1
1705How to start a car restoration and customizati...1
1706Ancient Mesopotamian architecture and its infl...1
1707Benefits of a flexitarian diet for those seeki...1
\n", "

1500 rows × 2 columns

\n", "
" ], "text/plain": [ " keyword id\n", "0 citalopram vs prozac 0\n", "1 who is the oldest football player 1\n", "2 t mobile town east 2\n", "3 starbucks 2\n", "4 tech crunch 2\n", "... ... ..\n", "1703 How to make homemade pet accessories from recy... 1\n", "1704 Top 10 science fiction book series that take r... 1\n", "1705 How to start a car restoration and customizati... 1\n", "1706 Ancient Mesopotamian architecture and its infl... 1\n", "1707 Benefits of a flexitarian diet for those seeki... 1\n", "\n", "[1500 rows x 2 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# df= original_df[['metatitle', 'id']]\n", "df= original_df[['keyword', 'id']]\n", "df" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keywordid
0Buy baby stroller3
1Why do leaves change color in the fall?1
2How to improve your leadership skills1
3sneakers amazon3
4Shop for photography equipment3
.........
1495Why do stars twinkle?1
1496Buy eco-friendly beauty products0
1497Order makeup kit3
1498Lowe's2
1499Get photography equipment3
\n", "

1500 rows × 2 columns

\n", "
" ], "text/plain": [ " keyword id\n", "0 Buy baby stroller 3\n", "1 Why do leaves change color in the fall? 1\n", "2 How to improve your leadership skills 1\n", "3 sneakers amazon 3\n", "4 Shop for photography equipment 3\n", "... ... ..\n", "1495 Why do stars twinkle? 1\n", "1496 Buy eco-friendly beauty products 0\n", "1497 Order makeup kit 3\n", "1498 Lowe's 2\n", "1499 Get photography equipment 3\n", "\n", "[1500 rows x 2 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df= df.sample(frac=1).reset_index(drop=True)\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/FineTunedDistilledBertAIChecker/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from datasets import Dataset, load_dataset\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlabel
870Important space missions to the New Horizons s...1
947How to start a travel and adventure blog1
477How to improve your critical thinking skills1
174How to make homemade baby food1
1369Cheap sustainable clothing brands0
396Exploring the mysteries of the deep ocean1
206Discounted eco-friendly patio decor0
191Cheap eco-friendly office products0
533Affordable pet supplies0
1398Travel tips for Japan1
\n", "
" ], "text/plain": [ " text label\n", "870 Important space missions to the New Horizons s... 1\n", "947 How to start a travel and adventure blog 1\n", "477 How to improve your critical thinking skills 1\n", "174 How to make homemade baby food 1\n", "1369 Cheap sustainable clothing brands 0\n", "396 Exploring the mysteries of the deep ocean 1\n", "206 Discounted eco-friendly patio decor 0\n", "191 Cheap eco-friendly office products 0\n", "533 Affordable pet supplies 0\n", "1398 Travel tips for Japan 1" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.rename(columns={\n", " \"keyword\": \"text\", \n", " # \"metatitle\": \"text\", \n", " \"id\": \"label\"\n", "}, \n", " inplace=True\n", ")\n", "\n", "df.sample(10)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 1500\n", "})" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_df= Dataset.from_pandas(df)\n", "dataset_df" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 1125\n", " })\n", " test: Dataset({\n", " features: ['text', 'label'],\n", " num_rows: 375\n", " })\n", "})" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_data= dataset_df.train_test_split(test_size=0.25)\n", "new_data" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def preprocess_function(examples):\n", " return tokenizer(examples[\"text\"], truncation=True)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 1125/1125 [00:00<00:00, 31352.56 examples/s]\n", "Map: 100%|██████████| 375/375 [00:00<00:00, 29503.00 examples/s]\n" ] } ], "source": [ "tokenized_df = new_data.map(preprocess_function, batched=True)\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-11-04 12:46:03.199613: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", "2023-11-04 12:46:03.249373: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "2023-11-04 12:46:03.249409: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "2023-11-04 12:46:03.249439: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "2023-11-04 12:46:03.257947: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "2023-11-04 12:46:04.345188: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], "source": [ "# from transformers import DataCollatorWithPadding\n", "\n", "# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors=\"tf\")\n", "\n", "\n", "\n", "\n", "from transformers import DataCollatorWithPadding\n", "\n", "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "import evaluate\n", "\n", "accuracy = evaluate.load(\"accuracy\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " predictions = np.argmax(predictions, axis=1)\n", " return accuracy.compute(predictions=predictions, references=labels)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(\n", " # \"distilbert-base-uncased\", num_labels=5, id2label=id2label, label2id=label2id\n", " # \"distilbert-base-uncased\", num_labels=4, id2label=id2label, label2id=label2id # removed local\n", " \"bert-base-uncased\", num_labels=4, id2label=id2label, label2id=label2id # removed local\n", ")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [426/426 00:56, Epoch 6/6]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossAccuracy
1No log0.3501810.957333
2No log0.1070430.973333
3No log0.0879780.978667
4No log0.0852740.973333
5No log0.0869870.973333
6No log0.0931970.970667

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=426, training_loss=0.1806535676051753, metrics={'train_runtime': 57.2339, 'train_samples_per_second': 117.937, 'train_steps_per_second': 7.443, 'total_flos': 44042600979624.0, 'train_loss': 0.1806535676051753, 'epoch': 6.0})" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_args = TrainingArguments(\n", " output_dir=\"intent_classification_model_without_metatitle_with_local23\",\n", " learning_rate=2e-5,\n", " per_device_train_batch_size=16,\n", " per_device_eval_batch_size=16,\n", " num_train_epochs=6,\n", " weight_decay=0.01,\n", " evaluation_strategy=\"epoch\",\n", " save_strategy=\"epoch\",\n", " load_best_model_at_end=True,\n", " # push_to_hub=True,\n", ")\n", "\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=tokenized_df[\"train\"],\n", " eval_dataset=tokenized_df[\"test\"],\n", " tokenizer=tokenizer,\n", " data_collator=data_collator,\n", " compute_metrics=compute_metrics,\n", ")\n", "\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }