{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os; os.chdir('..')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from datasets import Dataset, load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Paragraph</th>\n",
       "      <th>AI_generated</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>68451</th>\n",
       "      <td>Anoop Bikram Shahi (also Anup Bikram Shahi) (N...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51106</th>\n",
       "      <td>The Olympus scandal was precipitated on 14 Oct...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102697</th>\n",
       "      <td>Leslie Herbert \"Speedy\" Duncan (August 10, 194...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97245</th>\n",
       "      <td>Elaine Rochelle Sisman (born January 20, 1952)...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>276647</th>\n",
       "      <td>Typhoon Pamela was a powerful typhoon that str...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24973</th>\n",
       "      <td>Bernard Hartmut Breslauer (1 July 1918 – 14 Au...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>277295</th>\n",
       "      <td>Rwamagana is a city and capital of both the Rw...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63059</th>\n",
       "      <td>Sir Lewis Cohen (23 December 1849 – 5 December...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19267</th>\n",
       "      <td>SN 2005gl was a supernova in the barred-spiral...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>239340</th>\n",
       "      <td>Sulgrave Manor, Sulgrave, Northamptonshire, En...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                Paragraph  AI_generated\n",
       "68451   Anoop Bikram Shahi (also Anup Bikram Shahi) (N...             0\n",
       "51106   The Olympus scandal was precipitated on 14 Oct...             0\n",
       "102697  Leslie Herbert \"Speedy\" Duncan (August 10, 194...             0\n",
       "97245   Elaine Rochelle Sisman (born January 20, 1952)...             1\n",
       "276647  Typhoon Pamela was a powerful typhoon that str...             0\n",
       "24973   Bernard Hartmut Breslauer (1 July 1918 – 14 Au...             0\n",
       "277295  Rwamagana is a city and capital of both the Rw...             0\n",
       "63059   Sir Lewis Cohen (23 December 1849 – 5 December...             1\n",
       "19267   SN 2005gl was a supernova in the barred-spiral...             0\n",
       "239340  Sulgrave Manor, Sulgrave, Northamptonshire, En...             0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df= pd.read_csv('data/AI_checker_remade.csv')\n",
    "\n",
    "\n",
    "df.sample(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>103386</th>\n",
       "      <td>Huang Yu-chun (born 16 November 1939 in Shanto...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13297</th>\n",
       "      <td>John Kane (7 April 1921 – 27 February 2002) wa...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65718</th>\n",
       "      <td>Vasiliev equations are formally consistent gau...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63724</th>\n",
       "      <td>Lauderdale Oval or Lauderdale Sports Ground \\n...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>86145</th>\n",
       "      <td>Earl Zebedee Hooker (January 15, 1930 – April ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152381</th>\n",
       "      <td>Agnes Strickland (18 July 1796 – 8 October 185...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>201492</th>\n",
       "      <td>Prinsep may mean any of several notable member...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>159645</th>\n",
       "      <td>Jana Herzen  is a singer-songwriter with a foc...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7977</th>\n",
       "      <td>Cincy Blues Fest is an annual blues music fest...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>261224</th>\n",
       "      <td>Bitti Mohanty, also called Bitihotra Mohanty i...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                     text  label\n",
       "103386  Huang Yu-chun (born 16 November 1939 in Shanto...      0\n",
       "13297   John Kane (7 April 1921 – 27 February 2002) wa...      1\n",
       "65718   Vasiliev equations are formally consistent gau...      0\n",
       "63724   Lauderdale Oval or Lauderdale Sports Ground \\n...      1\n",
       "86145   Earl Zebedee Hooker (January 15, 1930 – April ...      0\n",
       "152381  Agnes Strickland (18 July 1796 – 8 October 185...      1\n",
       "201492  Prinsep may mean any of several notable member...      0\n",
       "159645  Jana Herzen  is a singer-songwriter with a foc...      1\n",
       "7977    Cincy Blues Fest is an annual blues music fest...      0\n",
       "261224  Bitti Mohanty, also called Bitihotra Mohanty i...      0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rename(columns={\n",
    "    \"Paragraph\": \"text\", \n",
    "    \"AI_generated\": \"label\"\n",
    "}, \n",
    "          inplace=True\n",
    ")\n",
    "\n",
    "df.sample(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_dict= df.to_dict()\n",
    "# df_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 300000\n",
       "})"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Dataset.from_pandas(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 300000\n",
       "})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_df= Dataset.from_pandas(df)\n",
    "dataset_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 300000\n",
       "})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_dict.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset_df[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 240000\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 60000\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_data= dataset_df.train_test_split(test_size=0.2)\n",
    "new_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': 'The Hillbrow Tower (formerly JG Strijdom Tower) is a luxury residential skyscraper located in the Hillbrow district of Johannesburg, South Africa. The tower was completed in 2006 and has 48 floors. It is the tallest residential building in Johannesburg and South Africa.\\n\\nThe tower is named after JG Strijdom, the first Prime Minister of South Africa. The tower has been controversial since its inception, as it has been accused of being an eyesore and a symbol of inequality in Johannesburg.',\n",
       " 'label': 1}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_data['test'][10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_function(examples):\n",
    "    return tokenizer(examples[\"text\"], truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 240000/240000 [01:03<00:00, 3756.57 examples/s]\n",
      "Map: 100%|██████████| 60000/60000 [00:15<00:00, 3767.88 examples/s]\n"
     ]
    }
   ],
   "source": [
    "tokenized_df = new_data.map(preprocess_function, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-10-06 10:06:53.343215: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2023-10-06 10:06:55.238543: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
     ]
    }
   ],
   "source": [
    "# from transformers import DataCollatorWithPadding\n",
    "\n",
    "# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors=\"tf\")\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "from transformers import DataCollatorWithPadding\n",
    "\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate\n",
    "\n",
    "accuracy = evaluate.load(\"accuracy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    predictions = np.argmax(predictions, axis=1)\n",
    "    return accuracy.compute(predictions=predictions, references=labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "id2label = {0: \"NEGATIVE\", 1: \"POSITIVE\"}\n",
    "label2id = {\"NEGATIVE\": 0, \"POSITIVE\": 1}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from transformers import create_optimizer\n",
    "# import tensorflow as tf\n",
    "\n",
    "# batch_size = 16\n",
    "# num_epochs = 5\n",
    "# batches_per_epoch = len(tokenized_df[\"train\"]) // batch_size\n",
    "# total_train_steps = int(batches_per_epoch * num_epochs)\n",
    "# optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
    "\n",
    "# # model = AutoModelForSequenceClassification.from_pretrained(\n",
    "# #     \"distilbert-base-uncased\", num_labels=2, id2label=id2label, label2id=label2id\n",
    "# # )\n",
    "\n",
    "# from transformers import TFAutoModelForSequenceClassification\n",
    "\n",
    "# model = TFAutoModelForSequenceClassification.from_pretrained(\n",
    "#     \"distilbert-base-uncased\", num_labels=2, id2label=id2label, label2id=label2id\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\n",
    "    \"distilbert-base-uncased\", num_labels=2, id2label=id2label, label2id=label2id\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tf_train_set = model.prepare_tf_dataset(\n",
    "#     tokenized_df[\"train\"],\n",
    "#     shuffle=True,\n",
    "#     batch_size=16,\n",
    "#     collate_fn=data_collator,\n",
    "# )\n",
    "\n",
    "# tf_validation_set = model.prepare_tf_dataset(\n",
    "#     tokenized_df[\"test\"],\n",
    "#     shuffle=False,\n",
    "#     batch_size=16,\n",
    "#     collate_fn=data_collator,\n",
    "# )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import tensorflow as tf\n",
    "\n",
    "# model.compile(optimizer=optimizer)  # No loss argument!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='30000' max='30000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [30000/30000 1:25:18, Epoch 2/2]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Accuracy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.017800</td>\n",
       "      <td>0.074963</td>\n",
       "      <td>0.984383</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.001600</td>\n",
       "      <td>0.039218</td>\n",
       "      <td>0.993867</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=30000, training_loss=0.018804504087567328, metrics={'train_runtime': 5119.9395, 'train_samples_per_second': 93.751, 'train_steps_per_second': 5.859, 'total_flos': 4.559380077797894e+16, 'train_loss': 0.018804504087567328, 'epoch': 2.0})"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_args = TrainingArguments(\n",
    "    output_dir=\"my_awesome_model\",\n",
    "    learning_rate=2e-5,\n",
    "    per_device_train_batch_size=16,\n",
    "    per_device_eval_batch_size=16,\n",
    "    num_train_epochs=2,\n",
    "    weight_decay=0.01,\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    save_strategy=\"epoch\",\n",
    "    load_best_model_at_end=True,\n",
    "    # push_to_hub=True,\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_df[\"train\"],\n",
    "    eval_dataset=tokenized_df[\"test\"],\n",
    "    tokenizer=tokenizer,\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    ")\n",
    "\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from transformers.keras_callbacks import KerasMetricCallback\n",
    "\n",
    "# metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# callbacks = [metric_callback]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('models/tokenizer_v1/tokenizer_config.json',\n",
       " 'models/tokenizer_v1/special_tokens_map.json',\n",
       " 'models/tokenizer_v1/vocab.txt',\n",
       " 'models/tokenizer_v1/added_tokens.json',\n",
       " 'models/tokenizer_v1/tokenizer.json')"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.save_pretrained(\"models/tokenizer_v1\")\n",
    "model.save_pretrained(\"models/trained_model_v1\")\n",
    "\n",
    "tokenizer.save_pretrained(\"models/trained_model_v11\")\n",
    "model.save_pretrained(\"models/trained_model_v11\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}