{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\qqwwf\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']\n",
      "- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "125/125 [==============================] - 118s 860ms/step - loss: 0.3332 - accuracy: 0.9632\n",
      "Epoch 2/5\n",
      "125/125 [==============================] - 108s 861ms/step - loss: 0.0221 - accuracy: 1.0000\n",
      "Epoch 3/5\n",
      "125/125 [==============================] - 109s 869ms/step - loss: 0.0105 - accuracy: 1.0000\n",
      "Epoch 4/5\n",
      "125/125 [==============================] - 109s 874ms/step - loss: 0.0073 - accuracy: 1.0000\n",
      "Epoch 5/5\n",
      "125/125 [==============================] - 107s 855ms/step - loss: 0.0061 - accuracy: 1.0000\n",
      "The text 'Private enterprise is crucial.' is predicted as 'Conservative' with a confidence rate of 1.00\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, create_optimizer\n",
    "\n",
    "# 1. Load the CSV file\n",
    "df = pd.read_csv(\"political_ideology_dataset.csv\")\n",
    "\n",
    "texts = df[\"text\"].tolist()\n",
    "labels = df[\"label\"].tolist()\n",
    "\n",
    "label2id = {\"Conservative\": 0, \"Liberal\": 1, \"Socialist\": 2, \"Libertarian\": 3}\n",
    "labels_numeric = [label2id[label] for label in labels]\n",
    "\n",
    "# 2. Tokenization & Preprocessing\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
    "inputs = tokenizer(texts, truncation=True, padding=True, return_tensors=\"tf\")\n",
    "input_ids = inputs[\"input_ids\"]\n",
    "attention_mask = inputs[\"attention_mask\"]\n",
    "\n",
    "# 3. Model Setup\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(\n",
    "    \"distilbert-base-uncased\", num_labels=4\n",
    ")\n",
    "\n",
    "# Optimizer\n",
    "batch_size = 32\n",
    "num_epochs = 5\n",
    "batches_per_epoch = len(texts) // batch_size\n",
    "total_train_steps = batches_per_epoch * num_epochs\n",
    "optimizer, _ = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)\n",
    "\n",
    "model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[\"accuracy\"])\n",
    "\n",
    "# 4. Training\n",
    "model.fit([input_ids, attention_mask], np.array(labels_numeric), batch_size=batch_size, epochs=num_epochs)\n",
    "\n",
    "# Specify the save path\n",
    "save_path = \"C:/Users/qqwwf/Downloads/IA\"\n",
    "\n",
    "# Save the model\n",
    "model.save_pretrained(f\"{save_path}/political_ideology_model\")\n",
    "\n",
    "# Save the tokenizer\n",
    "tokenizer.save_pretrained(f\"{save_path}/political_ideology_tokenizer\")\n",
    "\n",
    "# 5. Prediction (example) with confidence rate\n",
    "sample_text = \"Private enterprise is crucial.\"\n",
    "sample_inputs = tokenizer(sample_text, truncation=True, padding=True, return_tensors=\"tf\")\n",
    "logits = model(sample_inputs[\"input_ids\"], attention_mask=sample_inputs[\"attention_mask\"]).logits\n",
    "\n",
    "# Convert logits to probabilities\n",
    "probabilities = tf.nn.softmax(logits, axis=-1)\n",
    "\n",
    "# Get the predicted class ID and its corresponding probability\n",
    "predicted_class_id = int(tf.math.argmax(probabilities, axis=-1)[0])\n",
    "predicted_probability = float(probabilities[0, predicted_class_id])\n",
    "\n",
    "# Convert the class ID to label\n",
    "id2label = {value: key for key, value in label2id.items()}\n",
    "predicted_label = id2label[predicted_class_id]\n",
    "\n",
    "# Print the result with confidence rate\n",
    "print(f\"The text '{sample_text}' is predicted as '{predicted_label}' with a confidence rate of {predicted_probability:.2f}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}