{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\qqwwf\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']\n", "- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/5\n", "125/125 [==============================] - 118s 860ms/step - loss: 0.3332 - accuracy: 0.9632\n", "Epoch 2/5\n", "125/125 [==============================] - 108s 861ms/step - loss: 0.0221 - accuracy: 1.0000\n", "Epoch 3/5\n", "125/125 [==============================] - 109s 869ms/step - loss: 0.0105 - accuracy: 1.0000\n", "Epoch 4/5\n", "125/125 [==============================] - 109s 874ms/step - loss: 0.0073 - accuracy: 1.0000\n", "Epoch 5/5\n", "125/125 [==============================] - 107s 855ms/step - loss: 0.0061 - accuracy: 1.0000\n", "The text 'Private enterprise is crucial.' is predicted as 'Conservative' with a confidence rate of 1.00\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import tensorflow as tf\n", "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, create_optimizer\n", "\n", "# 1. Load the CSV file\n", "df = pd.read_csv(\"political_ideology_dataset.csv\")\n", "\n", "texts = df[\"text\"].tolist()\n", "labels = df[\"label\"].tolist()\n", "\n", "label2id = {\"Conservative\": 0, \"Liberal\": 1, \"Socialist\": 2, \"Libertarian\": 3}\n", "labels_numeric = [label2id[label] for label in labels]\n", "\n", "# 2. Tokenization & Preprocessing\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "inputs = tokenizer(texts, truncation=True, padding=True, return_tensors=\"tf\")\n", "input_ids = inputs[\"input_ids\"]\n", "attention_mask = inputs[\"attention_mask\"]\n", "\n", "# 3. Model Setup\n", "model = TFAutoModelForSequenceClassification.from_pretrained(\n", " \"distilbert-base-uncased\", num_labels=4\n", ")\n", "\n", "# Optimizer\n", "batch_size = 32\n", "num_epochs = 5\n", "batches_per_epoch = len(texts) // batch_size\n", "total_train_steps = batches_per_epoch * num_epochs\n", "optimizer, _ = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)\n", "\n", "model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[\"accuracy\"])\n", "\n", "# 4. Training\n", "model.fit([input_ids, attention_mask], np.array(labels_numeric), batch_size=batch_size, epochs=num_epochs)\n", "\n", "# Specify the save path\n", "save_path = \"C:/Users/qqwwf/Downloads/IA\"\n", "\n", "# Save the model\n", "model.save_pretrained(f\"{save_path}/political_ideology_model\")\n", "\n", "# Save the tokenizer\n", "tokenizer.save_pretrained(f\"{save_path}/political_ideology_tokenizer\")\n", "\n", "# 5. Prediction (example) with confidence rate\n", "sample_text = \"Private enterprise is crucial.\"\n", "sample_inputs = tokenizer(sample_text, truncation=True, padding=True, return_tensors=\"tf\")\n", "logits = model(sample_inputs[\"input_ids\"], attention_mask=sample_inputs[\"attention_mask\"]).logits\n", "\n", "# Convert logits to probabilities\n", "probabilities = tf.nn.softmax(logits, axis=-1)\n", "\n", "# Get the predicted class ID and its corresponding probability\n", "predicted_class_id = int(tf.math.argmax(probabilities, axis=-1)[0])\n", "predicted_probability = float(probabilities[0, predicted_class_id])\n", "\n", "# Convert the class ID to label\n", "id2label = {value: key for key, value in label2id.items()}\n", "predicted_label = id2label[predicted_class_id]\n", "\n", "# Print the result with confidence rate\n", "print(f\"The text '{sample_text}' is predicted as '{predicted_label}' with a confidence rate of {predicted_probability:.2f}\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 2 }