In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, create_optimizer

# 1. Load the CSV file
# The rest of the script reads two columns: "text" (the statement to classify)
# and "label" (one of the ideology names listed in label2id below).
df = pd.read_csv("political_ideology_dataset.csv")

# Rows missing either field can be neither tokenized nor encoded, so drop them
# here instead of failing later inside the tokenizer or the label lookup.
df = df.dropna(subset=["text", "label"])
if df.empty:
    raise ValueError("political_ideology_dataset.csv contains no usable rows")

label2id = {"Conservative": 0, "Liberal": 1, "Socialist": 2, "Libertarian": 3}

# Fail early and name the offending values, rather than raising a bare
# KeyError on the first label that is not part of label2id.
unknown_labels = df.loc[~df["label"].isin(list(label2id)), "label"].unique().tolist()
if unknown_labels:
    raise ValueError(
        f"Unexpected labels in dataset: {unknown_labels}; "
        f"expected one of {list(label2id)}"
    )

# Cast to str so values pandas parsed as numbers still reach the tokenizer as text.
texts = df["text"].astype(str).tolist()
labels = df["label"].tolist()

# Integer class ids, in the same row order as `texts`.
labels_numeric = [label2id[label] for label in labels]

# 2. Tokenization & Preprocessing
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Encode every text in a single call, with padding and truncation enabled,
# and get the result back as TensorFlow tensors.
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="tf",
)

# The two tensors that are fed to the model during training.
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]

# 3. Model Setup
# The number of classes is taken from label2id so the head always matches the
# label mapping. The id2label/label2id mappings are stored in the model config
# so the checkpoint written by save_pretrained() carries the ideology names
# along with the weights.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
    id2label={index: name for name, index in label2id.items()},
    label2id=label2id,
)

# Optimizer
batch_size = 32
num_epochs = 5
# Keras fit() also runs the final, partially filled batch of each epoch, so
# round up. Flooring made the decay schedule shorter than the real number of
# optimizer steps (and zero steps long for datasets smaller than batch_size).
batches_per_epoch = (len(texts) + batch_size - 1) // batch_size
total_train_steps = batches_per_epoch * num_epochs
# create_optimizer returns (optimizer, lr_schedule); only the optimizer is used.
optimizer, _ = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

# The model is called later for its raw `.logits`, hence from_logits=True;
# labels are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])

# 4. Training
# Features are passed as [input_ids, attention_mask]; targets are the integer
# class ids converted to a NumPy array.
train_features = [input_ids, attention_mask]
train_targets = np.array(labels_numeric)
model.fit(
    x=train_features,
    y=train_targets,
    batch_size=batch_size,
    epochs=num_epochs,
)

# Output location: the model and the tokenizer are written to sibling
# folders underneath save_path.
save_path = "C:/Users/qqwwf/Downloads/IA"
model_dir = f"{save_path}/political_ideology_model"
tokenizer_dir = f"{save_path}/political_ideology_tokenizer"

# Persist the fine-tuned model, then the tokenizer it was trained with.
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

# 5. Prediction (example) with confidence rate
sample_text = "Private enterprise is crucial."

# Tokenize the single example the same way as the training data.
sample_inputs = tokenizer(sample_text, padding=True, truncation=True, return_tensors="tf")
outputs = model(
    sample_inputs["input_ids"],
    attention_mask=sample_inputs["attention_mask"],
)
logits = outputs.logits

# Softmax over the class axis turns the raw scores into probabilities.
probabilities = tf.nn.softmax(logits, axis=-1)

# Index of the most probable class for the (only) example, and its probability.
best_class = tf.math.argmax(probabilities, axis=-1)
predicted_class_id = int(best_class[0])
predicted_probability = float(probabilities[0, predicted_class_id])

# Invert label2id to map the class id back to its ideology name.
id2label = dict(zip(label2id.values(), label2id.keys()))
predicted_label = id2label[predicted_class_id]

# Report the prediction together with its confidence rate.
message = (
    f"The text '{sample_text}' is predicted as '{predicted_label}' "
    f"with a confidence rate of {predicted_probability:.2f}"
)
print(message)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', '

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
The text 'Private enterprise is crucial.' is predicted as 'Conservative' with a confidence rate of 1.00
