import os
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np
import random
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import json
# Set the seed to guarantee reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Function to load the data (simplified for UTF-8)
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    return data

# Function to normalize text while keeping special characters
def normalize_text(text):
    if isinstance(text, str):
        return text.strip().upper()
    return text

# Function to clean and prepare the data
def clean_and_prepare_data(data):
    data = data.copy()
    # Drop rows with null values
    data = data.dropna(subset=['text', 'label'])
    # Normalize the labels
    data['label'] = data['label'].apply(normalize_text)
    # Define the expected labels
    emotion_labels = ['FELICIDAD', 'NEUTRAL', 'DEPRESIÓN', 'ANSIEDAD', 'ESTRÉS',
                      'EMERGENCIA', 'CONFUSIÓN', 'IRA', 'MIEDO', 'SORPRESA', 'DISGUSTO']
    # Keep only the known labels
    data = data[data['label'].isin(emotion_labels)]
    # Create the label mapping
    label_to_id = {label: idx for idx, label in enumerate(emotion_labels)}
    data['label'] = data['label'].map(label_to_id)
    # Make sure no NaN values remain
    if data['label'].isna().any():
        data = data.dropna(subset=['label'])
    data['label'] = data['label'].astype(int)
    return data, emotion_labels, label_to_id

# Function to split the data
def split_data(data):
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        data['text'], data['label'],
        test_size=0.2,
        stratify=data['label'],
        random_state=42
    )
    return train_texts, val_texts, train_labels, val_labels

# Function to compute the class weights
def get_class_weights(labels):
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=labels
    )
    return torch.tensor(class_weights, dtype=torch.float)

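# Note: with class_weight='balanced', sklearn assigns each class the weight
# n_samples / (n_classes * n_samples_in_class), so under-represented emotions
# contribute proportionally more to the weighted loss used below.
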
# Function to tokenize the data (no padding here; the data collator handles it)
def tokenize_data(tokenizer, texts, labels):
    dataset = Dataset.from_dict({'text': texts.tolist(), 'label': labels.tolist()})
    dataset = dataset.map(lambda batch: tokenizer(batch['text'], truncation=True, max_length=128), batched=True)
    return dataset

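# Note: Dataset.map with batched=True adds the tokenizer's output columns
# (input_ids, attention_mask and, for BERT, token_type_ids) alongside the
# original 'text' and 'label' columns.
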
# Custom loss function that incorporates the class weights
def custom_loss(labels, logits):
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
    return loss_fct(logits, labels)

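# Note: custom_loss reads the module-level `class_weights` tensor created in
# the __main__ block below; it must live on the same device as the logits,
# which is why it is moved to `device` there before training starts.
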
# CustomTrainer class that uses the custom loss function
# (Trainer is already imported at the top of the file)
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels").to(model.device)
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Compute the custom loss
        loss = custom_loss(labels, logits)
        return (loss, outputs) if return_outputs else loss

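# Note: the **kwargs in compute_loss absorbs extra arguments (such as
# num_items_in_batch) that recent transformers releases pass to this hook,
# keeping the override compatible across library versions.
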
# Function to compute evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = labels.astype(int)
    predictions = predictions.astype(int)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

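# Note: average='weighted' weights each class's score by its support, so the
# reported f1/precision/recall reflect the class imbalance of the validation
# split rather than treating all eleven emotions equally.
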
# Function to predict the label of a given text
def predict(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Run the prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()
    label = id_to_label.get(predicted_class, "Unknown label")
    return label

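# Note: predict() relies on the module-level `tokenizer`, `model`, `device`
# and `id_to_label` objects defined in the __main__ block below, so it is only
# callable after that setup has run.
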
if __name__ == '__main__':
    # Configure the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nUsing device: {device}")

    # Path to the CSV file
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, 'data', 'emotion_dataset.csv')

    # Step 1: Load and prepare the data
    data = load_data(input_file)
    data, emotion_labels, label_to_id = clean_and_prepare_data(data)
    id_to_label = {v: k for k, v in label_to_id.items()}

    # Step 2: Split the data
    train_texts, val_texts, train_labels, val_labels = split_data(data)

    # Step 3: Compute the class weights
    class_weights = get_class_weights(train_labels).to(device)

    # Step 4: Set up the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

    # Step 5: Tokenize the data
    train_dataset = tokenize_data(tokenizer, train_texts, train_labels)
    val_dataset = tokenize_data(tokenizer, val_texts, val_labels)

    # Step 6: Set up the data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
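    # The collator pads each batch to the length of its longest sequence, which
    # is why tokenize_data() above tokenizes without padding: dynamic per-batch
    # padding wastes fewer tokens than padding everything to max_length.
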
    # Step 7: Set up the model
    model = BertForSequenceClassification.from_pretrained(
        'dccuchile/bert-base-spanish-wwm-cased',
        num_labels=len(emotion_labels)
    )

    # Step 8: Configure the training
    training_args = TrainingArguments(
        output_dir='./models/bert_emotion_model',
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        lr_scheduler_type='linear',
        warmup_steps=500,
        eval_steps=500,
        save_steps=500,
        save_total_limit=1,
        evaluation_strategy="steps",
        save_strategy="steps",
        logging_dir='./logs',
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        report_to="none"
    )
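    # Note: load_best_model_at_end=True requires eval_steps and save_steps to
    # line up (both 500 here) so every saved checkpoint has an evaluation to
    # rank it by. On recent transformers releases the `evaluation_strategy`
    # argument is named `eval_strategy`; adjust to the installed version.
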
    # Step 9: Create the custom trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Step 10: Train the model
    trainer.train()

    # Step 11: Save the model and the tokenizer
    trainer.save_model('./models/bert_emotion_model')
    tokenizer.save_pretrained('./models/bert_emotion_model')

    # Step 12: Save the label mappings
    with open('./models/bert_emotion_model/label_to_id.json', 'w') as f:
        json.dump(label_to_id, f)
    with open('./models/bert_emotion_model/id_to_label.json', 'w') as f:
        json.dump(id_to_label, f)
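    # Note: json serializes the integer keys of id_to_label as strings, so
    # consumers should cast them back (e.g. {int(k): v for k, v in ...}) when
    # loading the mapping.
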
print("\nModelo entrenado y guardado exitosamente.")
# Paso 13: Probar el modelo con un ejemplo
sample_text = "Me siento muy feliz hoy"
print(f"Texto: {sample_text}")
print(f"Predicci贸n: {predict(sample_text)}") |