import os
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np
import random
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import json

# Set the random seed to ensure reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)
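
# Optional: for stricter GPU determinism you can also set
# torch.backends.cudnn.deterministic = True and torch.backends.cudnn.benchmark = False,
# at some performance cost.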

# Load the dataset (assumes a UTF-8 encoded CSV)
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    return data

# Normalize text while preserving special characters (accents, ñ)
def normalize_text(text):
    if isinstance(text, str):
        return text.strip().upper()
    return text

# Clean and prepare the data
def clean_and_prepare_data(data):
    data = data.copy()
    # Drop rows with missing text or label
    data = data.dropna(subset=['text', 'label'])
    # Normalize the labels
    data['label'] = data['label'].apply(normalize_text)
    # Define the expected labels (kept in Spanish to match the dataset)
    emotion_labels = ['FELICIDAD', 'NEUTRAL', 'DEPRESIÓN', 'ANSIEDAD', 'ESTRÉS',
                      'EMERGENCIA', 'CONFUSIÓN', 'IRA', 'MIEDO', 'SORPRESA', 'DISGUSTO']
    # Keep only rows with a known label
    data = data[data['label'].isin(emotion_labels)]
    # Build the label-to-id mapping
    label_to_id = {label: idx for idx, label in enumerate(emotion_labels)}
    data['label'] = data['label'].map(label_to_id)
    # Drop any labels that failed to map
    if data['label'].isna().any():
        data = data.dropna(subset=['label'])
    data['label'] = data['label'].astype(int)
    return data, emotion_labels, label_to_id

# Split the data into stratified train/validation sets
def split_data(data):
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        data['text'], data['label'],
        test_size=0.2,
        stratify=data['label'],
        random_state=42
    )
    return train_texts, val_texts, train_labels, val_labels

# Compute balanced class weights to counter label imbalance
def get_class_weights(labels):
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=labels
    )
    return torch.tensor(class_weights, dtype=torch.float)
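
# Note: sklearn's 'balanced' weights follow n_samples / (n_classes * bincount(y)),
# so under-represented emotions receive proportionally larger weights.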

# Tokenize the data (no padding here; the data collator pads each batch dynamically)
def tokenize_data(tokenizer, texts, labels):
    dataset = Dataset.from_dict({'text': texts.tolist(), 'label': labels.tolist()})
    dataset = dataset.map(lambda batch: tokenizer(batch['text'], truncation=True, max_length=128), batched=True)
    return dataset

# Custom loss function that incorporates the class weights
def custom_loss(labels, logits):
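    # Note: relies on the module-level `class_weights` tensor created in the __main__ block below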
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
    return loss_fct(logits, labels)

# CustomTrainer subclass that applies the weighted loss (Trainer is already imported above)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra arguments (e.g. num_items_in_batch) that newer
        # transformers versions pass to compute_loss
        labels = inputs.get("labels").to(model.device)
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Compute the class-weighted loss
        loss = custom_loss(labels, logits)
        return (loss, outputs) if return_outputs else loss

# Compute evaluation metrics (weighted averages account for class imbalance)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = labels.astype(int)
    predictions = predictions.astype(int)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Predict the label for a given text
def predict(text):
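    # Note: relies on the module-level tokenizer, model, device and id_to_label set in __main__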
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Run the prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=-1).item()
        label = id_to_label.get(predicted_class, "Unknown label")
    return label

if __name__ == '__main__':
    # Select the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nUsing device: {device}")

    # Path to the CSV file
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, 'data', 'emotion_dataset.csv')

    # Step 1: Load and prepare the data
    data = load_data(input_file)
    data, emotion_labels, label_to_id = clean_and_prepare_data(data)
    id_to_label = {v: k for k, v in label_to_id.items()}

    # Step 2: Split the data
    train_texts, val_texts, train_labels, val_labels = split_data(data)

    # Step 3: Compute the class weights
    class_weights = get_class_weights(train_labels).to(device)

    # Step 4: Set up the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

    # Step 5: Tokenize the data
    train_dataset = tokenize_data(tokenizer, train_texts, train_labels)
    val_dataset = tokenize_data(tokenizer, val_texts, val_labels)

    # Step 6: Set up the data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Step 7: Set up the model
    model = BertForSequenceClassification.from_pretrained(
        'dccuchile/bert-base-spanish-wwm-cased',
        num_labels=len(emotion_labels)
    )
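
    # The classification head is newly initialized, so transformers will warn that
    # some weights were not loaded from the checkpoint; that is expected here.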

    # Step 8: Configure training
    training_args = TrainingArguments(
        output_dir='./models/bert_emotion_model',
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        lr_scheduler_type='linear',
        warmup_steps=500,
        eval_steps=500,
        save_steps=500,
        save_total_limit=1,
        evaluation_strategy="steps",
        save_strategy="steps",
        logging_dir='./logs',
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        report_to="none"
    )
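
    # Note: on recent transformers releases the `evaluation_strategy` argument has been
    # renamed to `eval_strategy`; adjust if your installed version rejects it.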

    # Step 9: Create the custom trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Step 10: Train the model
    trainer.train()

    # Step 11: Save the model and the tokenizer
    trainer.save_model('./models/bert_emotion_model')
    tokenizer.save_pretrained('./models/bert_emotion_model')

    # Step 12: Save the label mappings
    with open('./models/bert_emotion_model/label_to_id.json', 'w') as f:
        json.dump(label_to_id, f)
    with open('./models/bert_emotion_model/id_to_label.json', 'w') as f:
        json.dump(id_to_label, f)
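
    # Note: json serializes the integer keys of id_to_label as strings; cast them
    # back to int when reloading the mapping.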

    print("\nModelo entrenado y guardado exitosamente.")

    # Paso 13: Probar el modelo con un ejemplo
    sample_text = "Me siento muy feliz hoy"
    print(f"Texto: {sample_text}")
    print(f"Predicci贸n: {predict(sample_text)}")