import pandas as pd
import os
import torch
import numpy as np  # NEW: Needed for math operations on arrays
from torch import nn  # NEW: Needed for the custom loss function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight  # NEW: Calculates the penalty weights
from sklearn.metrics import accuracy_score, f1_score  # NEW: The strict F1 grading system
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
def train_mindguard_model():
    print("🚀 Initializing MindGuard Training Pipeline...")

    # --- BULLETPROOF PATHING ---
    # 1. Find exactly where this train.py script lives
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # 2. Go up two folders (from src/core_model) to find the project root
    project_root = os.path.abspath(os.path.join(script_dir, "../../"))
    # 3. Define the exact absolute paths
    data_path = os.path.join(project_root, "data", "processed", "master_training_data.csv")
    artifacts_dir = os.path.join(project_root, "artifacts", "xlmr_weights")

    # Ensure the artifacts directory exists
    os.makedirs(artifacts_dir, exist_ok=True)
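    # Illustration (hypothetical layout): if this script lives at
    # /home/user/mindguard/src/core_model/train.py, the lines above resolve to
    #   project_root  -> /home/user/mindguard
    #   data_path     -> /home/user/mindguard/data/processed/master_training_data.csv
    #   artifacts_dir -> /home/user/mindguard/artifacts/xlmr_weights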
| print(f"Loading data from {data_path}...") | |
| df = pd.read_csv(data_path, on_bad_lines='skip') | |
| # --- THE FIX: Pandas parsing safety --- | |
| df = df.dropna(subset=['text', 'label']) | |
| df['text'] = df['text'].astype(str) | |
| df['label'] = df['label'].astype(str) | |
    # --- THE FIX: Data Sanitizer ---
    # 1. Drop any rows where the label is just a number (e.g., "0" or "1")
    df = df[~df['label'].str.isnumeric()]
    # 2. Drop the corrupted 'admi' label
    df = df[df['label'] != 'admi']

    # ⚠️ UNCOMMENT THE LINE BELOW if you don't have a strong GPU and want a fast 2-minute test run!
    # df = df.sample(500, random_state=42)
    # 2. Convert text labels (e.g., 'Anxiety') to numbers (e.g., 0, 1, 2)
    label_encoder = LabelEncoder()
    df['label_encoded'] = label_encoder.fit_transform(df['label'])
    num_labels = len(label_encoder.classes_)

    # Build the id <-> label mappings (wired into the model config below,
    # so they get saved alongside the weights instead of only being printed)
    id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
    label2id = {label: i for i, label in enumerate(label_encoder.classes_)}
    print(f"Detected {num_labels} unique emotions: {id2label}")
    # 3. Split the data into Training (80%) and Testing (20%)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    # --- NEW: CALCULATE PENALTY WEIGHTS FOR IMBALANCED DATA ---
    print("⚖️ Calculating Class Weights for Imbalanced Data...")
    unique_classes = np.unique(train_df['label_encoded'])
    weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=train_df['label_encoded'])

    # Automatically detect if a GPU is available locally, otherwise use the CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    class_weights_tensor = torch.tensor(weights, dtype=torch.float).to(device)
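    # How 'balanced' weights work: each class c gets n_samples / (n_classes * count_c).
    # Illustrative (hypothetical) counts for a 1,000-row training set with 3 classes:
    #   'Normal'   x 800 -> 1000 / (3 * 800) ≈ 0.42  (common class, small penalty)
    #   'Anxiety'  x 150 -> 1000 / (3 * 150) ≈ 2.22
    #   'Suicidal' x  50 -> 1000 / (3 * 50)  ≈ 6.67  (rare class, big penalty)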
    # Convert the DataFrames into Hugging Face Datasets
    # (preserve_index=False stops pandas from smuggling in an '__index_level_0__' column)
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
    # 4. Load the XLM-RoBERTa Tokenizer
    print("Loading XLM-RoBERTa Tokenizer...")
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    # Function to convert text into numbers
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
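    # Illustration: for each text the tokenizer returns 'input_ids' (subword ids,
    # padded/truncated to exactly 128 entries) plus a matching 'attention_mask'
    # (1 for real tokens, 0 for padding). Roughly:
    #   tokenize_function({'text': ['I feel anxious']})
    #   -> {'input_ids': [[0, ..., 2, 1, 1, ...]], 'attention_mask': [[1, 1, ..., 0, 0]]}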
    # Apply tokenization to both datasets
    print("Tokenizing the datasets (converting words to numbers)...")
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_val = val_dataset.map(tokenize_function, batched=True)

    # Rename the label column so the Trainer understands it
    tokenized_train = tokenized_train.rename_column("label_encoded", "labels")
    tokenized_val = tokenized_val.rename_column("label_encoded", "labels")

    # --- THE FIX: Strip out the raw text columns so PyTorch only sees numbers ---
    tokenized_train = tokenized_train.remove_columns(["text", "label"])
    tokenized_val = tokenized_val.remove_columns(["text", "label"])

    # Formally convert them to PyTorch tensors
    tokenized_train.set_format("torch")
    tokenized_val.set_format("torch")
    # 5. Load the Deep Learning Model
    print("Loading XLM-RoBERTa Neural Network...")
    model = XLMRobertaForSequenceClassification.from_pretrained(
        'xlm-roberta-base',
        num_labels=num_labels,
        id2label=id2label,  # Baked into config.json so predictions map back to emotion names
        label2id=label2id,
    )
    # --- NEW: STRICT SCORING METRICS ---
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # F1 Macro forces the AI to prove it learned the rare emotions, not just 'Normal'
        f1 = f1_score(labels, preds, average='macro')
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc, 'f1_macro': f1}
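    # Why macro-F1 instead of plain accuracy: with 4 samples whose true labels
    # are [Normal, Normal, Normal, Suicidal], a model that always predicts
    # 'Normal' scores 0.75 accuracy, but its macro-F1 is only about 0.43
    # (F1 ≈ 0.86 for 'Normal', 0.0 for 'Suicidal', averaged with equal weight),
    # so the lazy always-'Normal' strategy no longer looks good.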
    # 6. Set up the Training Rules
    training_args = TrainingArguments(
        output_dir=artifacts_dir,            # Uses the absolute path
        eval_strategy="epoch",               # Test the model at the end of every round
        learning_rate=3e-5,                  # UPDATED: Slightly higher to help learn rare classes
        per_device_train_batch_size=16,      # How many sentences to look at once
        num_train_epochs=5,                  # UPDATED: 5 epochs to give more time to study hard emotions
        warmup_steps=500,                    # NEW: Gentle warmup to prevent wild guessing early on
        weight_decay=0.01,
        save_strategy="epoch",               # Must match eval_strategy for load_best_model_at_end
        metric_for_best_model="f1_macro",    # NEW: Tell the AI to prioritize F1 over basic accuracy
        load_best_model_at_end=True,         # NEW: Automatically keep the smartest brain
        # overwrite_output_dir=True,
    )
    # --- NEW: CUSTOM TRAINER OVERRIDE ---
    class ImbalancedTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # Grab the actual answers
            labels = inputs.pop("labels")
            # Make a prediction
            outputs = model(**inputs)
            logits = outputs.logits
            # Calculate the error using our Custom Penalty Weights!
            loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss
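    # What the weighted loss does: plain cross-entropy is -log p(correct class);
    # passing `weight` multiplies each sample's loss by the weight of its true
    # class (and nn.CrossEntropyLoss re-normalizes the mean by those weights).
    # With the illustrative weights sketched above, misclassifying a 'Suicidal'
    # post costs roughly 16x more (6.67 vs 0.42) than misclassifying a 'Normal'
    # one, so the model can't minimize the loss by ignoring the rare classes.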
    # 7. Start Training!
    trainer = ImbalancedTrainer(  # UPDATED: Using the strict custom trainer
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,  # NEW: Attach the strict grader
    )
| print("🔥 Starting actual model training! (This might take a while depending on your computer)...") | |
| trainer.train() | |
| # 8. Save the final model | |
| final_model_dir = os.path.join(artifacts_dir, "final_mindguard_model") | |
| print(f"✅ Training complete. Saving the brain to {final_model_dir}...") | |
| trainer.save_model(final_model_dir) | |
| tokenizer.save_pretrained(final_model_dir) | |
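
# --- OPTIONAL: a minimal inference sketch (not part of the training pipeline) ---
# Assumes train_mindguard_model() has already run and saved its output to
# artifacts/xlmr_weights/final_mindguard_model; the function name below is
# illustrative only.
def predict_emotion(text, model_dir):
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_id = int(logits.argmax(-1))
    # id2label was baked into config.json at training time, so this maps
    # the raw class index back to the emotion name
    return model.config.id2label[predicted_id]
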
if __name__ == "__main__":
    train_mindguard_model()