# -*- coding: utf-8 -*-
"""arabertv2-d3tok_Sentence_Constrained.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1vHngBRGkcXHJIwi317DS0AsZfsXguIc0
"""

import ast  # To safely evaluate string-formatted lists
import gc
import json  # Added import for json
import os
import zipfile

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

# Authorize Colab to access your Google Drive
drive.mount('/content/drive')

# --- Hugging Face Authentication ---
# Log in to Hugging Face using the token stored in Colab secrets.
# userdata.get() reports a missing secret (or a notebook that has not been
# granted access to it) with its own exception types rather than KeyError, so
# those are caught as well; otherwise the interactive fallback below could
# never run for the situation it was written for.
try:
    login(token=userdata.get('HF_TOKEN'))
    print("✔️ Successfully logged into Hugging Face.")
except (
    NameError,
    KeyError,
    userdata.SecretNotFoundError,
    userdata.NotebookAccessError,
):
    print("⚠️ Hugging Face token not found in Colab secrets. Please add it as 'HF_TOKEN'.")
    # Fallback for local execution or if login() is preferred manually
    login()

# --- Model & Training ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
NUM_FEATURES = 7

# --- IMPORTANT: Set the path to your project folder on Google Drive ---
PROJECT_DRIVE_PATH = '/content/drive/MyDrive/BAREC_Competition'

# --- File & Directory Paths (Now relative to your Google Drive) ---
BASE_DIR = PROJECT_DRIVE_PATH
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "lex")
CHECKPOINT_DIR = os.path.join(
    BASE_DIR,
    "results",
    f"hybrid_constrained_samer_regression_v2_{MODEL_NAME.split('/')[-1]}",
)
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")

# Ensure the output directories exist on your Google Drive
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# Paths to the preprocessed input files on Google Drive
TRAIN_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'train_processed_full.csv')
DEV_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'dev_processed_full.csv')
TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'test_processed_full.csv')

# --- Submission Paths on Google Drive ---
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_constrained_samer_regression.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_constrained_samer_regression.zip")

print(f"✔️ All paths configured to use Google Drive folder: {BASE_DIR}")

"""# 2. DATA LOADING FUNCTION"""
# =====================================================================================
# 2. DATA LOADING FUNCTION
# =====================================================================================
def load_preprocessed_data():
    """Load the pre-processed train and dev CSV files from Google Drive.

    The ``features`` column is stored as a stringified Python list and is parsed
    with ``ast.literal_eval``. The ``label`` column is shifted from 1-based to
    0-based and cast to float so it can be used as a regression target.

    Returns:
        ``(train_df, val_df)`` on success, or ``(None, None)`` when a file is
        missing or any other error occurs while loading (the error is printed).
    """
    print("\n--- Loading Preprocessed Data from Google Drive ---")
    try:
        train_df = pd.read_csv(TRAIN_PROCESSED_PATH)
        val_df = pd.read_csv(DEV_PROCESSED_PATH)

        print("Converting 'features' column from string to list...")
        for frame in (train_df, val_df):
            frame['features'] = frame['features'].apply(ast.literal_eval)
            # Convert labels for regression: 1-based ints -> 0-based floats.
            frame['label'] = (frame['label'].astype(int) - 1).astype(float)

        print(f"✔ Successfully loaded {len(train_df)} training and {len(val_df)} validation records.")
        return train_df, val_df
    except FileNotFoundError as e:
        print(f"❌ ERROR: Preprocessed file not found: {e}.")
        print("Please make sure your data is uploaded to the correct Google Drive folder.")
        return None, None
    except Exception as e:
        # Deliberate best-effort for notebook use: report and let the caller abort.
        print(f"❌ ERROR during data loading: {e}")
        return None, None


"""# 3. MODEL, DATASET, AND METRICS DEFINITIONS"""

# =====================================================================================
# 3. MODEL, DATASET, AND METRICS DEFINITIONS
# =====================================================================================
class ReadabilityDataset(TorchDataset):
    """Custom PyTorch Dataset for readability prediction.

    It takes text, extra numerical features, and optional labels.

    Args:
        texts: Indexable collection of input texts (each item is passed through ``str``).
        features: Indexable collection of numeric feature vectors, one per text.
        labels: Optional indexable collection of numeric targets; when ``None``
            the produced items carry no ``labels`` entry.
        tokenizer_obj: Tokenizer used to encode each text.
        max_len: Length every encoded text is padded/truncated to.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the model inputs (and label, if available) for item ``idx``."""
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated ``encode_plus``; the arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # ``return_tensors='pt'`` adds a leading batch axis; flatten it away so
        # the default collator can stack items itself.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'extra_features': feature_vec,
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


class HybridRegressionModel(nn.Module):
    """Hybrid model combining a transformer base with additional numerical features.

    The transformer's pooled output is concatenated with the extra features and
    fed to a single linear layer that outputs one regression value per example.

    Args:
        model_name: Name or path handed to ``AutoModel.from_pretrained``.
        num_extra_features: Width of the extra feature vector per example.
    """

    def __init__(self, model_name, num_extra_features):
        super(HybridRegressionModel, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        # The regression head takes the transformer's output + extra features
        self.regressor = nn.Linear(self.transformer.config.hidden_size + num_extra_features, 1)
        # Add the attribute expected by Hugging Face Trainer for checkpoint loading.
        # Initialize it to None if no keys should be ignored during saving.
        self._keys_to_ignore_on_save = None

    def forward(self, input_ids, attention_mask, extra_features, labels=None):
        """Run the model.

        Returns:
            ``(loss, logits)`` when ``labels`` is given (MSE loss), otherwise
            just ``logits``.
        """
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE(review): relies on the backbone exposing ``pooler_output``.
        pooler_output = outputs.pooler_output

        # Concatenate transformer output with extra features
        combined_features = torch.cat((pooler_output, extra_features), dim=1)
        logits = self.regressor(combined_features)

        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            # Flatten both sides to 1-D explicitly so the shapes always match,
            # including for a final batch that holds a single example.
            loss = loss_fct(logits.view(-1), labels.view(-1))

        return (loss, logits) if loss is not None else logits


def compute_metrics(p):
    """Compute the Quadratic Weighted Kappa score for regression predictions.

    Predictions are rounded and clipped to valid class labels
    ``[0, TARGET_CLASSES - 1]`` before scoring.

    Args:
        p: Pair ``(predictions, labels)`` as supplied by the Trainer.

    Returns:
        ``{'qwk': <float>}``.
    """
    predictions, labels = p
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # The regression head emits shape (N, 1); score on flat integer vectors.
    flat_preds = np.asarray(predictions).reshape(-1)
    clipped_preds = np.clip(np.round(flat_preds), 0, TARGET_CLASSES - 1).astype(int)
    gold = np.rint(np.asarray(labels).reshape(-1)).astype(int)

    # Pass the full label set: quadratic weights are built from label *positions*,
    # so a level absent from both vectors would otherwise shrink the distances.
    qwk = cohen_kappa_score(
        gold,
        clipped_preds,
        labels=list(range(TARGET_CLASSES)),
        weights='quadratic',
    )
    return {'qwk': qwk}


"""# 4. & 5. MAIN EXECUTION FUNCTIONS"""

# =====================================================================================
# 4. & 5. MAIN EXECUTION FUNCTIONS
# =====================================================================================
# Weight file names a checkpoint may contain, in order of preference.
_CHECKPOINT_WEIGHT_FILES = ("model.safetensors", "pytorch_model.bin")


def _list_checkpoints(checkpoint_root):
    """Return ``checkpoint-<step>`` entry names in *checkpoint_root*, oldest first."""
    names = [
        name
        for name in os.listdir(checkpoint_root)
        # Ignore entries whose suffix is not a step number instead of crashing in int().
        if name.startswith("checkpoint-") and name.rsplit('-', 1)[-1].isdecimal()
    ]
    return sorted(names, key=lambda name: int(name.rsplit('-', 1)[-1]))


def _find_weights_file(checkpoint_path):
    """Return the path of the weights file inside *checkpoint_path*, or ``None``."""
    for file_name in _CHECKPOINT_WEIGHT_FILES:
        candidate = os.path.join(checkpoint_path, file_name)
        if os.path.exists(candidate):
            return candidate
    return None


def _load_checkpoint_weights(weights_path):
    """Load a state dict from a ``.safetensors`` or ``.bin`` weights file."""
    if weights_path.endswith(".safetensors"):
        # Imported here because this helper runs before the module-level
        # safetensors import further down the script is executed.
        from safetensors.torch import load_file
        return load_file(weights_path)
    # map_location keeps loading possible on a runtime without a GPU.
    return torch.load(weights_path, map_location="cpu")


def _select_best_checkpoint(checkpoint_root, checkpoint_names):
    """Return the checkpoint recorded as best in ``trainer_state.json``, or ``None``.

    State files are inspected newest first. Only the directory name of the
    recorded ``best_model_checkpoint`` is used and it is re-anchored under
    *checkpoint_root*, so a different mount point does not break the lookup.
    """
    for name in reversed(checkpoint_names):
        checkpoint_path = os.path.join(checkpoint_root, name)
        try:
            with open(os.path.join(checkpoint_path, "trainer_state.json"), 'r', encoding="utf-8") as f:
                trainer_state = json.load(f)
        except FileNotFoundError:
            continue
        except (OSError, ValueError) as e:
            print(f"Warning: Could not parse trainer_state.json in {checkpoint_path}: {e}")
            continue

        if not isinstance(trainer_state, dict):
            continue
        recorded = trainer_state.get('best_model_checkpoint')
        if not recorded:
            continue

        candidate = os.path.join(checkpoint_root, os.path.basename(os.path.normpath(recorded)))
        if _find_weights_file(candidate) is not None:
            print(f"Found best eval_qwk {trainer_state.get('best_metric')} in {candidate}")
            return candidate
    return None


def main_train():
    """Train the hybrid regression model, resuming from the latest checkpoint if any.

    Loads the preprocessed data, builds the datasets and model, trains with
    per-epoch evaluation/saving and early stopping on QWK, then frees the
    training objects. Returns ``None``; aborts early if data loading failed.
    """
    print("===== 🚀 STARTING HYBRID REGRESSION MODEL PIPELINE =====\n")
    print("Initializing Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_df, val_df = load_preprocessed_data()
    if train_df is None:
        print("\n! Aborting script due to data loading failure.")
        return

    print("\nCreating Torch Datasets...")
    train_dataset = ReadabilityDataset(
        train_df['d3tok_text'].tolist(),
        train_df['features'].tolist(),
        train_df['label'].tolist(),
        tokenizer,
    )
    val_dataset = ReadabilityDataset(
        val_df['d3tok_text'].tolist(),
        val_df['features'].tolist(),
        val_df['label'].tolist(),
        tokenizer,
    )
    print("✔ Datasets created.")

    print("\nInitializing Hybrid Regression Model...")
    model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)

    training_args = TrainingArguments(
        output_dir=CHECKPOINT_DIR,
        num_train_epochs=15,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="qwk",
        greater_is_better=True,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
    )

    print("\nStarting model training... Checkpoints will be saved to Google Drive.")
    latest_checkpoint = None
    if os.path.exists(CHECKPOINT_DIR):
        checkpoints = _list_checkpoints(CHECKPOINT_DIR)
        if checkpoints:
            latest_checkpoint = os.path.join(CHECKPOINT_DIR, checkpoints[-1])
            print(f"Resuming training from latest checkpoint: {latest_checkpoint}")
        else:
            print("No checkpoints found to resume training from. Starting from scratch.")
    else:
        print("Checkpoint directory not found. Starting training from scratch.")

    trainer.train(resume_from_checkpoint=latest_checkpoint)
    print("✔ Training finished.")

    # Release training objects before the prediction step builds its own model.
    del model, trainer, train_dataset, val_dataset, train_df, val_df
    gc.collect()
    torch.cuda.empty_cache()


def main_predict():
    """Predict on the test set with the best saved checkpoint and write the submission.

    The best checkpoint is the one recorded as ``best_model_checkpoint`` in the
    newest readable ``trainer_state.json``; if that cannot be resolved, the
    newest checkpoint that contains a weights file is used. Both
    ``model.safetensors`` and ``pytorch_model.bin`` are accepted. Errors are
    printed rather than raised. Returns ``None``.
    """
    print("\n===== 🏆 GENERATING FINAL PREDICTIONS & SUBMISSION =====\n")
    try:
        print("Initializing Tokenizer for prediction...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        print("Loading preprocessed test data from Google Drive...")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)

        print("\nLoading the best trained model from Google Drive checkpoints...")
        if not os.path.exists(CHECKPOINT_DIR):
            raise FileNotFoundError(f"Checkpoint directory not found on Google Drive: {CHECKPOINT_DIR}.")

        checkpoints = _list_checkpoints(CHECKPOINT_DIR)
        if not checkpoints:
            raise FileNotFoundError(f"No checkpoint found in the results directory on Google Drive: {CHECKPOINT_DIR}.")

        best_checkpoint_path = _select_best_checkpoint(CHECKPOINT_DIR, checkpoints)

        if not best_checkpoint_path:
            print("Could not find best checkpoint via trainer_state.json. Falling back to the latest checkpoint with a model file.")
            for checkpoint in reversed(checkpoints):
                checkpoint_path = os.path.join(CHECKPOINT_DIR, checkpoint)
                if _find_weights_file(checkpoint_path) is not None:
                    best_checkpoint_path = checkpoint_path
                    print(f"Using latest valid checkpoint: {best_checkpoint_path}")
                    break

        if not best_checkpoint_path:
            expected = " or ".join(f"'{name}'" for name in _CHECKPOINT_WEIGHT_FILES)
            raise FileNotFoundError(f"No valid checkpoint with {expected} found in: {CHECKPOINT_DIR}.")

        print(f"Loading model from: {best_checkpoint_path}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        model.load_state_dict(_load_checkpoint_weights(_find_weights_file(best_checkpoint_path)))

        trainer = Trainer(model=model, args=TrainingArguments(output_dir=CHECKPOINT_DIR))

        print("Generating predictions on the test set...")
        # Note: No labels for the test dataset
        test_dataset = ReadabilityDataset(
            test_df['d3tok_text'].tolist(),
            test_df['features'].tolist(),
            tokenizer_obj=tokenizer,
        )
        predictions = trainer.predict(test_dataset)

        # Round/clip to [0, TARGET_CLASSES - 1], then shift back to 1-based labels.
        clipped_preds = np.clip(np.round(predictions.predictions.flatten()), 0, TARGET_CLASSES - 1)
        test_df['Prediction'] = (clipped_preds + 1).astype(int)
        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"Compressing into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))

        print(f"✔ Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created successfully in your Drive!")

    except FileNotFoundError as e:
        print(f"❌ ERROR: File not found: {e}. Ensure training was completed and checkpoints exist.")
    except Exception as e:
        # Deliberate best-effort for notebook use: report instead of crashing the cell.
        print(f"❌ An error occurred during final prediction: {e}")


"""# Running script"""

# =====================================================================================
# 6. SCRIPT RUNNER
# =====================================================================================
# Start the training process
main_train()

# Once training is done, generate predictions
main_predict()

print("\n--- ✅ All Done! Check your Google Drive for results and submission files. ---")

BEST_CHECKPOINT_PATH = '/content/drive/MyDrive/BAREC_Competition/results/hybrid_constrained_samer_regression_v2_readability-arabertv2-d3tok-reg/checkpoint-42826'

# --- Input & Output Directories (derived from base paths) ---
PROCESSED_DATA_DIR = os.path.join(PROJECT_DRIVE_PATH, "lex")
SUBMISSION_DIR = os.path.join(PROJECT_DRIVE_PATH, "submission")

# Ensure the submission output directory exists
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File Paths ---
# These rebind the submission paths used by the stand-alone prediction section.
TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'test_processed_full.csv')
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_final_prediction.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission.zip")

print(f"✔️ Configuration loaded. Model will be loaded from: {BEST_CHECKPOINT_PATH}")

"""# 3. MODEL AND DATASET CLASS DEFINITIONS"""

# =====================================================================================
# 3. MODEL AND DATASET CLASS DEFINITIONS
# =====================================================================================
# ReadabilityDataset and HybridRegressionModel are defined once, above. The
# stand-alone prediction section reuses those definitions instead of
# re-declaring identical copies that would have to be kept in sync by hand.

"""# 4. PREDICTION AND SUBMISSION SCRIPT"""
# =====================================================================================
# 4. PREDICTION AND SUBMISSION SCRIPT
# =====================================================================================
from safetensors.torch import load_file  # Import load_file


def generate_predictions():
    """Run test-set inference from ``BEST_CHECKPOINT_PATH`` and write the submission.

    Steps: check that ``model.safetensors`` exists in the checkpoint directory,
    load the tokenizer and the preprocessed test CSV, rebuild the model and load
    its weights, predict, map the raw outputs to class labels, then write the
    CSV and a zip archive containing it. Progress and errors are printed; the
    function returns ``None`` in every case.
    """
    print("\n===== 🏆 STARTING PREDICTION PIPELINE =====\n")

    try:
        # --- Validate Checkpoint Path ---
        model_weights_path = os.path.join(BEST_CHECKPOINT_PATH, 'model.safetensors')
        weights_present = os.path.exists(model_weights_path)
        if not weights_present:
            print(f"❌ ERROR: 'model.safetensors' not found at the specified path: {model_weights_path}")
            print("Please ensure the BEST_CHECKPOINT_PATH variable is set correctly to the directory containing the model weights.")
            return

        # --- Initialize Tokenizer ---
        print("1. Initializing tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        # --- Load Test Data ---
        print(f"2. Loading preprocessed test data from: {TEST_PROCESSED_PATH}")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        # The features column holds stringified lists; turn them back into lists.
        parsed_features = test_df['features'].apply(ast.literal_eval)
        test_df['features'] = parsed_features
        print(f" Loaded {len(test_df)} test records.")

        # --- Load Model from Checkpoint ---
        print(f"3. Loading model from checkpoint: {BEST_CHECKPOINT_PATH}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        weights = load_file(model_weights_path)

        # --- Inspect the state dictionary keys ---
        print("\n--- State Dictionary Keys ---")
        for tensor_name in weights:
            print(tensor_name)
        print("-----------------------------\n")
        # --- End Inspection ---

        model.load_state_dict(weights)
        print(" Model weights loaded successfully from model.safetensors.")

        # --- Initialize Trainer ---
        # A minimal Trainer is sufficient for making predictions
        minimal_args = TrainingArguments(output_dir="./temp_results")
        trainer = Trainer(model=model, args=minimal_args)

        # --- Create Test Dataset ---
        print("4. Creating test dataset...")
        test_texts = test_df['d3tok_text'].tolist()
        test_features = test_df['features'].tolist()
        test_dataset = ReadabilityDataset(
            texts=test_texts,
            features=test_features,
            tokenizer_obj=tokenizer,
        )

        # --- Generate Predictions ---
        print("5. Generating predictions on the test set...")
        prediction_output = trainer.predict(test_dataset)
        # The raw outputs live in .predictions; flatten them to a 1D array
        raw_scores = prediction_output.predictions.flatten()

        # --- Process Predictions ---
        # Nearest integer, limited to the valid range [0, TARGET_CLASSES - 1],
        # then shifted by one to get the original 1-based class labels.
        rounded_scores = np.round(raw_scores)
        zero_based = np.clip(rounded_scores, 0, TARGET_CLASSES - 1)
        test_df['Prediction'] = (zero_based + 1).astype(int)
        print(" Predictions generated and processed.")

        # --- Create and Save Submission File ---
        renamed = test_df.rename(columns={'ID': 'id'})
        submission_df = renamed[['id', 'Prediction']]

        print(f"\n6. Saving final prediction CSV to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"7. Compressing submission file into: {ZIPPED_SUBMISSION_PATH}")
        archive_member = os.path.basename(SUBMISSION_PATH)
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as archive:
            archive.write(SUBMISSION_PATH, arcname=archive_member)

        print("\n--- ✅ All Done! ---")
        print(f"Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' has been saved to your Google Drive.")
        print(f"Location: {SUBMISSION_DIR}")

    except FileNotFoundError as e:
        print(f"❌ ERROR: A required file was not found: {e}")
        print(" Please ensure all paths in the configuration section are correct.")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")


"""# 5. EXECUTE SCRIPT"""
# =====================================================================================
# 5. EXECUTE SCRIPT
# =====================================================================================
# Only run the stand-alone prediction pipeline when executed as a script.
if __name__ == "__main__":
    generate_predictions()

"""# Results of Sentence-level Readability Assessment - Constrained on The Blind Test

{'accuracy': 42.1, 'accuracy+-1': 71.6, 'avg_abs_dist': 1.2, 'qwk': 82.1, 'accuracy_7': 59.9, 'accuracy_5': 65.4, 'accuracy_3': 73.4}
"""