Constrained-Track-Document-Bassline-Readability-Arabertv2-d3tok-reg / Constrained-Track-Document-Bassline-Readability-Arabertv2-d3tok-reg.py
FatimahEmadEldin's picture
Upload 2 files
cbefe5c verified
# -*- coding: utf-8 -*-
"""arabertv2-d3tok_Sentence_Constrained.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1vHngBRGkcXHJIwi317DS0AsZfsXguIc0
"""
# Authorize Colab to access your Google Drive.
# The project folder configured further down (PROJECT_DRIVE_PATH) is located
# under this mount point, so the mount has to happen before any path is used.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import zipfile
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
AutoTokenizer,
AutoModel,
TrainingArguments,
Trainer,
EarlyStoppingCallback
)
import gc
import ast # To safely evaluate string-formatted lists
import json # Added import for json
# --- Hugging Face Authentication ---
from huggingface_hub import login
from google.colab import userdata
# Log in to Hugging Face using the token stored in Colab secrets.
# Only the secret lookup is guarded: a failure of `login(token=...)` itself
# (e.g. an invalid token) should surface instead of silently falling back.
# NOTE(review): Colab reports a missing secret with
# `userdata.SecretNotFoundError` and a secret the notebook may not read with
# `userdata.NotebookAccessError`; they are listed explicitly because catching
# KeyError/NameError alone is not expected to cover them.
try:
    hf_token = userdata.get('HF_TOKEN')
except (
    NameError,
    KeyError,
    userdata.SecretNotFoundError,
    userdata.NotebookAccessError,
):
    print("⚠️ Hugging Face token not found in Colab secrets. Please add it as 'HF_TOKEN'.")
    # Fallback for local execution or if login() is preferred manually
    login()
else:
    login(token=hf_token)
    print("✔️ Successfully logged into Hugging Face.")
# --- Model & Training ---
# Hub identifier used for both the tokenizer and the transformer backbone.
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
# NOTE(review): not referenced anywhere else in the visible file.
NUM_LABELS = 1
# Number of readability levels; rounded predictions are clipped to
# [0, TARGET_CLASSES - 1] before scoring and before writing the submission.
TARGET_CLASSES = 19
# Length of the extra feature vector concatenated to the pooled transformer
# output in HybridRegressionModel.
NUM_FEATURES = 7
# --- IMPORTANT: Set the path to your project folder on Google Drive ---
PROJECT_DRIVE_PATH = '/content/drive/MyDrive/BAREC_Competition'
# --- File & Directory Paths (Now relative to your Google Drive) ---
BASE_DIR = PROJECT_DRIVE_PATH
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "lex")
# The checkpoint folder name embeds the last component of MODEL_NAME.
CHECKPOINT_DIR = os.path.join(BASE_DIR, "results", f"hybrid_constrained_samer_regression_v2_{MODEL_NAME.split('/')[-1]}")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")
# Ensure the output directories exist on your Google Drive
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)
# Paths to the preprocessed input files on Google Drive
TRAIN_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'train_processed_full.csv')
DEV_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'dev_processed_full.csv')
TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'test_processed_full.csv')
# --- Submission Paths on Google Drive ---
# The CSV is written first and then compressed into the ZIP next to it.
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_constrained_samer_regression.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_constrained_samer_regression.zip")
print(f"✔️ All paths configured to use Google Drive folder: {BASE_DIR}")
"""# 2. DATA LOADING FUNCTION"""
# =====================================================================================
# 2. DATA LOADING FUNCTION
# =====================================================================================
def load_preprocessed_data():
    """Read the preprocessed train and dev CSV files and prepare them for regression.

    The ``features`` column of each frame is parsed from its string form with
    ``ast.literal_eval``; the ``label`` column is cast to int, shifted down by
    one and stored as float.

    Returns:
        ``(train_df, val_df)`` on success, or ``(None, None)`` after printing
        an error message when a file is missing or any other exception occurs.
    """
    print("\n--- Loading Preprocessed Data from Google Drive ---")
    try:
        frames = [pd.read_csv(TRAIN_PROCESSED_PATH), pd.read_csv(DEV_PROCESSED_PATH)]

        print("Converting 'features' column from string to list...")
        for frame in frames:
            frame['features'] = frame['features'].apply(ast.literal_eval)

        # Convert labels for regression: 1-based levels become 0-based floats.
        for frame in frames:
            zero_based = frame['label'].astype(int) - 1
            frame['label'] = zero_based.astype(float)

        train_df, val_df = frames
        print(f"✔ Successfully loaded {len(train_df)} training and {len(val_df)} validation records.")
        return train_df, val_df
    except FileNotFoundError as e:
        print(f"❌ ERROR: Preprocessed file not found: {e}.")
        print("Please make sure your data is uploaded to the correct Google Drive folder.")
    except Exception as e:
        print(f"❌ ERROR during data loading: {e}")
    # Both failure paths report the problem above and hand back the sentinel pair.
    return None, None
"""# 3. MODEL, DATASET, AND METRICS DEFINITIONS"""
# =====================================================================================
# 3. MODEL, DATASET, AND METRICS DEFINITIONS
# =====================================================================================
class ReadabilityDataset(TorchDataset):
    """Torch dataset pairing tokenized text with a numeric feature vector.

    Each item is a dict holding the flattened ``input_ids`` and
    ``attention_mask`` produced by the tokenizer, the sample's
    ``extra_features`` as a float tensor and, when labels were supplied,
    ``labels`` as a float tensor.

    Args:
        texts: Sequence of input texts (each is passed through ``str``).
        features: Sequence of per-sample numeric feature lists.
        labels: Optional sequence of regression targets.
        tokenizer_obj: Callable tokenizer invoked once per sample.
        max_len: Length every sample is padded/truncated to.

    Raises:
        ValueError: If no tokenizer is given or the input sequences differ
            in length.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        # Fail at construction time instead of in the middle of an epoch.
        if tokenizer_obj is None:
            raise ValueError("tokenizer_obj is required to encode the texts.")
        if len(features) != len(texts):
            raise ValueError(
                f"texts and features differ in length: {len(texts)} vs {len(features)}."
            )
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} vs {len(labels)}."
            )
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        encoding = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        item = {
            # The tokenizer output is flattened so each sample is 1-D.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'extra_features': torch.tensor(self.features[idx], dtype=torch.float),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
class HybridRegressionModel(nn.Module):
    """Transformer encoder plus hand-crafted features feeding one linear regressor.

    The encoder's pooled output is concatenated with the extra feature vector
    and projected to a single value per sample.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        # The head consumes the pooled encoder output plus the extra features.
        head_inputs = self.transformer.config.hidden_size + num_extra_features
        self.regressor = nn.Linear(head_inputs, 1)
        # Attribute expected by the Hugging Face Trainer when it saves/loads
        # checkpoints; None means no keys are ignored on save.
        self._keys_to_ignore_on_save = None

    def forward(self, input_ids, attention_mask, extra_features, labels=None):
        """Return ``logits``, or ``(loss, logits)`` when ``labels`` is given.

        The loss is the mean squared error between the squeezed logits and
        the squeezed labels.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        fused = torch.cat((encoded.pooler_output, extra_features), dim=1)
        logits = self.regressor(fused)
        if labels is None:
            return logits
        loss = nn.functional.mse_loss(logits.squeeze(), labels.squeeze())
        return (loss, logits)
def compute_metrics(p):
    """Compute the Quadratic Weighted Kappa for regression predictions.

    Predictions are flattened, rounded to the nearest integer and clipped to
    the valid class range ``[0, TARGET_CLASSES - 1]``; labels are flattened
    and cast to integers.

    Args:
        p: ``(predictions, labels)`` pair as handed over by the Trainer.

    Returns:
        ``{'qwk': <float>}``.
    """
    predictions, labels = p
    # The regression head emits one column per sample; flatten so both
    # arrays are 1-D integer class ids.
    pred_classes = np.clip(
        np.round(np.asarray(predictions).reshape(-1)), 0, TARGET_CLASSES - 1
    ).astype(int)
    true_classes = np.round(np.asarray(labels).reshape(-1)).astype(int)
    # Passing the full label set keeps the quadratic weights tied to the real
    # ordinal distance between levels; without it the weights are derived from
    # the positions of whichever classes happen to occur in this batch.
    qwk = cohen_kappa_score(
        true_classes,
        pred_classes,
        labels=np.arange(TARGET_CLASSES),
        weights='quadratic',
    )
    return {'qwk': qwk}
"""# 4. & 5. MAIN EXECUTION FUNCTIONS"""
# =====================================================================================
# 4. & 5. MAIN EXECUTION FUNCTIONS
# =====================================================================================
def main_train():
    """Fine-tune the hybrid regression model, resuming from the newest checkpoint.

    Loads the tokenizer and the preprocessed train/dev frames, builds the
    datasets and the model, then trains with per-epoch evaluation, early
    stopping and best-model selection on ``qwk``. Returns early (after
    printing a message) when the data could not be loaded.
    """
    print("===== 🚀 STARTING HYBRID REGRESSION MODEL PIPELINE =====\n")
    print("Initializing Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_df, val_df = load_preprocessed_data()
    if train_df is None:
        print("\n! Aborting script due to data loading failure.")
        return

    print("\nCreating Torch Datasets...")
    train_dataset, val_dataset = (
        ReadabilityDataset(
            frame['d3tok_text'].tolist(),
            frame['features'].tolist(),
            frame['label'].tolist(),
            tokenizer,
        )
        for frame in (train_df, val_df)
    )
    print("✔ Datasets created.")

    print("\nInitializing Hybrid Regression Model...")
    model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)

    training_args = TrainingArguments(
        output_dir=CHECKPOINT_DIR,
        num_train_epochs=15,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="qwk",
        greater_is_better=True,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),
        report_to="none",
    )
    early_stopping = EarlyStoppingCallback(early_stopping_patience=4)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    print("\nStarting model training... Checkpoints will be saved to Google Drive.")
    latest_checkpoint = None
    if not os.path.exists(CHECKPOINT_DIR):
        print("Checkpoint directory not found. Starting training from scratch.")
    else:
        # Order "checkpoint-<step>" folders by their numeric step suffix.
        step_dirs = sorted(
            (name for name in os.listdir(CHECKPOINT_DIR) if name.startswith("checkpoint-")),
            key=lambda name: int(name.split('-')[-1]),
        )
        if not step_dirs:
            print("No checkpoints found to resume training from. Starting from scratch.")
        else:
            latest_checkpoint = os.path.join(CHECKPOINT_DIR, step_dirs[-1])
            print(f"Resuming training from latest checkpoint: {latest_checkpoint}")

    trainer.train(resume_from_checkpoint=latest_checkpoint)
    print("✔ Training finished.")

    # Drop the large objects before the prediction step allocates its own model.
    del model, trainer, train_dataset, val_dataset, train_df, val_df
    gc.collect()
    torch.cuda.empty_cache()
def _find_weights_file(checkpoint_path):
    """Return the weights file inside *checkpoint_path*, or None if there is none."""
    # The Trainer writes either format depending on its save_safetensors setting.
    for file_name in ("model.safetensors", "pytorch_model.bin"):
        candidate = os.path.join(checkpoint_path, file_name)
        if os.path.exists(candidate):
            return candidate
    return None


def _load_checkpoint_state_dict(weights_path):
    """Load a model state dict from a safetensors or torch-serialized file onto the CPU."""
    if weights_path.endswith(".safetensors"):
        from safetensors.torch import load_file
        return load_file(weights_path)
    # map_location keeps GPU-saved checkpoints loadable on CPU-only runtimes.
    return torch.load(weights_path, map_location="cpu")


def _select_best_checkpoint(checkpoint_dir):
    """Return ``(checkpoint_path, weights_path)`` of the checkpoint to predict with.

    The newest readable ``trainer_state.json`` names the best checkpoint in
    ``best_model_checkpoint``; if that is unavailable, the newest checkpoint
    that contains a weights file is used.

    Raises:
        FileNotFoundError: If the directory, the checkpoints or the weights
            files are missing.
    """
    if not os.path.exists(checkpoint_dir):
        raise FileNotFoundError(f"Checkpoint directory not found on Google Drive: {checkpoint_dir}.")
    checkpoints = [d for d in os.listdir(checkpoint_dir) if d.startswith("checkpoint-")]
    if not checkpoints:
        raise FileNotFoundError(f"No checkpoint found in the results directory on Google Drive: {checkpoint_dir}.")
    checkpoints.sort(key=lambda name: int(name.split('-')[-1]), reverse=True)

    for checkpoint in checkpoints:
        checkpoint_path = os.path.join(checkpoint_dir, checkpoint)
        trainer_state_path = os.path.join(checkpoint_path, "trainer_state.json")
        try:
            with open(trainer_state_path, 'r', encoding='utf-8') as f:
                trainer_state = json.load(f)
        except FileNotFoundError:
            continue
        except (OSError, ValueError) as e:
            print(f"Warning: Could not parse trainer_state.json in {checkpoint_path}: {e}")
            continue
        recorded = trainer_state.get('best_model_checkpoint')
        if recorded:
            # Re-anchor by folder name so a checkpoint directory that was moved
            # after training still resolves.
            candidate = os.path.join(checkpoint_dir, os.path.basename(os.path.normpath(recorded)))
            weights_path = _find_weights_file(candidate)
            if weights_path:
                print(f"Found best checkpoint (eval_qwk {trainer_state.get('best_metric')}) in {candidate}")
                return candidate, weights_path
        # `best_metric` is cumulative, so only the newest state is authoritative;
        # comparing it across checkpoints would always pick the newest one.
        break

    print("Could not find best checkpoint via trainer_state.json. Falling back to the latest checkpoint with a model file.")
    for checkpoint in checkpoints:
        checkpoint_path = os.path.join(checkpoint_dir, checkpoint)
        weights_path = _find_weights_file(checkpoint_path)
        if weights_path:
            print(f"Using latest valid checkpoint: {checkpoint_path}")
            return checkpoint_path, weights_path
    raise FileNotFoundError(
        f"No valid checkpoint with 'model.safetensors' or 'pytorch_model.bin' found in: {checkpoint_dir}."
    )


def main_predict():
    """Predict the test set with the best saved checkpoint and write the submission.

    Loads the tokenizer and the preprocessed test CSV, restores the model
    weights from the selected checkpoint, rounds and clips the regression
    outputs to ``[0, TARGET_CLASSES - 1]``, shifts them back to 1-based
    levels, and saves the result as a CSV plus a ZIP of that CSV. Errors are
    reported with a printed message rather than raised.
    """
    print("\n===== 🏆 GENERATING FINAL PREDICTIONS & SUBMISSION =====\n")
    try:
        print("Initializing Tokenizer for prediction...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        print("Loading preprocessed test data from Google Drive...")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)

        print("\nLoading the best trained model from Google Drive checkpoints...")
        best_checkpoint_path, weights_path = _select_best_checkpoint(CHECKPOINT_DIR)

        print(f"Loading model from: {best_checkpoint_path}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        model.load_state_dict(_load_checkpoint_state_dict(weights_path))

        # report_to="none" matches the training configuration and keeps the
        # prediction run from starting any logging integration.
        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir=CHECKPOINT_DIR,
                per_device_eval_batch_size=64,
                report_to="none",
            ),
        )

        print("Generating predictions on the test set...")
        # Note: No labels for the test dataset
        test_dataset = ReadabilityDataset(
            test_df['d3tok_text'].tolist(),
            test_df['features'].tolist(),
            tokenizer_obj=tokenizer,
        )
        predictions = trainer.predict(test_dataset)
        clipped_preds = np.clip(np.round(predictions.predictions.flatten()), 0, TARGET_CLASSES - 1)
        # Shift back from the 0-based training targets to 1-based levels.
        test_df['Prediction'] = (clipped_preds + 1).astype(int)
        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"Compressing into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        print(f"✔ Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created successfully in your Drive!")
    except FileNotFoundError as e:
        print(f"❌ ERROR: File not found: {e}. Ensure training was completed and checkpoints exist.")
    except Exception as e:
        print(f"❌ An error occurred during final prediction: {e}")
"""# Running script"""
# =====================================================================================
# 6. SCRIPT RUNNER
# =====================================================================================
# Start the training process
# NOTE: these calls are not behind a `__main__` guard, so they run whenever
# this file is executed or imported.
main_train()
# Once training is done, generate predictions
main_predict()
print("\n--- ✅ All Done! Check your Google Drive for results and submission files. ---")
# ---------------------------------------------------------------------------
# Configuration for the standalone prediction step defined below.
# BEST_CHECKPOINT_PATH is hard-coded to one specific checkpoint folder and
# must be edited by hand to use a different one.
# ---------------------------------------------------------------------------
BEST_CHECKPOINT_PATH = '/content/drive/MyDrive/BAREC_Competition/results/hybrid_constrained_samer_regression_v2_readability-arabertv2-d3tok-reg/checkpoint-42826'
# --- Input & Output Directories (derived from base paths) ---
PROCESSED_DATA_DIR = os.path.join(PROJECT_DRIVE_PATH, "lex")
SUBMISSION_DIR = os.path.join(PROJECT_DRIVE_PATH, "submission")
# Ensure the submission output directory exists
os.makedirs(SUBMISSION_DIR, exist_ok=True)
# --- File Paths ---
# SUBMISSION_PATH and ZIPPED_SUBMISSION_PATH are rebound here to different
# file names than the ones assigned in the first configuration section.
TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'test_processed_full.csv')
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_final_prediction.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission.zip")
print(f"✔️ Configuration loaded. Model will be loaded from: {BEST_CHECKPOINT_PATH}")
"""# 3. MODEL AND DATASET CLASS DEFINITIONS"""
# =====================================================================================
# 3. MODEL AND DATASET CLASS DEFINITIONS
# =====================================================================================
class ReadabilityDataset(TorchDataset):
    """Torch dataset pairing tokenized text with a numeric feature vector.

    Each item is a dict holding the flattened ``input_ids`` and
    ``attention_mask`` produced by the tokenizer, the sample's
    ``extra_features`` as a float tensor and, when labels were supplied,
    ``labels`` as a float tensor.

    Args:
        texts: Sequence of input texts (each is passed through ``str``).
        features: Sequence of per-sample numeric feature lists.
        labels: Optional sequence of regression targets.
        tokenizer_obj: Callable tokenizer invoked once per sample.
        max_len: Length every sample is padded/truncated to.

    Raises:
        ValueError: If no tokenizer is given or the input sequences differ
            in length.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        # Fail at construction time instead of in the middle of a prediction run.
        if tokenizer_obj is None:
            raise ValueError("tokenizer_obj is required to encode the texts.")
        if len(features) != len(texts):
            raise ValueError(
                f"texts and features differ in length: {len(texts)} vs {len(features)}."
            )
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} vs {len(labels)}."
            )
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        encoding = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        item = {
            # The tokenizer output is flattened so each sample is 1-D.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'extra_features': torch.tensor(self.features[idx], dtype=torch.float),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
class HybridRegressionModel(nn.Module):
    """Transformer encoder plus hand-crafted features feeding one linear regressor.

    The encoder's pooled output is concatenated with the extra feature vector
    and projected to a single value per sample.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        # The head consumes the pooled encoder output plus the extra features.
        head_inputs = self.transformer.config.hidden_size + num_extra_features
        self.regressor = nn.Linear(head_inputs, 1)

    def forward(self, input_ids, attention_mask, extra_features, labels=None):
        """Return ``logits``, or ``(loss, logits)`` when ``labels`` is given.

        The loss is the mean squared error between the squeezed logits and
        the squeezed labels.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        fused = torch.cat((encoded.pooler_output, extra_features), dim=1)
        logits = self.regressor(fused)
        if labels is None:
            return logits
        # Using mean squared error for regression
        loss = nn.functional.mse_loss(logits.squeeze(), labels.squeeze())
        return (loss, logits)
"""# 4. PREDICTION AND SUBMISSION SCRIPT"""
# =====================================================================================
# 4. PREDICTION AND SUBMISSION SCRIPT
# =====================================================================================
from safetensors.torch import load_file # Import load_file
def generate_predictions():
    """Predict the test set with the checkpoint at ``BEST_CHECKPOINT_PATH``.

    Loads ``model.safetensors`` from that checkpoint into a freshly built
    ``HybridRegressionModel``, runs the Trainer's prediction loop over the
    preprocessed test CSV, rounds and clips the outputs to
    ``[0, TARGET_CLASSES - 1]``, shifts them back to 1-based levels and
    writes the submission CSV plus a ZIP of it. Problems are reported with a
    printed message rather than raised; the function returns early when the
    weights file is missing.
    """
    print("\n===== 🏆 STARTING PREDICTION PIPELINE =====\n")
    try:
        # --- Validate Checkpoint Path ---
        model_weights_path = os.path.join(BEST_CHECKPOINT_PATH, 'model.safetensors')
        if not os.path.exists(model_weights_path):
            print(f"❌ ERROR: 'model.safetensors' not found at the specified path: {model_weights_path}")
            print("Please ensure the BEST_CHECKPOINT_PATH variable is set correctly to the directory containing the model weights.")
            return

        # --- Initialize Tokenizer ---
        print("1. Initializing tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        # --- Load Test Data ---
        print(f"2. Loading preprocessed test data from: {TEST_PROCESSED_PATH}")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)
        print(f"   Loaded {len(test_df)} test records.")

        # --- Load Model from Checkpoint ---
        print(f"3. Loading model from checkpoint: {BEST_CHECKPOINT_PATH}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        state_dict = load_file(model_weights_path)
        # A one-line summary replaces the per-key debug dump; a key mismatch is
        # still reported by load_state_dict, which is strict by default.
        print(f"   Checkpoint contains {len(state_dict)} tensors.")
        model.load_state_dict(state_dict)
        print("   Model weights loaded successfully from model.safetensors.")

        # --- Initialize Trainer ---
        # A minimal Trainer is sufficient for making predictions.
        # report_to="none" matches the training configuration and keeps the
        # prediction run from starting any logging integration.
        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir="./temp_results",
                per_device_eval_batch_size=64,
                report_to="none",
            ),
        )

        # --- Create Test Dataset ---
        print("4. Creating test dataset...")
        test_dataset = ReadabilityDataset(
            texts=test_df['d3tok_text'].tolist(),
            features=test_df['features'].tolist(),
            tokenizer_obj=tokenizer,
        )

        # --- Generate Predictions ---
        print("5. Generating predictions on the test set...")
        raw_predictions = trainer.predict(test_dataset)
        # Predictions are in the .predictions attribute; flatten them to a 1D array
        predictions_logits = raw_predictions.predictions.flatten()
        if len(predictions_logits) != len(test_df):
            raise ValueError(
                f"Expected {len(test_df)} predictions but got {len(predictions_logits)}."
            )

        # --- Process Predictions ---
        # Round to the nearest integer, clip to the valid label range [0, 18],
        # and convert back to original class labels [1, 19]
        clipped_preds = np.clip(np.round(predictions_logits), 0, TARGET_CLASSES - 1)
        test_df['Prediction'] = (clipped_preds + 1).astype(int)
        print("   Predictions generated and processed.")

        # --- Create and Save Submission File ---
        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]
        print(f"\n6. Saving final prediction CSV to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"7. Compressing submission file into: {ZIPPED_SUBMISSION_PATH}")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))

        print("\n--- ✅ All Done! ---")
        print(f"Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' has been saved to your Google Drive.")
        print(f"Location: {SUBMISSION_DIR}")
    except FileNotFoundError as e:
        print(f"❌ ERROR: A required file was not found: {e}")
        print("   Please ensure all paths in the configuration section are correct.")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
"""# 5. EXECUTE SCRIPT"""
# =====================================================================================
# 5. EXECUTE SCRIPT
# =====================================================================================
# Run the standalone prediction step only when this file is executed as a
# script (not when it is imported).
if __name__ == '__main__':
    generate_predictions()
"""# Results of Sentence-level Readability Assessment - Constrained on The Blind Test
{'accuracy': 42.1, 'accuracy+-1': 71.6, 'avg_abs_dist': 1.2, 'qwk': 82.1, 'accuracy_7': 59.9, 'accuracy_5': 65.4, 'accuracy_3': 73.4}
"""