Upload 2 files
Browse files
Constrained-Track-Document-Bassline-Readability-Arabertv2-d3tok-reg.py
ADDED
|
@@ -0,0 +1,563 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""arabertv2-d3tok_Sentence_Constrained.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colab.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1vHngBRGkcXHJIwi317DS0AsZfsXguIc0
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
# Authorize Colab to access your Google Drive
|
| 11 |
+
from google.colab import drive
|
| 12 |
+
drive.mount('/content/drive')
|
| 13 |
+
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import numpy as np
|
| 16 |
+
import os
|
| 17 |
+
import torch
|
| 18 |
+
import torch.nn as nn
|
| 19 |
+
import zipfile
|
| 20 |
+
from sklearn.metrics import cohen_kappa_score
|
| 21 |
+
from torch.utils.data import Dataset as TorchDataset
|
| 22 |
+
from transformers import (
|
| 23 |
+
AutoTokenizer,
|
| 24 |
+
AutoModel,
|
| 25 |
+
TrainingArguments,
|
| 26 |
+
Trainer,
|
| 27 |
+
EarlyStoppingCallback
|
| 28 |
+
)
|
| 29 |
+
import gc
|
| 30 |
+
import ast # To safely evaluate string-formatted lists
|
| 31 |
+
import json # Added import for json
|
| 32 |
+
|
| 33 |
+
# --- Hugging Face Authentication ---
|
| 34 |
+
from huggingface_hub import login
|
| 35 |
+
from google.colab import userdata
|
| 36 |
+
|
| 37 |
+
# Log in to Hugging Face using the token stored in Colab secrets.
# `userdata.get` reports a missing or inaccessible secret with its own
# exception classes (e.g. SecretNotFoundError) rather than KeyError/NameError,
# so the lookup is guarded broadly here: any failure to read the secret falls
# back to the interactive prompt instead of aborting the whole script.
try:
    _hf_token = userdata.get('HF_TOKEN')
except Exception as exc:  # top-level boundary: every lookup failure -> manual login
    _hf_token = None
    print("⚠️ Hugging Face token not found in Colab secrets. Please add it as 'HF_TOKEN'.")
    print(f"   Reason: {type(exc).__name__}: {exc}")

if _hf_token:
    login(token=_hf_token)
    print("✔️ Successfully logged into Hugging Face.")
else:
    # Fallback for local execution or if login() is preferred manually
    login()
|
| 45 |
+
|
| 46 |
+
# --- Model & training constants ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"

NUM_LABELS = 1
TARGET_CLASSES = 19  # predictions are clipped to [0, TARGET_CLASSES - 1]
NUM_FEATURES = 7     # passed to the model as the number of extra features

# --- IMPORTANT: point this at your project folder on Google Drive ---
PROJECT_DRIVE_PATH = '/content/drive/MyDrive/BAREC_Competition'

# --- Directories (all derived from the Google Drive project folder) ---
BASE_DIR = PROJECT_DRIVE_PATH
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "lex")
_checkpoint_folder = f"hybrid_constrained_samer_regression_v2_{MODEL_NAME.split('/')[-1]}"
CHECKPOINT_DIR = os.path.join(BASE_DIR, "results", _checkpoint_folder)
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")

# Create the output directories on Google Drive if they are missing.
for _output_dir in (CHECKPOINT_DIR, SUBMISSION_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# --- Preprocessed input files ---
TRAIN_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'train_processed_full.csv')
DEV_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'dev_processed_full.csv')
TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'test_processed_full.csv')

# --- Submission outputs ---
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_constrained_samer_regression.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_constrained_samer_regression.zip")

print(f"✔️ All paths configured to use Google Drive folder: {BASE_DIR}")
|
| 76 |
+
|
| 77 |
+
"""# 2. DATA LOADING FUNCTION"""
|
| 78 |
+
|
| 79 |
+
# =====================================================================================
|
| 80 |
+
# 2. DATA LOADING FUNCTION
|
| 81 |
+
# =====================================================================================
|
| 82 |
+
|
| 83 |
+
def load_preprocessed_data():
    """Read the preprocessed train and dev CSV files from Google Drive.

    The ``features`` column of each frame is parsed from its string form with
    ``ast.literal_eval``; the ``label`` column is cast to int, shifted down by
    one and stored as float so it can serve as a regression target.

    Returns:
        ``(train_df, val_df)`` on success. ``(None, None)`` after printing an
        error message when a file is missing or any other exception occurs.
    """
    print("\n--- Loading Preprocessed Data from Google Drive ---")
    try:
        frames = (pd.read_csv(TRAIN_PROCESSED_PATH), pd.read_csv(DEV_PROCESSED_PATH))

        print("Converting 'features' column from string to list...")
        for frame in frames:
            frame['features'] = frame['features'].apply(ast.literal_eval)

        # Regression targets: integer labels shifted by -1, stored as floats.
        for frame in frames:
            frame['label'] = (frame['label'].astype(int) - 1).astype(float)

        train_df, val_df = frames
        print(f"✔ Successfully loaded {len(train_df)} training and {len(val_df)} validation records.")
        return train_df, val_df
    except FileNotFoundError as missing:
        print(f"❌ ERROR: Preprocessed file not found: {missing}.")
        print("Please make sure your data is uploaded to the correct Google Drive folder.")
        return None, None
    except Exception as failure:
        print(f"❌ ERROR during data loading: {failure}")
        return None, None
|
| 107 |
+
|
| 108 |
+
"""# 3. MODEL, DATASET, AND METRICS DEFINITIONS"""
|
| 109 |
+
|
| 110 |
+
# =====================================================================================
|
| 111 |
+
# 3. MODEL, DATASET, AND METRICS DEFINITIONS
|
| 112 |
+
# =====================================================================================
|
| 113 |
+
|
| 114 |
+
class ReadabilityDataset(TorchDataset):
    """Torch dataset pairing tokenized text with a numeric feature vector.

    Every item is a dict holding ``input_ids``, ``attention_mask`` and
    ``extra_features``; a ``labels`` entry is added only when labels were
    supplied at construction time.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts, self.features, self.labels = texts, features, labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        # The number of samples is defined by the text list.
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        extras = torch.tensor(self.features[idx], dtype=torch.float)

        # Pad/truncate every sample to exactly ``max_len`` tokens and ask for
        # PyTorch tensors back.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        sample = {
            # flatten() turns the tokenizer's returned tensors into 1-D tensors.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'extra_features': extras,
        }
        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample
|
| 154 |
+
|
| 155 |
+
class HybridRegressionModel(nn.Module):
    """Transformer encoder plus extra numeric features feeding one linear regressor.

    The encoder's pooled output is concatenated with ``extra_features`` and
    mapped to a single value per sample.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        # Regression head input = encoder hidden size + number of extra features.
        head_width = self.transformer.config.hidden_size + num_extra_features
        self.regressor = nn.Linear(head_width, 1)

        # Attribute expected by the Hugging Face Trainer around checkpoint
        # handling; None (or an empty list) means no keys are ignored on save.
        self._keys_to_ignore_on_save = None

    def forward(self, input_ids, attention_mask, extra_features, labels=None):
        """Run the encoder and regression head.

        Returns:
            ``logits`` alone when ``labels`` is None, otherwise the tuple
            ``(loss, logits)`` where ``loss`` is the mean squared error
            between the squeezed logits and the squeezed labels.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # Join the pooled encoder output with the extra features along the feature axis.
        fused = torch.cat((encoded.pooler_output, extra_features), dim=1)
        logits = self.regressor(fused)

        if labels is None:
            # NOTE(review): a bare tensor (not a tuple) is returned here; confirm
            # the caller handles that form for single-sample batches.
            return logits

        loss = nn.MSELoss()(logits.squeeze(), labels.squeeze())
        return (loss, logits)
|
| 185 |
+
|
| 186 |
+
def compute_metrics(p):
    """Score regression outputs with Quadratic Weighted Kappa.

    Args:
        p: a ``(predictions, labels)`` pair.

    Returns:
        ``{'qwk': <score>}`` computed after rounding the predictions to the
        nearest integer and clipping them to ``[0, TARGET_CLASSES - 1]``.
    """
    raw_scores, gold = p
    highest_class = TARGET_CLASSES - 1
    # Round first, then clip, so every prediction is a valid class label.
    bounded = np.clip(np.round(raw_scores), 0, highest_class)
    return {'qwk': cohen_kappa_score(gold, bounded, weights='quadratic')}
|
| 196 |
+
|
| 197 |
+
"""# 4. & 5. MAIN EXECUTION FUNCTIONS"""
|
| 198 |
+
|
| 199 |
+
# =====================================================================================
|
| 200 |
+
# 4. & 5. MAIN EXECUTION FUNCTIONS
|
| 201 |
+
# =====================================================================================
|
| 202 |
+
|
| 203 |
+
def main_train():
    """Fine-tune the hybrid regression model, resuming from the newest checkpoint.

    Loads the tokenizer and the preprocessed train/dev data, builds the
    datasets and model, and runs the Hugging Face ``Trainer`` with per-epoch
    evaluation, per-epoch checkpointing and early stopping on ``qwk``. If
    ``CHECKPOINT_DIR`` already holds ``checkpoint-<step>`` folders, training
    resumes from the one with the highest step. Returns ``None``; returns
    early (after printing) when the data cannot be loaded.
    """
    print("===== 🚀 STARTING HYBRID REGRESSION MODEL PIPELINE =====\n")

    print("Initializing Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_df, val_df = load_preprocessed_data()
    if train_df is None:
        print("\n! Aborting script due to data loading failure.")
        return

    print("\nCreating Torch Datasets...")
    train_dataset, val_dataset = (
        ReadabilityDataset(
            frame['d3tok_text'].tolist(),
            frame['features'].tolist(),
            frame['label'].tolist(),
            tokenizer,
        )
        for frame in (train_df, val_df)
    )
    print("✔ Datasets created.")

    print("\nInitializing Hybrid Regression Model...")
    model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)

    training_config = dict(
        output_dir=CHECKPOINT_DIR,
        num_train_epochs=15,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="qwk",
        greater_is_better=True,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=TrainingArguments(**training_config),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
    )

    print("\nStarting model training... Checkpoints will be saved to Google Drive.")

    # Look for an earlier run to resume: the checkpoint folder with the
    # largest trailing step number wins.
    latest_checkpoint = None
    if not os.path.exists(CHECKPOINT_DIR):
        print("Checkpoint directory not found. Starting training from scratch.")
    else:
        found = sorted(
            (name for name in os.listdir(CHECKPOINT_DIR) if name.startswith("checkpoint-")),
            key=lambda name: int(name.split('-')[-1]),
        )
        if not found:
            print("No checkpoints found to resume training from. Starting from scratch.")
        else:
            latest_checkpoint = os.path.join(CHECKPOINT_DIR, found[-1])
            print(f"Resuming training from latest checkpoint: {latest_checkpoint}")

    trainer.train(resume_from_checkpoint=latest_checkpoint)
    print("✔ Training finished.")

    # Drop the large objects and release cached GPU memory before prediction.
    del model, trainer, train_dataset, val_dataset, train_df, val_df
    gc.collect()
    torch.cuda.empty_cache()
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
# Weight files a Trainer checkpoint may contain, in order of preference.
# The standalone prediction section of this file loads 'model.safetensors'
# from the same checkpoint directory, so both formats must be accepted.
_CHECKPOINT_WEIGHT_FILES = ("model.safetensors", "pytorch_model.bin")


def _checkpoint_step(dir_name):
    """Return the step number of a ``checkpoint-<step>`` folder name, or -1 if malformed."""
    step = dir_name.rsplit('-', 1)[-1]
    return int(step) if step.isdigit() else -1


def _find_weights_file(checkpoint_path):
    """Return the path of the first known weights file in *checkpoint_path*, or None."""
    for file_name in _CHECKPOINT_WEIGHT_FILES:
        candidate = os.path.join(checkpoint_path, file_name)
        if os.path.exists(candidate):
            return candidate
    return None


def _select_checkpoint(checkpoint_dir):
    """Choose the checkpoint to predict with and locate its weights file.

    The newest readable ``trainer_state.json`` is asked for its
    ``best_model_checkpoint`` entry. ``best_metric`` is a running best that is
    copied into every later checkpoint, so comparing it across checkpoints
    cannot identify the best one. If no usable entry exists, the newest
    checkpoint that contains a weights file is used instead.

    Returns:
        ``(checkpoint_path, weights_path)``.

    Raises:
        FileNotFoundError: if the directory, the checkpoints, or every
            weights file is missing.
    """
    if not os.path.exists(checkpoint_dir):
        raise FileNotFoundError(f"Checkpoint directory not found on Google Drive: {checkpoint_dir}.")

    checkpoints = [d for d in os.listdir(checkpoint_dir) if d.startswith("checkpoint-")]
    if not checkpoints:
        raise FileNotFoundError(f"No checkpoint found in the results directory on Google Drive: {checkpoint_dir}.")

    # Newest first; malformed folder names sort last instead of raising.
    checkpoints.sort(key=_checkpoint_step, reverse=True)

    for name in checkpoints:
        checkpoint_path = os.path.join(checkpoint_dir, name)
        try:
            with open(os.path.join(checkpoint_path, "trainer_state.json"), 'r', encoding='utf-8') as f:
                trainer_state = json.load(f)
        except FileNotFoundError:
            continue
        except (OSError, ValueError) as e:  # json.JSONDecodeError is a ValueError
            print(f"Warning: Could not parse trainer_state.json in {checkpoint_path}: {e}")
            continue

        recorded = trainer_state.get('best_model_checkpoint') if isinstance(trainer_state, dict) else None
        if recorded:
            # The recorded path may come from a different mount point, so only
            # its folder name is trusted and re-anchored under checkpoint_dir.
            best_path = os.path.join(checkpoint_dir, os.path.basename(os.path.normpath(recorded)))
            weights_path = _find_weights_file(best_path)
            if weights_path:
                print(f"Best checkpoint recorded by trainer_state.json: {best_path} "
                      f"(best_metric={trainer_state.get('best_metric')})")
                return best_path, weights_path
        # The newest readable state supersedes older ones; stop searching.
        break

    print("Could not find best checkpoint via trainer_state.json. Falling back to the latest checkpoint with a model file.")
    for name in checkpoints:
        checkpoint_path = os.path.join(checkpoint_dir, name)
        weights_path = _find_weights_file(checkpoint_path)
        if weights_path:
            print(f"Using latest valid checkpoint: {checkpoint_path}")
            return checkpoint_path, weights_path

    raise FileNotFoundError(
        f"No valid checkpoint with model weights ({' or '.join(_CHECKPOINT_WEIGHT_FILES)}) "
        f"found in: {checkpoint_dir}."
    )


def main_predict():
    """Predict on the test set with the best checkpoint and write the submission.

    Loads the preprocessed test CSV, restores the model weights from the
    checkpoint chosen by ``_select_checkpoint``, runs ``Trainer.predict``,
    rounds and clips the outputs to ``[0, TARGET_CLASSES - 1]``, shifts them
    back by +1, and writes ``SUBMISSION_PATH`` plus its zipped copy
    ``ZIPPED_SUBMISSION_PATH``. Errors are printed, not raised; returns None.
    """
    print("\n===== 🏆 GENERATING FINAL PREDICTIONS & SUBMISSION =====\n")
    try:
        print("Initializing Tokenizer for prediction...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        print("Loading preprocessed test data from Google Drive...")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)

        print("\nLoading the best trained model from Google Drive checkpoints...")
        best_checkpoint_path, weights_path = _select_checkpoint(CHECKPOINT_DIR)

        print(f"Loading model from: {best_checkpoint_path}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        if weights_path.endswith(".safetensors"):
            # Imported locally: the file-level safetensors import sits below
            # the point where this function is first called.
            from safetensors.torch import load_file
            state_dict = load_file(weights_path)
        else:
            # map_location keeps GPU-saved weights loadable on a CPU-only runtime.
            state_dict = torch.load(weights_path, map_location="cpu")
        model.load_state_dict(state_dict)

        # A minimal Trainer is enough for inference; reporting is disabled to
        # match the training configuration.
        trainer = Trainer(model=model, args=TrainingArguments(output_dir=CHECKPOINT_DIR, report_to="none"))

        print("Generating predictions on the test set...")
        # Note: No labels for the test dataset
        test_dataset = ReadabilityDataset(test_df['d3tok_text'].tolist(), test_df['features'].tolist(), tokenizer_obj=tokenizer)
        predictions = trainer.predict(test_dataset)

        # Round, clip to the valid range, then shift back by +1 for submission.
        clipped_preds = np.clip(np.round(predictions.predictions.flatten()), 0, TARGET_CLASSES - 1)
        test_df['Prediction'] = (clipped_preds + 1).astype(int)

        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"Compressing into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))

        print(f"✔ Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created successfully in your Drive!")

    except FileNotFoundError as e:
        print(f"❌ ERROR: File not found: {e}. Ensure training was completed and checkpoints exist.")
    except Exception as e:
        print(f"❌ An error occurred during final prediction: {e}")
|
| 353 |
+
|
| 354 |
+
"""# Running script"""
|
| 355 |
+
|
| 356 |
+
# =====================================================================================
|
| 357 |
+
# 6. SCRIPT RUNNER
|
| 358 |
+
# =====================================================================================
|
| 359 |
+
|
| 360 |
+
# Train first; main_train() returns early (after printing) if the data
# cannot be loaded.
main_train()

# Then build the submission from the checkpoints written during training.
main_predict()

print("\n--- ✅ All Done! Check your Google Drive for results and submission files. ---")

# Checkpoint directory read by the standalone prediction section below.
BEST_CHECKPOINT_PATH = "/content/drive/MyDrive/BAREC_Competition/results/hybrid_constrained_samer_regression_v2_readability-arabertv2-d3tok-reg/checkpoint-42826"

# --- Input & output directories (both live under the project folder) ---
SUBMISSION_DIR = os.path.join(PROJECT_DRIVE_PATH, "submission")
PROCESSED_DATA_DIR = os.path.join(PROJECT_DRIVE_PATH, "lex")

# Make sure the submission folder exists before anything is written to it.
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File paths (these rebind the names defined for the training section) ---
TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, "test_processed_full.csv")
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_final_prediction.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission.zip")

print(f"✔️ Configuration loaded. Model will be loaded from: {BEST_CHECKPOINT_PATH}")
|
| 384 |
+
|
| 385 |
+
"""# 3. MODEL AND DATASET CLASS DEFINITIONS"""
|
| 386 |
+
|
| 387 |
+
# =====================================================================================
|
| 388 |
+
# 3. MODEL AND DATASET CLASS DEFINITIONS
|
| 389 |
+
# =====================================================================================
|
| 390 |
+
|
| 391 |
+
class ReadabilityDataset(TorchDataset):
    """Torch dataset pairing tokenized text with a numeric feature vector.

    Every item is a dict holding ``input_ids``, ``attention_mask`` and
    ``extra_features``; a ``labels`` entry is added only when labels were
    supplied at construction time.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts, self.features, self.labels = texts, features, labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        # The number of samples is defined by the text list.
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        extras = torch.tensor(self.features[idx], dtype=torch.float)

        # Pad/truncate every sample to exactly ``max_len`` tokens and ask for
        # PyTorch tensors back.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        sample = {
            # flatten() turns the tokenizer's returned tensors into 1-D tensors.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'extra_features': extras,
        }
        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample
|
| 431 |
+
|
| 432 |
+
class HybridRegressionModel(nn.Module):
    """Transformer encoder plus extra numeric features feeding one linear regressor.

    The encoder's pooled output is concatenated with ``extra_features`` and
    mapped to a single value per sample.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        # Regression head input = encoder hidden size + number of extra features.
        head_width = self.transformer.config.hidden_size + num_extra_features
        self.regressor = nn.Linear(head_width, 1)

    def forward(self, input_ids, attention_mask, extra_features, labels=None):
        """Run the encoder and regression head.

        Returns:
            ``logits`` alone when ``labels`` is None, otherwise the tuple
            ``(loss, logits)`` where ``loss`` is the mean squared error
            between the squeezed logits and the squeezed labels.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        fused = torch.cat((encoded.pooler_output, extra_features), dim=1)
        logits = self.regressor(fused)

        if labels is None:
            return logits

        # Using MSELoss for regression
        mse = nn.MSELoss()
        return (mse(logits.squeeze(), labels.squeeze()), logits)
|
| 455 |
+
|
| 456 |
+
"""# 4. PREDICTION AND SUBMISSION SCRIPT"""
|
| 457 |
+
|
| 458 |
+
# =====================================================================================
|
| 459 |
+
# 4. PREDICTION AND SUBMISSION SCRIPT
|
| 460 |
+
# =====================================================================================
|
| 461 |
+
|
| 462 |
+
from safetensors.torch import load_file # Import load_file
|
| 463 |
+
|
| 464 |
+
def generate_predictions():
    """Predict on the test set from ``BEST_CHECKPOINT_PATH`` and write the submission.

    Loads ``model.safetensors`` from the configured checkpoint directory into
    a fresh ``HybridRegressionModel``, runs ``Trainer.predict`` on the
    preprocessed test data, rounds and clips the outputs to
    ``[0, TARGET_CLASSES - 1]``, shifts them back by +1 and saves the CSV and
    its zipped copy. Problems are printed rather than raised; returns None.
    """
    print("\n===== 🏆 STARTING PREDICTION PIPELINE =====\n")
    try:
        # --- Validate checkpoint path ---
        model_weights_path = os.path.join(BEST_CHECKPOINT_PATH, 'model.safetensors')
        weights_present = os.path.exists(model_weights_path)
        if not weights_present:
            print(f"❌ ERROR: 'model.safetensors' not found at the specified path: {model_weights_path}")
            print("Please ensure the BEST_CHECKPOINT_PATH variable is set correctly to the directory containing the model weights.")
            return

        # --- Tokenizer ---
        print("1. Initializing tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        # --- Test data ---
        print(f"2. Loading preprocessed test data from: {TEST_PROCESSED_PATH}")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        # The 'features' column is stored as a string; parse it back into a list.
        test_df['features'] = test_df['features'].apply(ast.literal_eval)
        print(f" Loaded {len(test_df)} test records.")

        # --- Model ---
        print(f"3. Loading model from checkpoint: {BEST_CHECKPOINT_PATH}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        state_dict = load_file(model_weights_path)

        # List the checkpoint's parameter names before loading them.
        print("\n--- State Dictionary Keys ---")
        for parameter_name in state_dict:
            print(parameter_name)
        print("-----------------------------\n")

        model.load_state_dict(state_dict)
        print(" Model weights loaded successfully from model.safetensors.")

        # --- Trainer: a minimal one is sufficient for making predictions ---
        inference_args = TrainingArguments(output_dir="./temp_results")
        trainer = Trainer(model=model, args=inference_args)

        # --- Test dataset (no labels) ---
        print("4. Creating test dataset...")
        test_dataset = ReadabilityDataset(
            texts=test_df['d3tok_text'].tolist(),
            features=test_df['features'].tolist(),
            tokenizer_obj=tokenizer,
        )

        # --- Predict ---
        print("5. Generating predictions on the test set...")
        raw_predictions = trainer.predict(test_dataset)
        # The outputs live in .predictions; flatten them to a 1D array.
        flat_scores = raw_predictions.predictions.flatten()

        # --- Post-process: round, clip to [0, TARGET_CLASSES - 1], shift by +1 ---
        rounded = np.round(flat_scores)
        clipped_preds = np.clip(rounded, 0, TARGET_CLASSES - 1)
        test_df['Prediction'] = (clipped_preds + 1).astype(int)
        print(" Predictions generated and processed.")

        # --- Submission CSV ---
        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]
        print(f"\n6. Saving final prediction CSV to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        # --- Zip archive ---
        print(f"7. Compressing submission file into: {ZIPPED_SUBMISSION_PATH}")
        archive_member = os.path.basename(SUBMISSION_PATH)
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=archive_member)

        print("\n--- ✅ All Done! ---")
        print(f"Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' has been saved to your Google Drive.")
        print(f"Location: {SUBMISSION_DIR}")

    except FileNotFoundError as missing:
        print(f"❌ ERROR: A required file was not found: {missing}")
        print(" Please ensure all paths in the configuration section are correct.")
    except Exception as failure:
        print(f"❌ An unexpected error occurred: {failure}")


"""# 5. EXECUTE SCRIPT"""

# =====================================================================================
# 5. EXECUTE SCRIPT
# =====================================================================================

# Run the prediction pipeline only when this file is executed as a script.
if __name__ == '__main__':
    generate_predictions()
|
| 560 |
+
|
| 561 |
+
"""# Results of Sentence-level Readability Assessment - Constrained on The Blind Test
|
| 562 |
+
{'accuracy': 42.1, 'accuracy+-1': 71.6, 'avg_abs_dist': 1.2, 'qwk': 82.1, 'accuracy_7': 59.9, 'accuracy_5': 65.4, 'accuracy_3': 73.4}
|
| 563 |
+
"""
|
samer_preprocessing_feature_extraction.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =====================================================================================
|
| 2 |
+
# 0. INSTALLATIONS & IMPORTS
|
| 3 |
+
# =====================================================================================
|
| 4 |
+
# This will install all necessary libraries quietly.
|
| 5 |
+
# !pip install transformers[torch] datasets pandas scikit-learn arabert accelerate pyarrow -q
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import os
|
| 10 |
+
from transformers import AutoTokenizer
|
| 11 |
+
from arabert.preprocess import ArabertPreprocessor
|
| 12 |
+
|
| 13 |
+
print("✅ Libraries imported successfully.")

# =====================================================================================
# 1. CONFIGURATION
# =====================================================================================
# --- Model & Tokenizer ---
# Checkpoint name handed to both the AraBERT preprocessor and the tokenizer
# created at the end of this section.
MODEL_NAME = "aubmindlab/bert-large-arabertv2"
MAX_LENGTH = 256  # Max sequence length for tokenizer

# --- File Paths ---
# All raw inputs are read from this directory.
RAW_DATA_DIR = '/kaggle/input/sentses/'
BAREC_TRAIN_PATH = os.path.join(RAW_DATA_DIR, 'train.csv')
BAREC_DEV_PATH = os.path.join(RAW_DATA_DIR, 'dev.csv')
BLIND_TEST_PATH = os.path.join(RAW_DATA_DIR, 'blind_test_data.csv')
# The SAMER corpus and lexicon are expected next to the BAREC files.
SAMER_CORPUS_PATH = os.path.join(RAW_DATA_DIR, 'samer_train.tsv')
SAMER_LEXICON_PATH = os.path.join(RAW_DATA_DIR, 'SAMER-Readability-Lexicon-v2.tsv')

# --- Output Path ---
# Directory the processed Feather files are written to.
OUTPUT_DIR = '/kaggle/working/'

# --- Initialize Processors ---
try:
    arabert_preprocessor = ArabertPreprocessor(model_name=MODEL_NAME)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("✅ AraBERT Preprocessor and Tokenizer initialized.")
except Exception as e:
    print(f"❗️ Error initializing processors: {e}")
    # Nothing below can run without these two objects. Raise SystemExit
    # directly instead of calling the site-provided exit() helper: that
    # helper is meant for interactive sessions and, called without an
    # argument, reports success (status 0) even though setup failed.
    raise SystemExit(1) from e
|
| 46 |
+
|
| 47 |
+
# =====================================================================================
|
| 48 |
+
# 2. DATA LOADING FUNCTIONS
|
| 49 |
+
# =====================================================================================
|
| 50 |
+
|
| 51 |
+
def load_samer_lexicon(file_path):
    """Load the SAMER lexicon as a ``lemma -> readability`` dictionary.

    The file is read as tab-separated text and must contain the columns
    ``lemma#pos`` and ``readability (rounded average)``. The dictionary key
    is the part of ``lemma#pos`` that precedes the first ``#``.

    Args:
        file_path: Path to the lexicon TSV file.

    Returns:
        dict: Lemma mapped to its ``readability (rounded average)`` value.
        When the same lemma appears on several rows, the last row wins.
        An empty dict is returned when the file does not exist.
    """
    print("\n--- Loading SAMER Lexicon ---")
    try:
        df = pd.read_csv(file_path, sep='\t')
    except FileNotFoundError:
        print("❗️ SAMER Lexicon not found. Lexical features will be disabled.")
        return {}

    # Only the lemma is needed as the lookup key. Splitting once and taking
    # the first piece avoids the "columns must be same length as key" error
    # that a two-column expand raises when an entry contains more than one '#'.
    df['lemma'] = df['lemma#pos'].str.split('#', n=1).str[0]
    lexicon_dict = df.set_index('lemma')['readability (rounded average)'].to_dict()
    print(f"Loaded {len(lexicon_dict)} lemmas into lexicon dictionary.")
    return lexicon_dict
|
| 63 |
+
|
| 64 |
+
def load_training_and_validation_data(lexicon):
    """Load the BAREC train/dev sentences and augment training with SAMER.

    Both BAREC CSVs are reduced to the columns ``Sentence`` and
    ``Readability_Level_19``, renamed to ``text`` and ``label``. Rows with a
    missing text or label are dropped. When the SAMER corpus is available,
    its ``L3``/``L4``/``L5`` columns are appended to the training data with
    fixed labels and the combined frame is shuffled with a fixed seed.

    Args:
        lexicon: Not used by this function; kept so existing callers keep
            working.

    Returns:
        tuple: ``(train_df, val_df)``. Both frames have a default integer
        index and integer labels. ``(None, None)`` is returned when the
        BAREC CSVs cannot be loaded. When the SAMER corpus file is missing,
        the unshuffled BAREC training frame is returned.
    """
    print("\n--- Loading BAREC Training & Validation Data ---")
    column_names = {'Sentence': 'text', 'Readability_Level_19': 'label'}
    try:
        train_df = pd.read_csv(BAREC_TRAIN_PATH)[list(column_names)].rename(columns=column_names)
        val_df = pd.read_csv(BAREC_DEV_PATH)[list(column_names)].rename(columns=column_names)
        # Reset the index after dropping rows: DataFrame.to_feather requires
        # a default index, and dropna alone leaves gaps.
        train_df = train_df.dropna(subset=['text', 'label']).reset_index(drop=True)
        val_df = val_df.dropna(subset=['text', 'label']).reset_index(drop=True)
        print(f"Loaded {len(train_df)} BAREC training sentences and {len(val_df)} validation sentences.")
    except Exception as e:
        print(f"❗️ ERROR loading BAREC CSVs: {e}")
        return None, None

    # Give every returned frame the same label dtype. A label column that
    # contained missing values is read as float and would otherwise stay
    # float for validation and for the BAREC-only fallback.
    train_df['label'] = train_df['label'].astype(int)
    val_df['label'] = val_df['label'].astype(int)

    print("\n--- Loading SAMER Corpus for Augmentation ---")
    try:
        samer_df = pd.read_csv(SAMER_CORPUS_PATH, sep='\t')
    except FileNotFoundError:
        print("❗️ SAMER Corpus not found, proceeding with BAREC data only.")
        return train_df, val_df

    # SAMER column name -> label assigned to every sentence in that column.
    # NOTE(review): presumably positions on the 19-level BAREC scale - confirm.
    samer_level_map = {'L3': 4, 'L4': 10, 'L5': 16}
    samer_records = []
    for level_name, barec_level in samer_level_map.items():
        samer_subset = samer_df[[level_name]].dropna().rename(columns={level_name: 'text'})
        samer_subset['label'] = barec_level
        samer_records.append(samer_subset)
    samer_augmentation_df = pd.concat(samer_records, ignore_index=True)
    print(f"Loaded {len(samer_augmentation_df)} sentences from SAMER.")

    full_train_df = pd.concat([train_df, samer_augmentation_df], ignore_index=True)
    full_train_df = full_train_df.dropna(subset=['text', 'label'])
    full_train_df['label'] = full_train_df['label'].astype(int)
    # Fixed seed keeps the shuffled order reproducible between runs.
    full_train_df = full_train_df.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Total unified training sentences: {len(full_train_df)}")
    return full_train_df, val_df
|
| 98 |
+
|
| 99 |
+
def load_blind_test_data(file_path):
    """Load the blind test CSV and expand it to one row per line of text.

    The columns ``ID`` and ``Sentence`` are renamed to ``doc_id`` and
    ``text``. Each ``text`` value is split on newlines and exploded, so every
    resulting row keeps the ``doc_id`` of the row it came from. Rows whose
    text is missing are dropped.

    Args:
        file_path: Path to the blind test CSV file.

    Returns:
        pandas.DataFrame | None: The exploded frame with a default integer
        index, or ``None`` when loading or reshaping fails.
    """
    print("\n--- Loading Blind Test Data ---")
    try:
        doc_test_df = pd.read_csv(file_path).rename(columns={'ID': 'doc_id', 'Sentence': 'text'})
        # Drop missing rows BEFORE resetting the index. Resetting first
        # leaves gaps in the index, and DataFrame.to_feather rejects a
        # non-default index.
        sentence_test_df = (
            doc_test_df.assign(text=doc_test_df['text'].str.split('\n'))
            .explode('text')
            .dropna(subset=['text'])
            .reset_index(drop=True)
        )
        print(f"Loaded and exploded {len(sentence_test_df)} sentences for prediction.")
        return sentence_test_df
    except Exception as e:
        print(f"❗️ ERROR loading blind test file: {e}")
        return None
|
| 111 |
+
|
| 112 |
+
# =====================================================================================
|
| 113 |
+
# 3. FEATURE ENGINEERING & PREPROCESSING FUNCTION
|
| 114 |
+
# =====================================================================================
|
| 115 |
+
|
| 116 |
+
def get_lexical_features(text, lexicon):
    """Build the 7-value lexical feature vector for one piece of text.

    The text is split on whitespace and every token is looked up in
    ``lexicon``. Tokens that are not in the lexicon count as difficulty 3.0.

    Args:
        text: The text to describe.
        lexicon: Mapping from token to a numeric difficulty value.

    Returns:
        list[float]: In order: character count, token count, mean token
        length, mean difficulty, maximum difficulty, number of tokens with
        difficulty above 4, and the fraction of tokens missing from the
        lexicon. Seven zeros are returned when the lexicon is empty, the
        text is not a string, or the text contains no tokens.
    """
    zero_vector = [0.0] * 7
    if not (lexicon and isinstance(text, str)):
        return zero_vector

    tokens = text.split()
    token_count = len(tokens)
    if token_count == 0:
        return zero_vector

    # One pass collects the difficulty of each token and counts the tokens
    # that had to fall back to the default.
    difficulty_levels = []
    unknown_tokens = 0
    for token in tokens:
        if token in lexicon:
            difficulty_levels.append(lexicon[token])
        else:
            difficulty_levels.append(3.0)
            unknown_tokens += 1

    token_lengths = [len(token) for token in tokens]

    # Plain Python floats keep the column type uniform for pyarrow.
    return [
        float(len(text)),
        float(token_count),
        float(np.mean(token_lengths)),
        float(np.mean(difficulty_levels)),
        float(np.max(difficulty_levels)),
        float(np.sum(np.array(difficulty_levels) > 4)),
        float(unknown_tokens / token_count),
    ]
|
| 137 |
+
|
| 138 |
+
def process_dataframe(df, lexicon, is_test=False):
    """Preprocess, featurize and tokenize the ``text`` column of a frame.

    The columns ``text_preprocessed``, ``features``, ``input_ids`` and
    ``attention_mask`` are added to ``df`` itself, so the caller's frame is
    modified in place.

    Args:
        df: Frame with a ``text`` column, plus ``doc_id`` when ``is_test``
            is true or ``label`` otherwise.
        lexicon: Lexicon passed through to ``get_lexical_features``.
        is_test: Selects ``doc_id`` instead of ``label`` as the leading
            column of the result.

    Returns:
        pandas.DataFrame: ``df`` restricted to the identifier/label column,
        ``input_ids``, ``attention_mask`` and ``features``.
    """
    print(f"\n--- Starting processing for {'Test' if is_test else 'Train/Val'} dataframe ---")

    # Step 1: run every sentence through the AraBERT preprocessor.
    print("Step 1: Applying AraBERT preprocessor...")
    cleaned_text = df['text'].apply(arabert_preprocessor.preprocess)
    df['text_preprocessed'] = cleaned_text

    # Step 2: one lexical feature vector per preprocessed sentence, stored
    # as one NumPy row per dataframe cell.
    print("Step 2: Engineering lexical features...")
    feature_rows = [get_lexical_features(sentence, lexicon) for sentence in df['text_preprocessed']]
    df['features'] = list(np.array(feature_rows))

    # Step 3: tokenize all sentences in one call; every sequence is
    # truncated or padded to exactly MAX_LENGTH tokens.
    print("Step 3: Tokenizing text...")
    sentences = df['text_preprocessed'].tolist()
    encodings = tokenizer(sentences, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    for column in ('input_ids', 'attention_mask'):
        df[column] = encodings[column]

    # Step 4: test rows are identified by document, training rows by label.
    key_column = 'doc_id' if is_test else 'label'
    final_cols = [key_column, 'input_ids', 'attention_mask', 'features']

    print("✅ Processing complete.")
    return df[final_cols]
|
| 170 |
+
|
| 171 |
+
# =====================================================================================
|
| 172 |
+
# 4. EXECUTION
|
| 173 |
+
# =====================================================================================
|
| 174 |
+
|
| 175 |
+
# Load the lexicon and the three raw datasets up front.
samer_lexicon = load_samer_lexicon(SAMER_LEXICON_PATH)
train_df, val_df = load_training_and_validation_data(samer_lexicon)
test_df = load_blind_test_data(BLIND_TEST_PATH)

# A loader reports failure by returning None; stop if any of them did.
if any(loaded is None for loaded in (train_df, val_df, test_df)):
    print("\n❗️ Script aborted due to data loading errors.")
else:
    # Run the shared preprocessing pipeline over each split.
    processed_train_df = process_dataframe(train_df, samer_lexicon)
    processed_val_df = process_dataframe(val_df, samer_lexicon)
    processed_test_df = process_dataframe(test_df, samer_lexicon, is_test=True)

    # Write each processed split to its own Feather file.
    print("\n--- Saving processed dataframes to Feather files ---")

    train_save_path = os.path.join(OUTPUT_DIR, 'train_processed.feather')
    val_save_path = os.path.join(OUTPUT_DIR, 'val_processed.feather')
    test_save_path = os.path.join(OUTPUT_DIR, 'test_processed.feather')

    save_jobs = (
        (processed_train_df, train_save_path, f"✅ Training data saved to {train_save_path}"),
        (processed_val_df, val_save_path, f"✅ Validation data saved to {val_save_path}"),
        (processed_test_df, test_save_path, f"✅ Test data saved to {test_save_path}"),
    )
    for processed_frame, destination, confirmation in save_jobs:
        processed_frame.to_feather(destination)
        print(confirmation)

    print("\n🎉 All preprocessing is complete. You can now save this notebook's output as a new dataset.")
|