FatimahEmadEldin commited on
Commit
cbefe5c
·
verified ·
1 Parent(s): d5cbd37

Upload 2 files

Browse files
Constrained-Track-Document-Bassline-Readability-Arabertv2-d3tok-reg.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """arabertv2-d3tok_Sentence_Constrained.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1vHngBRGkcXHJIwi317DS0AsZfsXguIc0
8
+ """
9
+
10
+ # Authorize Colab to access your Google Drive
11
+ from google.colab import drive
12
+ drive.mount('/content/drive')
13
+
14
+ import pandas as pd
15
+ import numpy as np
16
+ import os
17
+ import torch
18
+ import torch.nn as nn
19
+ import zipfile
20
+ from sklearn.metrics import cohen_kappa_score
21
+ from torch.utils.data import Dataset as TorchDataset
22
+ from transformers import (
23
+ AutoTokenizer,
24
+ AutoModel,
25
+ TrainingArguments,
26
+ Trainer,
27
+ EarlyStoppingCallback
28
+ )
29
+ import gc
30
+ import ast # To safely evaluate string-formatted lists
31
+ import json # Added import for json
32
+
33
+ # --- Hugging Face Authentication ---
34
+ from huggingface_hub import login
35
+ from google.colab import userdata
36
+
37
+ # Log in to Hugging Face using the token stored in Colab secrets
38
+ try:
39
+ login(token=userdata.get('HF_TOKEN'))
40
+ print("✔️ Successfully logged into Hugging Face.")
41
+ except (NameError, KeyError):
42
+ print("⚠️ Hugging Face token not found in Colab secrets. Please add it as 'HF_TOKEN'.")
43
+ # Fallback for local execution or if login() is preferred manually
44
+ login()
45
+
46
+ # --- Model & Training ---
47
+ MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
48
+
49
+ NUM_LABELS = 1
50
+ TARGET_CLASSES = 19
51
+ NUM_FEATURES = 7
52
+
53
+ # --- IMPORTANT: Set the path to your project folder on Google Drive ---
54
+ PROJECT_DRIVE_PATH = '/content/drive/MyDrive/BAREC_Competition'
55
+
56
+ # --- File & Directory Paths (Now relative to your Google Drive) ---
57
+ BASE_DIR = PROJECT_DRIVE_PATH
58
+ PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "lex")
59
+ CHECKPOINT_DIR = os.path.join(BASE_DIR, "results", f"hybrid_constrained_samer_regression_v2_{MODEL_NAME.split('/')[-1]}")
60
+ SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")
61
+
62
+ # Ensure the output directories exist on your Google Drive
63
+ os.makedirs(CHECKPOINT_DIR, exist_ok=True)
64
+ os.makedirs(SUBMISSION_DIR, exist_ok=True)
65
+
66
+ # Paths to the preprocessed input files on Google Drive
67
+ TRAIN_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'train_processed_full.csv')
68
+ DEV_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'dev_processed_full.csv')
69
+ TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'test_processed_full.csv')
70
+
71
+ # --- Submission Paths on Google Drive ---
72
+ SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_constrained_samer_regression.csv")
73
+ ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_constrained_samer_regression.zip")
74
+
75
+ print(f"✔️ All paths configured to use Google Drive folder: {BASE_DIR}")
76
+
77
+ """# 2. DATA LOADING FUNCTION"""
78
+
79
+ # =====================================================================================
80
+ # 2. DATA LOADING FUNCTION
81
+ # =====================================================================================
82
+
83
def load_preprocessed_data(train_path=None, dev_path=None):
    """Load the pre-processed training and validation CSV files.

    Each CSV must provide a ``features`` column holding a Python list literal
    (stored as text) and a ``label`` column with 1-based readability levels.

    Args:
        train_path: CSV with the training split. Defaults to
            ``TRAIN_PROCESSED_PATH``.
        dev_path: CSV with the validation split. Defaults to
            ``DEV_PROCESSED_PATH``.

    Returns:
        ``(train_df, val_df)`` with ``features`` parsed into lists and
        ``label`` shifted to 0-based floats, or ``(None, None)`` when loading
        or conversion fails (the caller checks for ``None``).
    """
    train_path = TRAIN_PROCESSED_PATH if train_path is None else train_path
    dev_path = DEV_PROCESSED_PATH if dev_path is None else dev_path

    def prepare(frame):
        # The feature vector was serialised as text; literal_eval parses it
        # without executing arbitrary code.
        frame['features'] = frame['features'].apply(ast.literal_eval)
        # Levels are 1-based in the CSV; the regression target is 0-based.
        frame['label'] = (frame['label'].astype(int) - 1).astype(float)
        return frame

    print("\n--- Loading Preprocessed Data from Google Drive ---")
    try:
        train_df = pd.read_csv(train_path)
        val_df = pd.read_csv(dev_path)

        print("Converting 'features' column from string to list...")
        train_df = prepare(train_df)
        val_df = prepare(val_df)

        print(f"✔ Successfully loaded {len(train_df)} training and {len(val_df)} validation records.")
        return train_df, val_df
    except FileNotFoundError as e:
        print(f"❌ ERROR: Preprocessed file not found: {e}.")
        print("Please make sure your data is uploaded to the correct Google Drive folder.")
        return None, None
    except Exception as e:
        # Deliberate best-effort: any parsing problem is reported and turned
        # into the (None, None) sentinel the training entry point expects.
        print(f"❌ ERROR during data loading: {e}")
        return None, None
107
+
108
+ """# 3. MODEL, DATASET, AND METRICS DEFINITIONS"""
109
+
110
+ # =====================================================================================
111
+ # 3. MODEL, DATASET, AND METRICS DEFINITIONS
112
+ # =====================================================================================
113
+
114
class ReadabilityDataset(TorchDataset):
    """PyTorch dataset pairing each text with a numeric feature vector.

    Every item is a dict with ``input_ids``, ``attention_mask`` and
    ``extra_features`` tensors, plus ``labels`` when labels were supplied.

    Args:
        texts: Sequence of input texts (each is converted with ``str``).
        features: Sequence of per-text numeric feature vectors.
        labels: Optional sequence of float targets; omit for inference.
        tokenizer_obj: Tokenizer callable; it is invoked with the Hugging
            Face tokenizer keyword arguments and must return a mapping that
            has ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If ``features`` or ``labels`` differ in length from
            ``texts``.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        # Fail early: a silent length mismatch would otherwise surface as an
        # IndexError (or misaligned targets) in the middle of an epoch.
        if len(features) != len(texts):
            raise ValueError(
                f"texts and features differ in length: {len(texts)} vs {len(features)}"
            )
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} vs {len(labels)}"
            )
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        item = {
            # return_tensors='pt' adds a leading batch axis; drop it so the
            # default collator can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'extra_features': feature_vec,
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)

        return item
154
+
155
class HybridRegressionModel(nn.Module):
    """Transformer encoder plus hand-crafted features feeding one linear head.

    The encoder's pooled output is concatenated with the extra numeric
    features and mapped to a single regression value.

    Args:
        model_name: Checkpoint name or path passed to
            ``AutoModel.from_pretrained``.
        num_extra_features: Width of the extra feature vector.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        # The regression head sees the pooled encoding and the extra features.
        self.regressor = nn.Linear(self.transformer.config.hidden_size + num_extra_features, 1)

        # Set because this is a plain nn.Module rather than a PreTrainedModel.
        # NOTE(review): presumably read by the Hugging Face Trainer when it
        # saves checkpoints — confirm against the installed version.
        self._keys_to_ignore_on_save = None

    def forward(self, input_ids, attention_mask, extra_features, labels=None):
        """Run the encoder and regression head.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``
            alone. ``logits`` comes straight from the ``nn.Linear(..., 1)``
            head.
        """
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output

        # Concatenate along the feature axis.
        combined_features = torch.cat((pooler_output, extra_features), dim=1)
        logits = self.regressor(combined_features)

        if labels is None:
            # NOTE(review): a bare tensor is returned here. With a final
            # batch of exactly one sample the Trainer's `len(logits) == 1`
            # unwrapping may strip the batch axis — confirm before relying
            # on arbitrary test-set sizes.
            return logits

        # Flatten both sides explicitly rather than using a dimension-less
        # squeeze(), so predictions and targets are always compared as 1-D.
        loss = nn.MSELoss()(logits.reshape(-1), labels.reshape(-1))
        return (loss, logits)
185
+
186
def compute_metrics(p):
    """Compute Quadratic Weighted Kappa for regression predictions.

    Args:
        p: A ``(predictions, labels)`` pair as supplied by the Trainer.

    Returns:
        ``{'qwk': <float>}``.
    """
    predictions, labels = p
    if isinstance(predictions, tuple):
        # Some model outputs arrive as a tuple; the first entry is the logits.
        predictions = predictions[0]

    # The regression head emits one value per row, so flatten before scoring
    # to hand scikit-learn plain 1-D vectors.
    flat_preds = np.asarray(predictions).reshape(-1)
    flat_labels = np.asarray(labels).reshape(-1)

    # Round to the nearest level and clip into [0, TARGET_CLASSES - 1];
    # integer class ids keep the kappa confusion matrix well defined.
    clipped_preds = np.clip(np.round(flat_preds), 0, TARGET_CLASSES - 1).astype(int)
    gold = np.round(flat_labels).astype(int)

    qwk = cohen_kappa_score(gold, clipped_preds, weights='quadratic')
    return {'qwk': qwk}
196
+
197
+ """# 4. & 5. MAIN EXECUTION FUNCTIONS"""
198
+
199
+ # =====================================================================================
200
+ # 4. & 5. MAIN EXECUTION FUNCTIONS
201
+ # =====================================================================================
202
+
203
def _find_latest_checkpoint(checkpoint_dir):
    """Return the ``checkpoint-<step>`` folder with the highest step, or None."""
    if not os.path.isdir(checkpoint_dir):
        return None
    found = []
    for name in os.listdir(checkpoint_dir):
        prefix, _, step = name.rpartition('-')
        # Ignore stray entries such as "checkpoint-tmp" or plain files, which
        # would otherwise break the numeric sort.
        if prefix == 'checkpoint' and step.isdigit() and os.path.isdir(os.path.join(checkpoint_dir, name)):
            found.append((int(step), name))
    if not found:
        return None
    return os.path.join(checkpoint_dir, max(found)[1])


def main_train():
    """Fine-tune the hybrid regression model, resuming from a checkpoint if any.

    Loads the preprocessed splits, builds the datasets and model, trains with
    early stopping on QWK and writes checkpoints to ``CHECKPOINT_DIR``.
    Returns ``None``; aborts early when the data cannot be loaded.
    """
    print("===== 🚀 STARTING HYBRID REGRESSION MODEL PIPELINE =====\n")

    print("Initializing Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_df, val_df = load_preprocessed_data()
    if train_df is None:
        print("\n! Aborting script due to data loading failure.")
        return

    print("\nCreating Torch Datasets...")
    train_dataset = ReadabilityDataset(
        train_df['d3tok_text'].tolist(),
        train_df['features'].tolist(),
        train_df['label'].tolist(),
        tokenizer,
    )
    val_dataset = ReadabilityDataset(
        val_df['d3tok_text'].tolist(),
        val_df['features'].tolist(),
        val_df['label'].tolist(),
        tokenizer,
    )
    print("✔ Datasets created.")

    print("\nInitializing Hybrid Regression Model...")
    model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)

    training_args = TrainingArguments(
        output_dir=CHECKPOINT_DIR,
        num_train_epochs=15,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        # Keep the checkpoint with the best validation QWK.
        load_best_model_at_end=True,
        metric_for_best_model="qwk",
        greater_is_better=True,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
    )

    print("\nStarting model training... Checkpoints will be saved to Google Drive.")

    latest_checkpoint = _find_latest_checkpoint(CHECKPOINT_DIR)
    if not os.path.exists(CHECKPOINT_DIR):
        print("Checkpoint directory not found. Starting training from scratch.")
    elif latest_checkpoint is None:
        print("No checkpoints found to resume training from. Starting from scratch.")
    else:
        print(f"Resuming training from latest checkpoint: {latest_checkpoint}")

    # resume_from_checkpoint=None starts a fresh run.
    trainer.train(resume_from_checkpoint=latest_checkpoint)
    print("✔ Training finished.")

    # Release references before the prediction stage builds its own model.
    del model, trainer, train_dataset, val_dataset, train_df, val_df
    gc.collect()
    torch.cuda.empty_cache()
271
+
272
+
273
def _find_weights_file(checkpoint_path):
    """Return the weights file inside a checkpoint folder, or None if absent."""
    # Accept both formats: the later section of this file loads
    # 'model.safetensors', while this function originally required
    # 'pytorch_model.bin'.
    for file_name in ("model.safetensors", "pytorch_model.bin"):
        candidate = os.path.join(checkpoint_path, file_name)
        if os.path.exists(candidate):
            return candidate
    return None


def _load_state_dict(weights_path):
    """Load a state dict from a ``.safetensors`` or ``.bin`` weights file."""
    if weights_path.endswith(".safetensors"):
        # Imported here because the file-level safetensors import appears
        # after the point where main_predict() is invoked.
        from safetensors.torch import load_file
        return load_file(weights_path)
    # map_location keeps loading usable on CPU-only machines.
    return torch.load(weights_path, map_location="cpu")


def _select_best_checkpoint(checkpoint_dir, checkpoint_names):
    """Resolve the best checkpoint recorded in ``trainer_state.json``.

    ``checkpoint_names`` must be ordered newest first. ``best_metric`` is a
    running best, so comparing it across checkpoints always favours the
    newest one; the ``best_model_checkpoint`` entry names the actual winner.
    """
    for name in checkpoint_names:
        checkpoint_path = os.path.join(checkpoint_dir, name)
        trainer_state_path = os.path.join(checkpoint_path, "trainer_state.json")
        if not os.path.exists(trainer_state_path):
            continue
        try:
            with open(trainer_state_path, 'r') as f:
                trainer_state = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Warning: Could not parse trainer_state.json in {checkpoint_path}: {e}")
            continue

        recorded = trainer_state.get('best_model_checkpoint')
        if not recorded:
            continue
        # Re-anchor on checkpoint_dir in case the recorded path is stale.
        candidate = os.path.join(checkpoint_dir, os.path.basename(os.path.normpath(recorded)))
        if _find_weights_file(candidate):
            print(f"Found improved eval_qwk {trainer_state.get('best_metric')} in {candidate}")
            return candidate
    return None


def main_predict():
    """Predict on the test set with the best checkpoint and write a submission.

    Writes ``SUBMISSION_PATH`` (CSV) and ``ZIPPED_SUBMISSION_PATH``. Errors
    are reported on stdout rather than raised.
    """
    print("\n===== 🏆 GENERATING FINAL PREDICTIONS & SUBMISSION =====\n")
    try:
        print("Initializing Tokenizer for prediction...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        print("Loading preprocessed test data from Google Drive...")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)

        print("\nLoading the best trained model from Google Drive checkpoints...")
        if not os.path.exists(CHECKPOINT_DIR):
            raise FileNotFoundError(f"Checkpoint directory not found on Google Drive: {CHECKPOINT_DIR}.")

        checkpoints = [
            d for d in os.listdir(CHECKPOINT_DIR)
            if d.startswith("checkpoint-") and d.split('-')[-1].isdigit()
        ]
        if not checkpoints:
            raise FileNotFoundError(f"No checkpoint found in the results directory on Google Drive: {CHECKPOINT_DIR}.")

        # Newest first.
        checkpoints.sort(key=lambda x: int(x.split('-')[-1]), reverse=True)

        best_checkpoint_path = _select_best_checkpoint(CHECKPOINT_DIR, checkpoints)

        if not best_checkpoint_path:
            print("Could not find best checkpoint via trainer_state.json. Falling back to the latest checkpoint with a model file.")
            for checkpoint in checkpoints:
                checkpoint_path = os.path.join(CHECKPOINT_DIR, checkpoint)
                if _find_weights_file(checkpoint_path):
                    best_checkpoint_path = checkpoint_path
                    print(f"Using latest valid checkpoint: {best_checkpoint_path}")
                    break

        if not best_checkpoint_path:
            raise FileNotFoundError(
                f"No valid checkpoint with 'model.safetensors' or 'pytorch_model.bin' found in: {CHECKPOINT_DIR}."
            )

        print(f"Loading model from: {best_checkpoint_path}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        model.load_state_dict(_load_state_dict(_find_weights_file(best_checkpoint_path)))

        # A minimal Trainer is enough for inference; no experiment logging.
        trainer = Trainer(
            model=model,
            args=TrainingArguments(output_dir=CHECKPOINT_DIR, report_to="none"),
        )

        print("Generating predictions on the test set...")
        # Note: no labels for the test dataset.
        test_dataset = ReadabilityDataset(
            test_df['d3tok_text'].tolist(),
            test_df['features'].tolist(),
            tokenizer_obj=tokenizer,
        )
        predictions = trainer.predict(test_dataset)

        # Round, clip to [0, TARGET_CLASSES - 1], then undo the 0-based shift.
        clipped_preds = np.clip(np.round(predictions.predictions.flatten()), 0, TARGET_CLASSES - 1)
        test_df['Prediction'] = (clipped_preds + 1).astype(int)

        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"Compressing into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))

        print(f"✔ Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created successfully in your Drive!")

    except FileNotFoundError as e:
        print(f"❌ ERROR: File not found: {e}. Ensure training was completed and checkpoints exist.")
    except Exception as e:
        # Top-level boundary of the notebook cell: report instead of raising.
        print(f"❌ An error occurred during final prediction: {e}")
353
+
354
+ """# Running script"""
355
+
356
+ # =====================================================================================
357
+ # 6. SCRIPT RUNNER
358
+ # =====================================================================================
359
+
360
+ # Start the training process
361
+ main_train()
362
+
363
+ # Once training is done, generate predictions
364
+ main_predict()
365
+
366
+ print("\n--- ✅ All Done! Check your Google Drive for results and submission files. ---")
367
+
368
+ BEST_CHECKPOINT_PATH = '/content/drive/MyDrive/BAREC_Competition/results/hybrid_constrained_samer_regression_v2_readability-arabertv2-d3tok-reg/checkpoint-42826'
369
+
370
+
371
+ # --- Input & Output Directories (derived from base paths) ---
372
+ PROCESSED_DATA_DIR = os.path.join(PROJECT_DRIVE_PATH, "lex")
373
+ SUBMISSION_DIR = os.path.join(PROJECT_DRIVE_PATH, "submission")
374
+
375
+ # Ensure the submission output directory exists
376
+ os.makedirs(SUBMISSION_DIR, exist_ok=True)
377
+
378
+ # --- File Paths ---
379
+ TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'test_processed_full.csv')
380
+ SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_final_prediction.csv")
381
+ ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission.zip")
382
+
383
+ print(f"✔️ Configuration loaded. Model will be loaded from: {BEST_CHECKPOINT_PATH}")
384
+
385
+ """# 3. MODEL AND DATASET CLASS DEFINITIONS"""
386
+
387
+ # =====================================================================================
388
+ # 3. MODEL AND DATASET CLASS DEFINITIONS
389
+ # =====================================================================================
390
+
391
class ReadabilityDataset(TorchDataset):
    """PyTorch dataset pairing each text with a numeric feature vector.

    Every item is a dict with ``input_ids``, ``attention_mask`` and
    ``extra_features`` tensors, plus ``labels`` when labels were supplied.

    Args:
        texts: Sequence of input texts (each is converted with ``str``).
        features: Sequence of per-text numeric feature vectors.
        labels: Optional sequence of float targets; omit for inference.
        tokenizer_obj: Tokenizer callable; it is invoked with the Hugging
            Face tokenizer keyword arguments and must return a mapping that
            has ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If ``features`` or ``labels`` differ in length from
            ``texts``.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        # Fail early: a silent length mismatch would otherwise surface as an
        # IndexError (or misaligned targets) while iterating.
        if len(features) != len(texts):
            raise ValueError(
                f"texts and features differ in length: {len(texts)} vs {len(features)}"
            )
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} vs {len(labels)}"
            )
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        item = {
            # return_tensors='pt' adds a leading batch axis; drop it so the
            # default collator can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'extra_features': feature_vec,
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)

        return item
431
+
432
class HybridRegressionModel(nn.Module):
    """Transformer encoder plus hand-crafted features feeding one linear head.

    The encoder's pooled output is concatenated with the extra numeric
    features and mapped to a single regression value.

    Args:
        model_name: Checkpoint name or path passed to
            ``AutoModel.from_pretrained``.
        num_extra_features: Width of the extra feature vector.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self.regressor = nn.Linear(self.transformer.config.hidden_size + num_extra_features, 1)

        # Kept identical to the training-time definition of this class so
        # both copies in the file expose the same attributes.
        self._keys_to_ignore_on_save = None

    def forward(self, input_ids, attention_mask, extra_features, labels=None):
        """Run the encoder and regression head.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``
            alone. ``logits`` comes straight from the ``nn.Linear(..., 1)``
            head.
        """
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output
        combined_features = torch.cat((pooler_output, extra_features), dim=1)
        logits = self.regressor(combined_features)

        if labels is None:
            return logits

        # MSE regression loss; flatten both sides explicitly rather than
        # using a dimension-less squeeze().
        loss = nn.MSELoss()(logits.reshape(-1), labels.reshape(-1))
        return (loss, logits)
455
+
456
+ """# 4. PREDICTION AND SUBMISSION SCRIPT"""
457
+
458
+ # =====================================================================================
459
+ # 4. PREDICTION AND SUBMISSION SCRIPT
460
+ # =====================================================================================
461
+
462
+ from safetensors.torch import load_file # Import load_file
463
+
464
def generate_predictions(checkpoint_path=None):
    """Predict on the test set from one checkpoint and write the submission.

    Args:
        checkpoint_path: Folder containing ``model.safetensors``. Defaults to
            ``BEST_CHECKPOINT_PATH``.

    Writes ``SUBMISSION_PATH`` (CSV) and ``ZIPPED_SUBMISSION_PATH``. Errors
    are reported on stdout rather than raised.
    """
    if checkpoint_path is None:
        checkpoint_path = BEST_CHECKPOINT_PATH

    print("\n===== 🏆 STARTING PREDICTION PIPELINE =====\n")
    try:
        # --- Validate Checkpoint Path ---
        model_weights_path = os.path.join(checkpoint_path, 'model.safetensors')
        if not os.path.exists(model_weights_path):
            print(f"❌ ERROR: 'model.safetensors' not found at the specified path: {model_weights_path}")
            print("Please ensure the BEST_CHECKPOINT_PATH variable is set correctly to the directory containing the model weights.")
            return

        # --- Initialize Tokenizer ---
        print("1. Initializing tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        # --- Load Test Data ---
        print(f"2. Loading preprocessed test data from: {TEST_PROCESSED_PATH}")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        # 'features' is stored as the text of a Python list literal.
        test_df['features'] = test_df['features'].apply(ast.literal_eval)
        print(f" Loaded {len(test_df)} test records.")

        # --- Load Model from Checkpoint ---
        print(f"3. Loading model from checkpoint: {checkpoint_path}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        state_dict = load_file(model_weights_path)

        # Diagnostic listing of the stored parameter names.
        print("\n--- State Dictionary Keys ---")
        for key in state_dict:
            print(key)
        print("-----------------------------\n")

        model.load_state_dict(state_dict)
        print(" Model weights loaded successfully from model.safetensors.")

        # --- Initialize Trainer ---
        # A minimal Trainer is sufficient for predictions; reporting is
        # switched off to match the training configuration.
        trainer = Trainer(
            model=model,
            args=TrainingArguments(output_dir="./temp_results", report_to="none"),
        )

        # --- Create Test Dataset ---
        print("4. Creating test dataset...")
        test_dataset = ReadabilityDataset(
            texts=test_df['d3tok_text'].tolist(),
            features=test_df['features'].tolist(),
            tokenizer_obj=tokenizer,
        )

        # --- Generate Predictions ---
        print("5. Generating predictions on the test set...")
        raw_predictions = trainer.predict(test_dataset)
        predictions_logits = raw_predictions.predictions.flatten()

        # --- Process Predictions ---
        # Round, clip to [0, TARGET_CLASSES - 1], then shift back to the
        # 1-based class labels.
        clipped_preds = np.clip(np.round(predictions_logits), 0, TARGET_CLASSES - 1)
        final_predictions = (clipped_preds + 1).astype(int)
        test_df['Prediction'] = final_predictions
        print(" Predictions generated and processed.")

        # --- Create and Save Submission File ---
        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]
        print(f"\n6. Saving final prediction CSV to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"7. Compressing submission file into: {ZIPPED_SUBMISSION_PATH}")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))

        print("\n--- ✅ All Done! ---")
        print(f"Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' has been saved to your Google Drive.")
        print(f"Location: {SUBMISSION_DIR}")

    except FileNotFoundError as e:
        print(f"❌ ERROR: A required file was not found: {e}")
        print(" Please ensure all paths in the configuration section are correct.")
    except Exception as e:
        # Top-level boundary of the notebook cell: report instead of raising.
        print(f"❌ An unexpected error occurred: {e}")
551
+
552
+ """# 5. EXECUTE SCRIPT"""
553
+
554
+ # =====================================================================================
555
+ # 5. EXECUTE SCRIPT
556
+ # =====================================================================================
557
+
558
+ if __name__ == '__main__':
559
+ generate_predictions()
560
+
561
+ """# Results of Sentence-level Readability Assessment - Constrained on The Blind Test
562
+ {'accuracy': 42.1, 'accuracy+-1': 71.6, 'avg_abs_dist': 1.2, 'qwk': 82.1, 'accuracy_7': 59.9, 'accuracy_5': 65.4, 'accuracy_3': 73.4}
563
+ """
samer_preprocessing_feature_extraction.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =====================================================================================
2
+ # 0. INSTALLATIONS & IMPORTS
3
+ # =====================================================================================
4
+ # This will install all necessary libraries quietly.
5
+ # !pip install transformers[torch] datasets pandas scikit-learn arabert accelerate pyarrow -q
6
+
7
+ import pandas as pd
8
+ import numpy as np
9
+ import os
10
+ from transformers import AutoTokenizer
11
+ from arabert.preprocess import ArabertPreprocessor
12
+
13
+ print("✅ Libraries imported successfully.")
14
+
15
+ # =====================================================================================
16
+ # 1. CONFIGURATION
17
+ # =====================================================================================
18
+ # --- Model & Tokenizer ---
19
+ # Using the model from your original script for consistency
20
+ MODEL_NAME = "aubmindlab/bert-large-arabertv2"
21
+ MAX_LENGTH = 256 # Max sequence length for tokenizer
22
+
23
+ # --- File Paths ---
24
+ # Assumes your initial dataset is in the default /kaggle/input/sentses directory
25
+ RAW_DATA_DIR = '/kaggle/input/sentses/'
26
+ BAREC_TRAIN_PATH = os.path.join(RAW_DATA_DIR, 'train.csv')
27
+ BAREC_DEV_PATH = os.path.join(RAW_DATA_DIR, 'dev.csv')
28
+ BLIND_TEST_PATH = os.path.join(RAW_DATA_DIR, 'blind_test_data.csv')
29
+ # The SAMER files are assumed to be in the same directory for this example
30
+ SAMER_CORPUS_PATH = os.path.join(RAW_DATA_DIR, 'samer_train.tsv')
31
+ SAMER_LEXICON_PATH = os.path.join(RAW_DATA_DIR, 'SAMER-Readability-Lexicon-v2.tsv')
32
+
33
+ # --- Output Path ---
34
+ # Processed files will be saved here, ready for output
35
+ OUTPUT_DIR = '/kaggle/working/'
36
+
37
+ # --- Initialize Processors ---
38
+ try:
39
+ arabert_preprocessor = ArabertPreprocessor(model_name=MODEL_NAME)
40
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
41
+ print("✅ AraBERT Preprocessor and Tokenizer initialized.")
42
+ except Exception as e:
43
+ print(f"❗️ Error initializing processors: {e}")
44
+ # Exit if the core components can't be loaded
45
+ exit()
46
+
47
+ # =====================================================================================
48
+ # 2. DATA LOADING FUNCTIONS
49
+ # =====================================================================================
50
+
51
def load_samer_lexicon(file_path):
    """Load the SAMER readability lexicon as a ``{lemma: level}`` dict.

    The TSV must have a ``lemma#pos`` column and a
    ``readability (rounded average)`` column. The part-of-speech suffix is
    dropped; when a lemma occurs with several parts of speech the last row
    wins.

    Args:
        file_path: Path to the tab-separated lexicon file.

    Returns:
        The lexicon dict, or ``{}`` when the file does not exist (lexical
        features are then disabled downstream).
    """
    print("\n--- Loading SAMER Lexicon ---")
    try:
        df = pd.read_csv(file_path, sep='\t')
    except FileNotFoundError:
        print("❗️ SAMER Lexicon not found. Lexical features will be disabled.")
        return {}

    # Split on the LAST '#' only: a lemma that itself contains '#' would
    # otherwise produce more than two parts and break the column assignment.
    lemmas = df['lemma#pos'].str.rsplit('#', n=1).str[0]
    lexicon_dict = (
        df.assign(lemma=lemmas)
        .set_index('lemma')['readability (rounded average)']
        .to_dict()
    )
    print(f"Loaded {len(lexicon_dict)} lemmas into lexicon dictionary.")
    return lexicon_dict
63
+
64
def load_training_and_validation_data(lexicon, train_path=None, dev_path=None, samer_path=None):
    """Load the BAREC train/dev splits and augment training with SAMER text.

    Args:
        lexicon: Unused; kept so existing callers keep working.
        train_path: BAREC training CSV. Defaults to ``BAREC_TRAIN_PATH``.
        dev_path: BAREC validation CSV. Defaults to ``BAREC_DEV_PATH``.
        samer_path: SAMER corpus TSV. Defaults to ``SAMER_CORPUS_PATH``.

    Returns:
        ``(train_df, val_df)`` with ``text`` and integer ``label`` columns.
        The training frame additionally contains the SAMER sentences and is
        shuffled with a fixed seed; when the SAMER file is missing the BAREC
        training data is returned on its own. ``(None, None)`` is returned
        when the BAREC files cannot be loaded.
    """
    train_path = BAREC_TRAIN_PATH if train_path is None else train_path
    dev_path = BAREC_DEV_PATH if dev_path is None else dev_path
    samer_path = SAMER_CORPUS_PATH if samer_path is None else samer_path

    column_map = {'Sentence': 'text', 'Readability_Level_19': 'label'}

    def read_barec(path):
        frame = pd.read_csv(path)[list(column_map)].rename(columns=column_map)
        frame = frame.dropna(subset=['text', 'label'])
        # Cast here so both the augmented and the BAREC-only return paths
        # (and the validation split) carry integer labels.
        return frame.assign(label=frame['label'].astype(int))

    print("\n--- Loading BAREC Training & Validation Data ---")
    try:
        train_df = read_barec(train_path)
        val_df = read_barec(dev_path)
        print(f"Loaded {len(train_df)} BAREC training sentences and {len(val_df)} validation sentences.")
    except Exception as e:
        # Best-effort: the caller treats (None, None) as "abort".
        print(f"❗️ ERROR loading BAREC CSVs: {e}")
        return None, None

    print("\n--- Loading SAMER Corpus for Augmentation ---")
    try:
        samer_df = pd.read_csv(samer_path, sep='\t')
    except FileNotFoundError:
        print("❗️ SAMER Corpus not found, proceeding with BAREC data only.")
        return train_df, val_df

    # SAMER level column -> label assigned to its sentences.
    samer_level_map = {'L3': 4, 'L4': 10, 'L5': 16}
    samer_records = [
        samer_df[[level_name]]
        .dropna()
        .rename(columns={level_name: 'text'})
        .assign(label=barec_level)
        for level_name, barec_level in samer_level_map.items()
    ]
    samer_augmentation_df = pd.concat(samer_records, ignore_index=True)
    print(f"Loaded {len(samer_augmentation_df)} sentences from SAMER.")

    full_train_df = pd.concat([train_df, samer_augmentation_df], ignore_index=True)
    full_train_df = full_train_df.dropna(subset=['text', 'label'])
    full_train_df['label'] = full_train_df['label'].astype(int)
    # Fixed seed keeps the shuffled order reproducible between runs.
    full_train_df = full_train_df.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Total unified training sentences: {len(full_train_df)}")
    return full_train_df, val_df
98
+
99
def load_blind_test_data(file_path):
    """Load the blind test CSV and expand it to one row per line of text.

    The ``ID`` and ``Sentence`` columns are renamed to ``doc_id`` and
    ``text``; each ``text`` value is split on newlines and every piece gets
    its own row (keeping its ``doc_id``). Rows without text are dropped.

    Args:
        file_path: Path to the blind test CSV.

    Returns:
        The expanded DataFrame with a contiguous 0..n-1 index, or ``None``
        when the file cannot be loaded.
    """
    print(f"\n--- Loading Blind Test Data ---")
    try:
        doc_test_df = pd.read_csv(file_path).rename(columns={'ID': 'doc_id', 'Sentence': 'text'})
        sentence_test_df = (
            doc_test_df.assign(text=doc_test_df['text'].str.split('\n'))
            .explode('text')
            .dropna(subset=['text'])
            # Reset AFTER dropping empty rows so the index has no gaps.
            .reset_index(drop=True)
        )
        print(f"Loaded and exploded {len(sentence_test_df)} sentences for prediction.")
        return sentence_test_df
    except Exception as e:
        # Best-effort: the caller treats None as "abort".
        print(f"❗️ ERROR loading blind test file: {e}")
        return None
111
+
112
+ # =====================================================================================
113
+ # 3. FEATURE ENGINEERING & PREPROCESSING FUNCTION
114
+ # =====================================================================================
115
+
116
def get_lexical_features(text, lexicon, default_difficulty=3.0, hard_threshold=4):
    """Compute seven lexical features for a text from a difficulty lexicon.

    Args:
        text: The (preprocessed) text; whitespace-separated words are looked
            up in ``lexicon``.
        lexicon: Mapping of word -> numeric difficulty.
        default_difficulty: Difficulty assumed for words not in ``lexicon``.
        hard_threshold: A word counts as "hard" when its difficulty is
            strictly greater than this value.

    Returns:
        A list of seven floats:
        ``[char_count, word_count, mean_word_length, mean_difficulty,
        max_difficulty, hard_word_count, out_of_lexicon_ratio]``.
        All zeros when the lexicon is empty, ``text`` is not a string, or
        ``text`` contains no words.
    """
    # Same width in every case so rows can be stacked into one matrix.
    no_features = [0.0] * 7
    if not lexicon or not isinstance(text, str):
        return no_features

    words = text.split()
    if not words:
        return no_features

    difficulties = np.array(
        [lexicon.get(word, default_difficulty) for word in words], dtype=float
    )
    unknown_count = sum(1 for word in words if word not in lexicon)

    # Plain Python floats keep the column type uniform when serialised.
    return [
        float(len(text)),
        float(len(words)),
        float(np.mean([len(word) for word in words])),
        float(difficulties.mean()),
        float(difficulties.max()),
        float(np.sum(difficulties > hard_threshold)),
        float(unknown_count / len(words)),
    ]
137
+
138
def process_dataframe(df, lexicon, is_test=False):
    """Preprocess text, add lexical features and tokenize a dataframe.

    Args:
        df: DataFrame with a ``text`` column, plus ``doc_id`` (test) or
            ``label`` (train/val).
        lexicon: Word -> difficulty mapping passed to
            ``get_lexical_features``.
        is_test: When true the output keeps ``doc_id`` instead of ``label``.

    Returns:
        A new DataFrame with ``input_ids``, ``attention_mask``, ``features``
        and either ``doc_id`` or ``label``. The input frame is not modified.
    """
    print(f"\n--- Starting processing for {'Test' if is_test else 'Train/Val'} dataframe ---")

    # Work on a copy so the caller's frame does not gain helper columns.
    df = df.copy()

    # 1. Clean and preprocess text
    print("Step 1: Applying AraBERT preprocessor...")
    df['text_preprocessed'] = df['text'].apply(arabert_preprocessor.preprocess)

    # 2. Engineer lexical features (one float vector per row)
    print("Step 2: Engineering lexical features...")
    feature_matrix = np.array(
        [get_lexical_features(text, lexicon) for text in df['text_preprocessed']]
    )
    df['features'] = list(feature_matrix)

    # 3. Tokenize text, padded/truncated to MAX_LENGTH
    print("Step 3: Tokenizing text...")
    encodings = tokenizer(
        df['text_preprocessed'].tolist(),
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    df['input_ids'] = encodings['input_ids']
    df['attention_mask'] = encodings['attention_mask']

    # 4. Select final columns
    id_or_label = 'doc_id' if is_test else 'label'
    final_cols = [id_or_label, 'input_ids', 'attention_mask', 'features']

    print("✅ Processing complete.")
    return df[final_cols]
170
+
171
+ # =====================================================================================
172
+ # 4. EXECUTION
173
+ # =====================================================================================
174
+
175
+ # Load all data first
176
+ samer_lexicon = load_samer_lexicon(SAMER_LEXICON_PATH)
177
+ train_df, val_df = load_training_and_validation_data(samer_lexicon)
178
+ test_df = load_blind_test_data(BLIND_TEST_PATH)
179
+
180
+ # Check if data loading was successful before proceeding
181
+ if train_df is not None and val_df is not None and test_df is not None:
182
+ # Process each dataframe
183
+ processed_train_df = process_dataframe(train_df, samer_lexicon)
184
+ processed_val_df = process_dataframe(val_df, samer_lexicon)
185
+ processed_test_df = process_dataframe(test_df, samer_lexicon, is_test=True)
186
+
187
+ # Save the processed dataframes to Feather files
188
+ print("\n--- Saving processed dataframes to Feather files ---")
189
+
190
+ train_save_path = os.path.join(OUTPUT_DIR, 'train_processed.feather')
191
+ val_save_path = os.path.join(OUTPUT_DIR, 'val_processed.feather')
192
+ test_save_path = os.path.join(OUTPUT_DIR, 'test_processed.feather')
193
+
194
+ processed_train_df.to_feather(train_save_path)
195
+ print(f"✅ Training data saved to {train_save_path}")
196
+
197
+ processed_val_df.to_feather(val_save_path)
198
+ print(f"✅ Validation data saved to {val_save_path}")
199
+
200
+ processed_test_df.to_feather(test_save_path)
201
+ print(f"✅ Test data saved to {test_save_path}")
202
+
203
+ print("\n🎉 All preprocessing is complete. You can now save this notebook's output as a new dataset.")
204
+ else:
205
+ print("\n❗️ Script aborted due to data loading errors.")