Oguzz07 committed on
Commit
6f65cd1
·
verified ·
1 Parent(s): 3b69a24

Add causal_selection/meta_learner/trainer.py

Browse files
causal_selection/meta_learner/trainer.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Meta-learner: trains models to predict algorithm performance from dataset meta-features.
3
+ Supports multi-output regression (predict SHD per algorithm) and ranking evaluation.
4
+ """
5
+ import os
6
+ import numpy as np
7
+ import pandas as pd
8
+ import json
9
+ import logging
10
+ from collections import defaultdict
11
+
12
+ from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
13
+ from sklearn.multioutput import MultiOutputRegressor
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
16
+ from sklearn.metrics import mean_squared_error, mean_absolute_error
17
+ import joblib
18
+
19
+ from causal_selection.features.extractor import FEATURE_NAMES
20
+ from causal_selection.discovery.algorithms import ALGORITHM_POOL
21
+
22
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Algorithm names in the iteration order of the algorithm pool.
# NOTE(review): get_feature_importance pairs these names positionally with the
# fitted per-output estimators, so this order presumably has to match the
# column order of the SHD target matrices -- confirm against the code that
# writes those CSV files.
ALGO_NAMES = list(ALGORITHM_POOL)

# Default locations of the meta-dataset CSV files and of the saved model files.
RESULTS_DIR = '/app/causal_selection/data/results'
MODEL_DIR = '/app/causal_selection/models'
27
+
28
+
29
def load_meta_dataset(results_dir=RESULTS_DIR):
    """Read the four meta-dataset CSV tables stored in ``results_dir``.

    Args:
        results_dir: directory containing the CSV files.

    Returns:
        Tuple ``(X, Y_shd, Y_nshd, configs)`` of DataFrames read from
        ``meta_features.csv``, ``shd_matrix.csv``,
        ``normalized_shd_matrix.csv`` and ``configs.csv`` respectively.
    """
    file_names = (
        'meta_features.csv',
        'shd_matrix.csv',
        'normalized_shd_matrix.csv',
        'configs.csv',
    )
    # The files are read in the order listed above.
    X, Y_shd, Y_nshd, configs = (
        pd.read_csv(os.path.join(results_dir, file_name))
        for file_name in file_names
    )
    return X, Y_shd, Y_nshd, configs
36
+
37
+
38
def train_meta_learner(X, Y, model_type='rf', **model_kwargs):
    """Fit one regressor per target column on standardized features.

    Args:
        X: feature matrix (n_tasks, n_features)
        Y: target matrix (n_tasks, n_algorithms) - SHD values
        model_type: 'rf' (RandomForestRegressor) or 'gbm'
            (GradientBoostingRegressor)
        **model_kwargs: optional hyper-parameter overrides. Only
            ``n_estimators``, ``max_depth``, ``min_samples_leaf`` and, for
            'gbm', ``learning_rate`` are read; any other key is ignored.

    Returns:
        (model, scaler): the fitted MultiOutputRegressor and the
        StandardScaler that was fitted on ``X``.

    Raises:
        ValueError: if ``model_type`` is neither 'rf' nor 'gbm'.
    """
    option = model_kwargs.get

    scaler = StandardScaler()
    features = scaler.fit_transform(X)

    if model_type not in ('rf', 'gbm'):
        raise ValueError(f"Unknown model type: {model_type}")

    if model_type == 'rf':
        base = RandomForestRegressor(
            n_estimators=option('n_estimators', 200),
            max_depth=option('max_depth', None),
            min_samples_leaf=option('min_samples_leaf', 2),
            random_state=42,
            n_jobs=-1,
        )
    else:
        base = GradientBoostingRegressor(
            n_estimators=option('n_estimators', 200),
            max_depth=option('max_depth', 5),
            learning_rate=option('learning_rate', 0.1),
            min_samples_leaf=option('min_samples_leaf', 3),
            random_state=42,
        )

    # MultiOutputRegressor clones ``base`` once per target column.
    model = MultiOutputRegressor(base)
    model.fit(features, Y)
    return model, scaler
75
+
76
+
77
def predict_top_k(model, scaler, X_new, k=3):
    """Predict top-k algorithms for new dataset(s).

    Args:
        model: trained multi-output model exposing ``predict``
        scaler: fitted scaler exposing ``transform``
        X_new: feature matrix (n_new, n_features), or a single 1-D feature
            vector (n_features,) describing one dataset
        k: number of top algorithms to return; values larger than the number
            of algorithms return all of them

    Returns:
        top_k_indices: (n_new, k) array of algorithm indices (sorted by
            predicted SHD ascending; ties are broken by lower index)
        predicted_shd: (n_new, n_algorithms) full predicted SHD matrix

    Raises:
        ValueError: if ``k`` is negative.
    """
    # A negative k would make the ``[:, :k]`` slice drop columns from the end
    # instead of selecting the best ones.
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # A bare feature vector describes exactly one dataset: promote it to a
    # single-row matrix before it reaches the scaler.
    if np.ndim(X_new) == 1:
        X_new = np.asarray(X_new).reshape(1, -1)

    predicted = np.asarray(model.predict(scaler.transform(X_new)))
    if predicted.ndim == 1:
        predicted = predicted.reshape(1, -1)

    # Stable sort keeps equal predictions in index order, so the selection is
    # reproducible when predictions tie.
    top_k_indices = np.argsort(predicted, axis=1, kind='stable')[:, :k]
    return top_k_indices, predicted
98
+
99
+
100
def evaluate_lono_cv(X, Y, configs, model_type='rf', k=3, **model_kwargs):
    """Leave-One-Network-Out Cross-Validation.

    For each distinct value of ``configs['network']`` a fresh scaler and model
    are fitted (via ``train_meta_learner``) on the rows of all other networks
    and scored on the rows of the held-out network. Networks whose training
    split has fewer than 3 rows are skipped with a warning.

    Args:
        X: feature table (n_tasks, n_features); DataFrame or array-like
        Y: target table (n_tasks, n_algorithms); DataFrame or array-like
        configs: DataFrame aligned row-by-row with ``X`` and ``Y`` that has
            the columns 'network', 'n_samples' and 'seed'
        model_type: 'rf' or 'gbm'
        k: top-k cut-off used by the ranking metrics
        **model_kwargs: forwarded to ``train_meta_learner``

    Returns:
        results: dict with keys
            'per_network': metrics dict per evaluated network, including
                'n_test' and 'n_train'
            'all_predictions' / 'all_true': predicted and true target rows of
                all evaluated folds, as nested lists
            'all_configs': the matching 'network'/'n_samples'/'seed' records
            'overall': metrics computed over all pooled test rows

    Raises:
        ValueError: if ``model_type`` is unknown, or if every network was
            skipped so that nothing could be evaluated.
    """
    # Fail before any training; previously every value other than 'rf' was
    # silently evaluated as gradient boosting.
    if model_type not in ('rf', 'gbm'):
        raise ValueError(f"Unknown model type: {model_type}")

    networks = configs['network'].values
    X_all = np.asarray(X)
    Y_all = np.asarray(Y)

    results = {
        'per_network': {},
        'all_predictions': [],
        'all_true': [],
        'all_configs': [],
    }

    for test_net in sorted(configs['network'].unique()):
        test_mask = networks == test_net
        train_mask = ~test_mask

        if train_mask.sum() < 3:
            logger.warning(f"Skipping {test_net}: only {train_mask.sum()} training samples")
            continue

        # The scaler is fitted on the training fold only, so no statistics of
        # the held-out network leak into training.
        model, scaler = train_meta_learner(
            X_all[train_mask], Y_all[train_mask],
            model_type=model_type, **model_kwargs,
        )

        Y_test = Y_all[test_mask]
        Y_pred = model.predict(scaler.transform(X_all[test_mask]))

        net_metrics = _compute_ranking_metrics(Y_pred, Y_test, k=k)
        net_metrics['n_test'] = int(test_mask.sum())
        net_metrics['n_train'] = int(train_mask.sum())
        results['per_network'][test_net] = net_metrics

        results['all_predictions'].extend(Y_pred.tolist())
        results['all_true'].extend(Y_test.tolist())
        results['all_configs'].extend(
            configs[test_mask][['network', 'n_samples', 'seed']].to_dict('records')
        )

        logger.info(f" {test_net:15s}: top{k}_hit={net_metrics['top_k_hit_rate']:.3f} "
                    f"regret={net_metrics['mean_regret']:.2f} "
                    f"ndcg={net_metrics['ndcg_at_k']:.3f}")

    if not results['all_predictions']:
        raise ValueError(
            "No network could be evaluated: every leave-one-network-out fold "
            "had fewer than 3 training samples"
        )

    # Overall metrics over the pooled test rows of all folds
    all_pred = np.array(results['all_predictions'])
    all_true = np.array(results['all_true'])
    results['overall'] = _compute_ranking_metrics(all_pred, all_true, k=k)

    return results
185
+
186
+
187
def _compute_ranking_metrics(Y_pred, Y_true, k=3):
    """Compute ranking metrics for algorithm selection.

    Args:
        Y_pred: (n, n_algos) predicted SHD values
        Y_true: (n, n_algos) true SHD values
        k: top-k to consider (must be >= 1)

    Returns:
        dict of builtin floats (JSON-serializable):
            'top_k_hit_rate': fraction of rows where the predicted top-k
                contains an algorithm achieving the best true SHD. Ties for
                the best true SHD all count, so a row with zero regret is
                always a hit.
            'top_k_overlap_rate': mean of |true top-k & predicted top-k| / k
            'mean_regret', 'median_regret', 'max_regret': statistics of the
                best true SHD inside the predicted top-k minus the oracle SHD
            'ndcg_at_k': mean NDCG@k
            'mean_pred_mse', 'mean_pred_mae': regression errors

    Raises:
        ValueError: if ``k`` < 1, the inputs are empty, or their shapes are
            not identical 2-D shapes.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    Y_pred = np.asarray(Y_pred)
    Y_true = np.asarray(Y_true)
    if Y_pred.size == 0 or Y_true.size == 0:
        raise ValueError("Cannot compute ranking metrics on empty input")
    if Y_pred.ndim != 2 or Y_pred.shape != Y_true.shape:
        raise ValueError(
            f"Y_pred and Y_true must share one 2-D shape, got "
            f"{Y_pred.shape} and {Y_true.shape}"
        )

    n = Y_pred.shape[0]
    top_k_hits = 0
    overlap_total = 0.0
    regrets = []
    ndcgs = []

    for true_row, pred_row in zip(Y_true, Y_pred):
        # Stable sorts break ties by algorithm index, keeping the rankings
        # reproducible.
        true_top_k = np.argsort(true_row, kind='stable')[:k]
        pred_top_k = np.argsort(pred_row, kind='stable')[:k]

        # SHD regret: SHD of best in predicted top-k minus oracle best SHD
        oracle_shd = true_row.min()
        best_selected_shd = true_row[pred_top_k].min()
        regrets.append(best_selected_shd - oracle_shd)

        # Top-k hit: comparing SHD values instead of one arbitrary argsort
        # winner gives credit for every algorithm tied for the best true SHD.
        if best_selected_shd == oracle_shd:
            top_k_hits += 1

        ndcgs.append(_ndcg_at_k(true_row, pred_row, k))

        # Share of the true top-k that also appears in the predicted top-k
        shared = set(true_top_k.tolist()) & set(pred_top_k.tolist())
        overlap_total += len(shared) / k

    # Cast to builtin float: numpy integer scalars (e.g. the max regret of
    # integer SHD values) are rejected by json.dump.
    return {
        'top_k_hit_rate': top_k_hits / n,
        'top_k_overlap_rate': overlap_total / n,
        'mean_regret': float(np.mean(regrets)),
        'median_regret': float(np.median(regrets)),
        'max_regret': float(np.max(regrets)),
        'ndcg_at_k': float(np.mean(ndcgs)),
        'mean_pred_mse': float(mean_squared_error(Y_true, Y_pred)),
        'mean_pred_mae': float(mean_absolute_error(Y_true, Y_pred)),
    }
241
+
242
+
243
+ def _ndcg_at_k(true_scores, pred_scores, k):
244
+ """Normalized Discounted Cumulative Gain at k.
245
+
246
+ For algorithm selection: lower SHD = better, so we negate scores for ranking.
247
+ """
248
+ # Convert SHD to relevance: rel = max_shd - shd (higher = better)
249
+ max_shd = max(true_scores.max(), 1)
250
+ relevance = max_shd - true_scores
251
+
252
+ # Predicted ranking
253
+ pred_order = np.argsort(pred_scores)[:k]
254
+
255
+ # DCG
256
+ dcg = 0
257
+ for rank, idx in enumerate(pred_order):
258
+ dcg += relevance[idx] / np.log2(rank + 2)
259
+
260
+ # Ideal DCG
261
+ ideal_order = np.argsort(-relevance)[:k]
262
+ idcg = 0
263
+ for rank, idx in enumerate(ideal_order):
264
+ idcg += relevance[idx] / np.log2(rank + 2)
265
+
266
+ return dcg / idcg if idcg > 0 else 0
267
+
268
+
269
def get_feature_importance(model, feature_names=FEATURE_NAMES, algo_names=ALGO_NAMES):
    """Collect per-algorithm and averaged feature importances.

    Args:
        model: fitted multi-output model exposing ``estimators_``
        feature_names: names paired positionally with each estimator's
            ``feature_importances_``
        algo_names: names paired positionally with ``model.estimators_``

    Returns:
        (avg_importance, importances): ``importances`` maps algorithm name to
        a ``{feature: importance}`` dict for every estimator that has
        ``feature_importances_``; ``avg_importance`` maps each feature to the
        sum of its importances divided by the number of such estimators.
    """
    importances = {
        algo: dict(zip(feature_names, estimator.feature_importances_))
        for algo, estimator in zip(algo_names, model.estimators_)
        if hasattr(estimator, 'feature_importances_')
    }

    # Average importance across algorithms
    n_algos = len(importances)
    avg_importance = {}
    for per_feature in importances.values():
        for feat, val in per_feature.items():
            avg_importance[feat] = avg_importance.get(feat, 0.0) + val / n_algos

    return avg_importance, importances
283
+
284
+
285
def save_model(model, scaler, model_dir=MODEL_DIR):
    """Write the model and scaler to ``model_dir`` with joblib.

    The directory is created if missing. The model is stored as
    ``meta_learner.pkl`` and the scaler as ``scaler.pkl``.
    """
    os.makedirs(model_dir, exist_ok=True)
    artifacts = (
        ('meta_learner.pkl', model),
        ('scaler.pkl', scaler),
    )
    for file_name, artifact in artifacts:
        joblib.dump(artifact, os.path.join(model_dir, file_name))
    logger.info(f"Model saved to {model_dir}")
291
+
292
+
293
def load_model(model_dir=MODEL_DIR):
    """Read ``meta_learner.pkl`` and ``scaler.pkl`` from ``model_dir``.

    Returns:
        (model, scaler) as deserialized by joblib.
    """
    # NOTE: joblib files are pickle-based, so loading executes whatever the
    # file encodes; only point this at directories you trust.
    model_path = os.path.join(model_dir, 'meta_learner.pkl')
    scaler_path = os.path.join(model_dir, 'scaler.pkl')
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)
    return model, scaler
298
+
299
+
300
if __name__ == '__main__':
    # Script entry point: evaluate both model families with
    # leave-one-network-out CV, train the better one on all data, save it and
    # write a JSON summary next to the meta-dataset.

    def _banner(title):
        """Print ``title`` framed by two 80-character rules."""
        print("\n" + "=" * 80)
        print(title)
        print("=" * 80)

    def _print_metrics(heading, metrics):
        """Print ``heading`` followed by one aligned line per metric."""
        print(heading)
        for name, value in metrics.items():
            print(f" {name:25s}: {value:.4f}")

    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

    # Load meta-dataset
    X, Y_shd, Y_nshd, configs = load_meta_dataset()

    network_column = configs['network']
    print(f"Meta-dataset: X={X.shape}, Y_shd={Y_shd.shape}")
    print(f"Networks: {network_column.unique()}")
    print(f"Configs per network:")
    print(network_column.value_counts().to_string())

    # Evaluate with LONO-CV (targets are the normalized SHD values)
    _banner("LEAVE-ONE-NETWORK-OUT CV (RandomForest)")
    results_rf = evaluate_lono_cv(X, Y_nshd, configs, model_type='rf', k=3)
    _print_metrics(f"\nOverall Results (RF):", results_rf['overall'])

    _banner("LEAVE-ONE-NETWORK-OUT CV (GradientBoosting)")
    results_gbm = evaluate_lono_cv(X, Y_nshd, configs, model_type='gbm', k=3)
    _print_metrics(f"\nOverall Results (GBM):", results_gbm['overall'])

    # Train final model on all data
    _banner("TRAINING FINAL MODEL")

    # The random forest wins ties on the top-k hit rate.
    rf_hit_rate = results_rf['overall']['top_k_hit_rate']
    gbm_hit_rate = results_gbm['overall']['top_k_hit_rate']
    if rf_hit_rate >= gbm_hit_rate:
        best_type = 'rf'
    else:
        best_type = 'gbm'
    print(f"Selected model type: {best_type}")

    model, scaler = train_meta_learner(X, Y_nshd, model_type=best_type)
    save_model(model, scaler)

    # Feature importance
    avg_imp, per_algo_imp = get_feature_importance(model)
    print("\nTop 10 Most Important Features:")
    ranked_features = sorted(avg_imp.items(), key=lambda item: -item[1])
    for feat, imp in ranked_features[:10]:
        print(f" {feat:30s}: {imp:.4f}")

    # Save all evaluation results
    summary = {
        'rf': dict(results_rf['overall']),
        'gbm': dict(results_gbm['overall']),
        'feature_importance': avg_imp,
        'selected_model': best_type,
    }
    summary_path = os.path.join(RESULTS_DIR, 'evaluation_results.json')
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)