Joblib
ynuozhang committed on
Commit baf3373 · 1 Parent(s): b8c6018

update code

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. metrics/nonfouling/train_predictions_binary.csv +0 -0
  2. metrics/nonfouling/val_predictions_binary.csv +3 -3438
  3. tokenizer/.ipynb_checkpoints/my_tokenizers-checkpoint.py +398 -0
  4. tokenizer/__pycache__/my_tokenizers.cpython-310.pyc +0 -0
  5. tokenizer/my_tokenizers.py +398 -0
  6. tokenizer/new_splits.txt +159 -0
  7. tokenizer/new_vocab.txt +586 -0
  8. training_classifiers/.gitignore +0 -0
  9. training_classifiers/.ipynb_checkpoints/binding_affinity_iptm-checkpoint.py +132 -0
  10. training_classifiers/.ipynb_checkpoints/binding_affinity_split-checkpoint.py +847 -0
  11. training_classifiers/.ipynb_checkpoints/binding_training-checkpoint.py +414 -0
  12. training_classifiers/.ipynb_checkpoints/binding_wt-checkpoint.bash +31 -0
  13. training_classifiers/.ipynb_checkpoints/finetune_boost-checkpoint.py +508 -0
  14. training_classifiers/.ipynb_checkpoints/generate_binding_val-checkpoint.py +309 -0
  15. training_classifiers/.ipynb_checkpoints/peptiverse_filelist-checkpoint.txt +234 -0
  16. training_classifiers/.ipynb_checkpoints/train_boost-checkpoint.py +417 -0
  17. training_classifiers/.ipynb_checkpoints/train_ml-checkpoint.py +468 -0
  18. training_classifiers/.ipynb_checkpoints/train_ml_regression-checkpoint.py +410 -0
  19. training_classifiers/.ipynb_checkpoints/train_nn-checkpoint.py +426 -0
  20. training_classifiers/.ipynb_checkpoints/train_nn_regression-checkpoint.py +420 -0
  21. training_data_cleaned/data_split.ipynb → training_classifiers/binding_affinity/val_smiles_pooled.csv +2 -2
  22. training_data_cleaned/nf_smiles_train.csv → training_classifiers/binding_affinity/val_smiles_unpooled.csv +2 -2
  23. training_data_cleaned/smiles_data_split.ipynb → training_classifiers/binding_affinity/val_wt_pooled.csv +2 -2
  24. training_data_cleaned/nf_smiles_val.csv → training_classifiers/binding_affinity/val_wt_unpooled.csv +2 -2
  25. training_classifiers/binding_affinity/wt_smiles_pooled/best_model.pt +3 -0
  26. training_classifiers/binding_affinity/wt_smiles_pooled/best_params.json +10 -0
  27. training_classifiers/binding_affinity/wt_smiles_pooled/optuna_trials.csv +3 -0
  28. training_classifiers/binding_affinity/wt_smiles_unpooled/.ipynb_checkpoints/best_params-checkpoint.json +10 -0
  29. training_classifiers/binding_affinity/wt_smiles_unpooled/best_model.pt +3 -0
  30. training_classifiers/binding_affinity/wt_smiles_unpooled/best_params.json +10 -0
  31. training_classifiers/binding_affinity/wt_smiles_unpooled/optuna_trials.csv +3 -0
  32. training_classifiers/binding_affinity/wt_wt_pooled/.ipynb_checkpoints/optuna_trials-checkpoint.csv +3 -0
  33. training_classifiers/binding_affinity/wt_wt_pooled/best_model.pt +3 -0
  34. training_classifiers/binding_affinity/wt_wt_pooled/best_params.json +10 -0
  35. training_classifiers/binding_affinity/wt_wt_pooled/optuna_trials.csv +3 -0
  36. training_classifiers/binding_affinity/wt_wt_unpooled/best_model.pt +3 -0
  37. training_classifiers/binding_affinity/wt_wt_unpooled/best_params.json +10 -0
  38. training_classifiers/binding_affinity/wt_wt_unpooled/optuna_trials.csv +3 -0
  39. training_classifiers/binding_affinity_iptm.py +132 -0
  40. training_classifiers/binding_affinity_split.py +847 -0
  41. training_classifiers/binding_training.py +414 -0
  42. training_classifiers/binding_wt.bash +31 -0
  43. training_classifiers/hemolysis/cnn_smiles/best_model.pt +3 -0
  44. training_classifiers/hemolysis/cnn_smiles/best_model_benchmark.json +39 -0
  45. training_classifiers/hemolysis/cnn_smiles/optimization_summary.txt +19 -0
  46. training_classifiers/hemolysis/cnn_smiles/pr_curve.png +0 -0
  47. training_classifiers/hemolysis/cnn_smiles/roc_curve.png +0 -0
  48. training_classifiers/hemolysis/cnn_smiles/study_trials.csv +3 -0
  49. training_classifiers/hemolysis/cnn_smiles/train_predictions.csv +3 -0
  50. training_classifiers/hemolysis/cnn_smiles/val_predictions.csv +3 -0
metrics/nonfouling/train_predictions_binary.csv CHANGED
The diff for this file is too large to render. See raw diff
 
metrics/nonfouling/val_predictions_binary.csv CHANGED
@@ -1,3438 +1,3 @@
1
- True Label,Predicted Probability,Predicted Label
2
- 0,0.21203287,0
3
- 1,0.7625684,1
4
- 0,0.0048168427,0
5
- 0,0.0016095717,0
6
- 0,0.0010283176,0
7
- 1,0.6661874,1
8
- 0,0.25750402,0
9
- 0,0.4311336,0
10
- 0,0.00085044367,0
11
- 0,0.0011397039,0
12
- 0,0.00088075985,0
13
- 0,0.0020098046,0
14
- 0,0.0010203379,0
15
- 0,0.0008713204,0
16
- 0,0.0017780334,0
17
- 0,0.00342609,0
18
- 0,0.00080001954,0
19
- 0,0.0027687384,0
20
- 0,0.5390543,1
21
- 0,0.8238043,1
22
- 0,0.0012005109,0
23
- 0,0.00097379234,0
24
- 0,0.10598443,0
25
- 0,0.25473288,0
26
- 0,0.0013989281,0
27
- 1,0.8493596,1
28
- 0,0.3361668,0
29
- 0,0.0011108345,0
30
- 0,0.00085180654,0
31
- 0,0.57711107,1
32
- 0,0.6323541,1
33
- 0,0.00092630496,0
34
- 0,0.000880342,0
35
- 1,0.7630482,1
36
- 0,0.14976597,0
37
- 1,0.6572999,1
38
- 0,0.0013483333,0
39
- 0,0.0010882191,0
40
- 0,0.0009723114,0
41
- 0,0.00078559143,0
42
- 0,0.33249167,0
43
- 0,0.0010722216,0
44
- 0,0.00078953034,0
45
- 1,0.6428388,1
46
- 1,0.24572088,0
47
- 1,0.6876133,1
48
- 0,0.20859648,0
49
- 0,0.0020129737,0
50
- 0,0.56601,1
51
- 0,0.0012045301,0
52
- 0,0.0010913766,0
53
- 0,0.00096886675,0
54
- 1,0.59327847,1
55
- 0,0.24701594,0
56
- 0,0.00094282435,0
57
- 0,0.00080143235,0
58
- 0,0.00093203504,0
59
- 1,0.38815123,0
60
- 0,0.1851648,0
61
- 0,0.0012724681,0
62
- 0,0.5877677,1
63
- 0,0.00086790195,0
64
- 0,0.00084711233,0
65
- 0,0.00089334225,0
66
- 0,0.07253498,0
67
- 0,0.003544662,0
68
- 0,0.06225388,0
69
- 0,0.5347445,1
70
- 0,0.0015253695,0
71
- 0,0.32455897,0
72
- 0,0.0011612711,0
73
- 0,0.7157994,1
74
- 0,0.0011453595,0
75
- 0,0.001003294,0
76
- 0,0.0008531371,0
77
- 1,0.69131196,1
78
- 0,0.00073970045,0
79
- 0,0.00097454654,0
80
- 0,0.0009003313,0
81
- 1,0.6489763,1
82
- 0,0.0010982461,0
83
- 0,0.00080810016,0
84
- 0,0.0012549972,0
85
- 1,0.44953474,0
86
- 0,0.20876294,0
87
- 0,0.73931056,1
88
- 0,0.0009876546,0
89
- 0,0.29462275,0
90
- 0,0.39917734,0
91
- 0,0.000782265,0
92
- 0,0.42951488,0
93
- 0,0.001463046,0
94
- 1,0.68715686,1
95
- 1,0.42291453,0
96
- 0,0.0022332973,0
97
- 0,0.0009480977,0
98
- 0,0.0024130554,0
99
- 0,0.06350319,0
100
- 1,0.66712207,1
101
- 0,0.00079292513,0
102
- 0,0.0009931386,0
103
- 0,0.6005864,1
104
- 0,0.0008592792,0
105
- 0,0.12034535,0
106
- 1,0.6524521,1
107
- 0,0.0009471925,0
108
- 0,0.04952685,0
109
- 0,0.081261575,0
110
- 0,0.00092108996,0
111
- 0,0.43716368,0
112
- 0,0.0010192527,0
113
- 0,0.0009138947,0
114
- 0,0.00087209453,0
115
- 0,0.0007799222,0
116
- 1,0.71242714,1
117
- 0,0.0009697011,0
118
- 0,0.25283346,0
119
- 0,0.0033405088,0
120
- 0,0.0009034709,0
121
- 0,0.0027930748,0
122
- 0,0.0011101163,0
123
- 1,0.34234285,0
124
- 0,0.0009976418,0
125
- 0,0.0009782554,0
126
- 0,0.0010062164,0
127
- 0,0.0012275511,0
128
- 0,0.0007695558,0
129
- 0,0.0019192833,0
130
- 0,0.15324375,0
131
- 0,0.0008446999,0
132
- 0,0.0010133649,0
133
- 0,0.00087288895,0
134
- 0,0.1176671,0
135
- 0,0.0011782635,0
136
- 0,0.4397572,0
137
- 0,0.0007951967,0
138
- 0,0.0012110751,0
139
- 0,0.00088101206,0
140
- 0,0.12597291,0
141
- 1,0.5714519,1
142
- 1,0.50501525,1
143
- 0,0.0012025698,0
144
- 0,0.5724967,1
145
- 0,0.09112893,0
146
- 0,0.0013447981,0
147
- 0,0.0011533678,0
148
- 1,0.57831836,1
149
- 0,0.48726425,0
150
- 0,0.0011903379,0
151
- 0,0.00084324833,0
152
- 0,0.00082829024,0
153
- 0,0.085955195,0
154
- 0,0.0008221822,0
155
- 1,0.62516063,1
156
- 0,0.0010661274,0
157
- 0,0.47287834,0
158
- 0,0.30710956,0
159
- 0,0.01076754,0
160
- 1,0.21308176,0
161
- 1,0.7633791,1
162
- 0,0.45939833,0
163
- 0,0.0010105146,0
164
- 1,0.7428216,1
165
- 0,0.15860216,0
166
- 1,0.2822227,0
167
- 0,0.0011596903,0
168
- 0,0.00090244703,0
169
- 0,0.0011741482,0
170
- 1,0.53843015,1
171
- 0,0.0031817604,0
172
- 0,0.0009357714,0
173
- 0,0.00084562885,0
174
- 0,0.40793115,0
175
- 0,0.0009336455,0
176
- 0,0.0012610556,0
177
- 0,0.0009685405,0
178
- 0,0.0008348695,0
179
- 0,0.00084012165,0
180
- 0,0.0011492906,0
181
- 0,0.0009675606,0
182
- 0,0.6298985,1
183
- 0,0.756409,1
184
- 0,0.0012567933,0
185
- 0,0.67586565,1
186
- 0,0.00087434775,0
187
- 0,0.49520925,0
188
- 0,0.00083109684,0
189
- 0,0.0015268176,0
190
- 0,0.0009127699,0
191
- 0,0.47751015,0
192
- 0,0.0009006195,0
193
- 0,0.0015889019,0
194
- 0,0.27180874,0
195
- 1,0.6914706,1
196
- 0,0.789135,1
197
- 0,0.0009752872,0
198
- 0,0.000897456,0
199
- 0,0.0011121263,0
200
- 0,0.0015918579,0
201
- 0,0.6213229,1
202
- 0,0.6309986,1
203
- 0,0.0020146987,0
204
- 0,0.4184653,0
205
- 0,0.0020128626,0
206
- 0,0.21320935,0
207
- 0,0.0008362684,0
208
- 0,0.001030758,0
209
- 0,0.0011157958,0
210
- 0,0.0009790816,0
211
- 1,0.7094025,1
212
- 1,0.9093414,1
213
- 0,0.17732719,0
214
- 0,0.0009374656,0
215
- 0,0.00094623194,0
216
- 1,0.8687336,1
217
- 0,0.65683585,1
218
- 0,0.001121003,0
219
- 0,0.590965,1
220
- 0,0.6421117,1
221
- 0,0.0010331942,0
222
- 0,0.0009423399,0
223
- 0,0.26386958,0
224
- 0,0.0009064185,0
225
- 0,0.016375644,0
226
- 0,0.001191659,0
227
- 0,0.17972796,0
228
- 0,0.8418873,1
229
- 0,0.0009111323,0
230
- 0,0.26404643,0
231
- 0,0.0010872352,0
232
- 0,0.0017061317,0
233
- 0,0.81021,1
234
- 0,0.00087346835,0
235
- 0,0.14280483,0
236
- 0,0.16507958,0
237
- 0,0.0008098925,0
238
- 0,0.00083044183,0
239
- 0,0.0012201187,0
240
- 0,0.0010541711,0
241
- 0,0.28025955,0
242
- 0,0.0012255623,0
243
- 0,0.0010174535,0
244
- 0,0.0016367439,0
245
- 0,0.73789763,1
246
- 0,0.32007608,0
247
- 0,0.12180342,0
248
- 0,0.0015864924,0
249
- 0,0.70650285,1
250
- 0,0.00086796284,0
251
- 0,0.0009819808,0
252
- 0,0.099008076,0
253
- 0,0.19955282,0
254
- 1,0.46594706,0
255
- 0,0.00096476724,0
256
- 1,0.7014958,1
257
- 0,0.6206068,1
258
- 1,0.40477422,0
259
- 0,0.0010020116,0
260
- 0,0.55101025,1
261
- 0,0.0010169167,0
262
- 0,0.0011705819,0
263
- 0,0.43195137,0
264
- 0,0.33231077,0
265
- 0,0.0010730977,0
266
- 1,0.6236388,1
267
- 1,0.8430124,1
268
- 0,0.0010129493,0
269
- 1,0.55188674,1
270
- 0,0.2805052,0
271
- 0,0.49679318,0
272
- 0,0.00085314247,0
273
- 0,0.0009600679,0
274
- 0,0.000983881,0
275
- 0,0.0010313862,0
276
- 0,0.0010591929,0
277
- 0,0.0014554731,0
278
- 0,0.2721196,0
279
- 0,0.0010521681,0
280
- 0,0.0008385733,0
281
- 0,0.0010760311,0
282
- 0,0.56204385,1
283
- 0,0.0009945527,0
284
- 0,0.6538143,1
285
- 0,0.000972335,0
286
- 0,0.0015788077,0
287
- 0,0.00078132324,0
288
- 0,0.0009190443,0
289
- 0,0.0009206846,0
290
- 0,0.14582296,0
291
- 0,0.0009473762,0
292
- 0,0.00079042354,0
293
- 0,0.0010401446,0
294
- 0,0.3425446,0
295
- 1,0.7118903,1
296
- 0,0.0011929816,0
297
- 0,0.8595175,1
298
- 0,0.5998442,1
299
- 0,0.0008737177,0
300
- 1,0.80021775,1
301
- 0,0.0067423894,0
302
- 0,0.37208188,0
303
- 0,0.0009025443,0
304
- 1,0.4007288,0
305
- 0,0.00084359787,0
306
- 0,0.010245014,0
307
- 0,0.0009981133,0
308
- 0,0.0007901667,0
309
- 1,0.8616254,1
310
- 1,0.37549037,0
311
- 0,0.0010908762,0
312
- 1,0.82627475,1
313
- 0,0.7775751,1
314
- 1,0.64397454,1
315
- 0,0.000756293,0
316
- 0,0.000830868,0
317
- 0,0.0008689119,0
318
- 0,0.21817225,0
319
- 1,0.54163814,1
320
- 0,0.001031394,0
321
- 0,0.4650486,0
322
- 1,0.42690405,0
323
- 0,0.000841264,0
324
- 0,0.003041604,0
325
- 0,0.00094686233,0
326
- 0,0.0120981,0
327
- 1,0.12877595,0
328
- 0,0.0010682141,0
329
- 0,0.26241425,0
330
- 0,0.00917543,0
331
- 0,0.4467414,0
332
- 0,0.00091395644,0
333
- 0,0.0012513435,0
334
- 0,0.6208037,1
335
- 0,0.09325837,0
336
- 0,0.0014404579,0
337
- 0,0.0013578972,0
338
- 0,0.564982,1
339
- 0,0.0011169427,0
340
- 0,0.0013792963,0
341
- 0,0.0019788202,0
342
- 0,0.11213151,0
343
- 0,0.00093545037,0
344
- 0,0.00083710946,0
345
- 0,0.0009821211,0
346
- 0,0.33052954,0
347
- 1,0.63790786,1
348
- 0,0.38441333,0
349
- 0,0.65978384,1
350
- 0,0.16266404,0
351
- 0,0.0009782265,0
352
- 0,0.7020993,1
353
- 1,0.5939269,1
354
- 0,0.00086739147,0
355
- 1,0.80537134,1
356
- 1,0.5931398,1
357
- 0,0.0010550644,0
358
- 0,0.000939566,0
359
- 0,0.001143589,0
360
- 0,0.0015645259,0
361
- 0,0.0010082681,0
362
- 0,0.0011559983,0
363
- 0,0.0030087254,0
364
- 0,0.0019511192,0
365
- 0,0.4628644,0
366
- 0,0.46333745,0
367
- 0,0.0009037835,0
368
- 0,0.0012149046,0
369
- 0,0.0011464095,0
370
- 1,0.53254116,1
371
- 0,0.0011695515,0
372
- 0,0.0011102897,0
373
- 0,0.0015082303,0
374
- 0,0.0009924652,0
375
- 0,0.29110697,0
376
- 0,0.0009876668,0
377
- 0,0.00079951587,0
378
- 0,0.0010723416,0
379
- 0,0.0009410649,0
380
- 0,0.0044702576,0
381
- 0,0.0016339025,0
382
- 1,0.58247274,1
383
- 0,0.00095149525,0
384
- 0,0.0010258186,0
385
- 0,0.00090839405,0
386
- 1,0.5408768,1
387
- 0,0.00091459276,0
388
- 0,0.00096972886,0
389
- 0,0.000929176,0
390
- 0,0.00096466026,0
391
- 0,0.0012642274,0
392
- 0,0.0013000804,0
393
- 1,0.64949554,1
394
- 1,0.824855,1
395
- 0,0.5840037,1
396
- 0,0.0022679865,0
397
- 1,0.70146537,1
398
- 0,0.0008208898,0
399
- 1,0.5012139,1
400
- 0,0.0010077616,0
401
- 0,0.0011317601,0
402
- 1,0.5587163,1
403
- 0,0.0008192366,0
404
- 0,0.00091622194,0
405
- 1,0.26070353,0
406
- 1,0.7721127,1
407
- 0,0.0011609249,0
408
- 1,0.5806655,1
409
- 0,0.001057503,0
410
- 0,0.0009096564,0
411
- 0,0.63499725,1
412
- 1,0.77146506,1
413
- 0,0.00092794717,0
414
- 0,0.0011426172,0
415
- 1,0.6665107,1
416
- 0,0.44717062,0
417
- 1,0.7875412,1
418
- 0,0.6128087,1
419
- 0,0.0018639723,0
420
- 0,0.0011927163,0
421
- 0,0.0011212432,0
422
- 0,0.0010541681,0
423
- 1,0.759266,1
424
- 0,0.000915701,0
425
- 1,0.8248112,1
426
- 1,0.2618734,0
427
- 1,0.5829796,1
428
- 0,0.000971727,0
429
- 0,0.34199846,0
430
- 0,0.22960144,0
431
- 0,0.0008905708,0
432
- 0,0.7192157,1
433
- 0,0.5267322,1
434
- 0,0.0011434992,0
435
- 1,0.82825303,1
436
- 0,0.0007324419,0
437
- 0,0.0009348062,0
438
- 0,0.0018712084,0
439
- 1,0.7346453,1
440
- 0,0.0008732254,0
441
- 0,0.0010398608,0
442
- 1,0.78774214,1
443
- 0,0.0010178161,0
444
- 0,0.40890852,0
445
- 0,0.0007731539,0
446
- 0,0.30410865,0
447
- 1,0.6904336,1
448
- 0,0.0016686685,0
449
- 0,0.17082378,0
450
- 0,0.0019347976,0
451
- 0,0.00089052453,0
452
- 1,0.6660989,1
453
- 0,0.0010408974,0
454
- 0,0.27290353,0
455
- 0,0.009841075,0
456
- 0,0.0012475859,0
457
- 1,0.5256839,1
458
- 1,0.22151299,0
459
- 0,0.00091817125,0
460
- 0,0.5700492,1
461
- 0,0.19963185,0
462
- 0,0.0009827572,0
463
- 0,0.0008537978,0
464
- 0,0.00092485244,0
465
- 0,0.0012399766,0
466
- 0,0.0013511787,0
467
- 0,0.10416922,0
468
- 0,0.0008518869,0
469
- 0,0.000871039,0
470
- 0,0.001035327,0
471
- 0,0.000883175,0
472
- 0,0.0025706466,0
473
- 0,0.29247305,0
474
- 0,0.0008869903,0
475
- 1,0.71239984,1
476
- 1,0.7177157,1
477
- 0,0.0007620353,0
478
- 0,0.0625917,0
479
- 0,0.0009716233,0
480
- 0,0.0010923475,0
481
- 0,0.0009638545,0
482
- 0,0.0014103759,0
483
- 0,0.586873,1
484
- 0,0.1582566,0
485
- 0,0.7623745,1
486
- 0,0.00090825645,0
487
- 0,0.008724699,0
488
- 0,0.0008719578,0
489
- 0,0.0010188158,0
490
- 0,0.0008288342,0
491
- 0,0.00085399643,0
492
- 0,0.0011273371,0
493
- 0,0.62889636,1
494
- 1,0.20183899,0
495
- 0,0.0009553314,0
496
- 0,0.005830987,0
497
- 0,0.10113329,0
498
- 0,0.058883034,0
499
- 0,0.004936521,0
500
- 0,0.001236107,0
501
- 0,0.0008304486,0
502
- 0,0.0012260479,0
503
- 0,0.4102268,0
504
- 1,0.63618875,1
505
- 1,0.33070007,0
506
- 0,0.7466114,1
507
- 0,0.0008505032,0
508
- 0,0.15627518,0
509
- 1,0.53720474,1
510
- 1,0.42872614,0
511
- 0,0.0009015459,0
512
- 1,0.16489983,0
513
- 0,0.7569152,1
514
- 0,0.0009473306,0
515
- 0,0.54220945,1
516
- 0,0.0010804973,0
517
- 0,0.0007759088,0
518
- 0,0.21974401,0
519
- 0,0.0009557337,0
520
- 0,0.00080877467,0
521
- 1,0.72012144,1
522
- 1,0.6555891,1
523
- 0,0.0010442814,0
524
- 0,0.23529597,0
525
- 1,0.5538642,1
526
- 0,0.001103702,0
527
- 1,0.75086635,1
528
- 0,0.0010794887,0
529
- 0,0.00087138265,0
530
- 0,0.45431912,0
531
- 0,0.00098219,0
532
- 1,0.24641904,0
533
- 0,0.001045231,0
534
- 0,0.001125692,0
535
- 0,0.00088083575,0
536
- 1,0.7283503,1
537
- 0,0.40620965,0
538
- 0,0.0009369744,0
539
- 1,0.61685985,1
540
- 0,0.0015938416,0
541
- 0,0.0010618207,0
542
- 1,0.6549626,1
543
- 0,0.0011033998,0
544
- 0,0.0010170939,0
545
- 0,0.0009539079,0
546
- 0,0.0013007914,0
547
- 0,0.0009015459,0
548
- 0,0.0014242147,0
549
- 0,0.707509,1
550
- 0,0.33996958,0
551
- 0,0.249575,0
552
- 0,0.0009841922,0
553
- 0,0.00089403824,0
554
- 0,0.7104041,1
555
- 1,0.7468179,1
556
- 0,0.39707997,0
557
- 0,0.0008614993,0
558
- 0,0.0014179454,0
559
- 0,0.023018427,0
560
- 0,0.00091979245,0
561
- 0,0.00094296545,0
562
- 0,0.00087731396,0
563
- 0,0.0014412756,0
564
- 0,0.0010167825,0
565
- 0,0.002101245,0
566
- 0,0.0032875312,0
567
- 0,0.0008558566,0
568
- 1,0.6307645,1
569
- 0,0.777811,1
570
- 0,0.4272062,0
571
- 0,0.35077578,0
572
- 1,0.63610154,1
573
- 0,0.35240352,0
574
- 0,0.0012432288,0
575
- 0,0.0008819291,0
576
- 0,0.16675064,0
577
- 0,0.0055521415,0
578
- 0,0.0008978255,0
579
- 1,0.5747646,1
580
- 0,0.0009587978,0
581
- 1,0.7340868,1
582
- 0,0.0016573233,0
583
- 0,0.0007863164,0
584
- 0,0.00305028,0
585
- 0,0.0009701932,0
586
- 0,0.001084977,0
587
- 0,0.0009680819,0
588
- 0,0.23961695,0
589
- 0,0.31963304,0
590
- 0,0.0011128527,0
591
- 0,0.0016477697,0
592
- 0,0.00095384475,0
593
- 0,0.0012485523,0
594
- 0,0.0027906268,0
595
- 0,0.00086827046,0
596
- 0,0.0009598256,0
597
- 1,0.64515626,1
598
- 0,0.0010624658,0
599
- 1,0.43718228,0
600
- 0,0.0008661823,0
601
- 1,0.6091658,1
602
- 0,0.2584575,0
603
- 0,0.000878247,0
604
- 1,0.5297075,1
605
- 0,0.0008958644,0
606
- 0,0.63287455,1
607
- 0,0.8396176,1
608
- 0,0.0017420241,0
609
- 0,0.00093098823,0
610
- 0,0.0008039557,0
611
- 0,0.0008432391,0
612
- 0,0.34551686,0
613
- 0,0.14493409,0
614
- 0,0.001269443,0
615
- 1,0.6718914,1
616
- 0,0.0011318232,0
617
- 1,0.4247725,0
618
- 0,0.0009978332,0
619
- 0,0.0010476196,0
620
- 0,0.0008974574,0
621
- 0,0.42583707,0
622
- 0,0.00087123946,0
623
- 0,0.17696548,0
624
- 1,0.7879036,1
625
- 0,0.000978515,0
626
- 0,0.0009257359,0
627
- 0,0.0012878132,0
628
- 1,0.64483523,1
629
- 0,0.0008505065,0
630
- 1,0.43661386,0
631
- 1,0.57060575,1
632
- 0,0.42568576,0
633
- 0,0.0009259524,0
634
- 0,0.0009263901,0
635
- 0,0.00083254103,0
636
- 0,0.00087859563,0
637
- 0,0.0030385156,0
638
- 1,0.5120378,1
639
- 0,0.0009727936,0
640
- 0,0.0008510578,0
641
- 0,0.0010575671,0
642
- 1,0.6155601,1
643
- 0,0.00091628404,0
644
- 1,0.82486975,1
645
- 0,0.0011844339,0
646
- 0,0.0010148155,0
647
- 1,0.6681816,1
648
- 0,0.0011263781,0
649
- 0,0.61207116,1
650
- 0,0.62184155,1
651
- 1,0.73041785,1
652
- 0,0.0025686398,0
653
- 0,0.0009894907,0
654
- 0,0.0010610846,0
655
- 0,0.0010225485,0
656
- 0,0.001962883,0
657
- 0,0.0010426701,0
658
- 0,0.001386491,0
659
- 0,0.00080189446,0
660
- 0,0.0019367785,0
661
- 0,0.0008910609,0
662
- 0,0.0017919212,0
663
- 0,0.0009899494,0
664
- 0,0.0008948113,0
665
- 0,0.00095926056,0
666
- 0,0.3376383,0
667
- 0,0.0014171846,0
668
- 1,0.659626,1
669
- 0,0.0014162698,0
670
- 0,0.69116306,1
671
- 0,0.0009343347,0
672
- 0,0.00096477,0
673
- 0,0.000974611,0
674
- 1,0.389206,0
675
- 0,0.00087169366,0
676
- 0,0.12066001,0
677
- 0,0.0010589029,0
678
- 1,0.5742767,1
679
- 0,0.0012632777,0
680
- 0,0.0028386211,0
681
- 1,0.7000751,1
682
- 0,0.711954,1
683
- 1,0.2301944,0
684
- 0,0.593717,1
685
- 0,0.0010470055,0
686
- 1,0.20598996,0
687
- 0,0.0011602843,0
688
- 0,0.0009489052,0
689
- 0,0.0009171116,0
690
- 0,0.00077974907,0
691
- 0,0.00092091836,0
692
- 0,0.4514688,0
693
- 0,0.0008686565,0
694
- 1,0.68002725,1
695
- 0,0.0009320532,0
696
- 0,0.00096953986,0
697
- 1,0.47608092,0
698
- 1,0.43994707,0
699
- 0,0.0016875481,0
700
- 0,0.0013200458,0
701
- 1,0.30488643,0
702
- 1,0.2136204,0
703
- 1,0.7080725,1
704
- 0,0.35315382,0
705
- 0,0.0007530385,0
706
- 1,0.3350419,0
707
- 0,0.0010032223,0
708
- 0,0.0010372837,0
709
- 0,0.5047713,1
710
- 0,0.0011856316,0
711
- 1,0.5202941,1
712
- 0,0.036287144,0
713
- 0,0.0015443955,0
714
- 0,0.45689735,0
715
- 0,0.05079241,0
716
- 0,0.00078609615,0
717
- 0,0.00089042104,0
718
- 0,0.00091053615,0
719
- 1,0.8260853,1
720
- 0,0.0012496725,0
721
- 0,0.001003521,0
722
- 0,0.0014080106,0
723
- 0,0.43465498,0
724
- 1,0.7085056,1
725
- 0,0.1071419,0
726
- 0,0.38532647,0
727
- 0,0.0007924066,0
728
- 0,0.0012905765,0
729
- 0,0.38276187,0
730
- 0,0.6617229,1
731
- 1,0.34884775,0
732
- 0,0.0024217672,0
733
- 0,0.0009956957,0
734
- 0,0.25291744,0
735
- 1,0.6034158,1
736
- 0,0.0022521024,0
737
- 0,0.0009386203,0
738
- 1,0.50254047,1
739
- 0,0.00085585134,0
740
- 0,0.59543693,1
741
- 0,0.0011632922,0
742
- 0,0.5392403,1
743
- 1,0.7379359,1
744
- 0,0.615833,1
745
- 0,0.0011334324,0
746
- 1,0.6452454,1
747
- 1,0.5439059,1
748
- 1,0.4070706,0
749
- 0,0.00085760804,0
750
- 0,0.0013209702,0
751
- 0,0.00088978535,0
752
- 0,0.17897561,0
753
- 0,0.0008940443,0
754
- 0,0.0021460964,0
755
- 0,0.46505737,0
756
- 0,0.00095002423,0
757
- 0,0.0011486006,0
758
- 0,0.0008134391,0
759
- 1,0.61036927,1
760
- 1,0.82847095,1
761
- 0,0.0008939127,0
762
- 1,0.74133915,1
763
- 0,0.0010881014,0
764
- 1,0.6198983,1
765
- 0,0.00094616745,0
766
- 1,0.37103793,0
767
- 0,0.0011978437,0
768
- 0,0.21946022,0
769
- 0,0.0010989493,0
770
- 0,0.0011152511,0
771
- 1,0.6064778,1
772
- 0,0.0021220825,0
773
- 0,0.001013543,0
774
- 0,0.00082881283,0
775
- 0,0.0016203971,0
776
- 0,0.71427095,1
777
- 0,0.5296034,1
778
- 0,0.0007731554,0
779
- 0,0.0011991286,0
780
- 0,0.0014669152,0
781
- 0,0.56059104,1
782
- 0,0.0009583868,0
783
- 1,0.83952165,1
784
- 0,0.22009522,0
785
- 0,0.001713881,0
786
- 0,0.0011007866,0
787
- 0,0.22656342,0
788
- 0,0.0026020007,0
789
- 0,0.0018667651,0
790
- 0,0.5471779,1
791
- 0,0.001456346,0
792
- 0,0.0010325527,0
793
- 0,0.20448126,0
794
- 0,0.001224137,0
795
- 1,0.6375793,1
796
- 0,0.2193653,0
797
- 0,0.0008086656,0
798
- 0,0.0012699576,0
799
- 0,0.0008365637,0
800
- 0,0.57084,1
801
- 0,0.0010449449,0
802
- 1,0.20258097,0
803
- 0,0.0010269393,0
804
- 1,0.4684375,0
805
- 0,0.0009493274,0
806
- 0,0.0009980378,0
807
- 0,0.0009156465,0
808
- 0,0.6496492,1
809
- 0,0.34035468,0
810
- 0,0.0010597436,0
811
- 1,0.47263625,0
812
- 0,0.0009759152,0
813
- 1,0.78246176,1
814
- 0,0.4524046,0
815
- 1,0.38766143,0
816
- 1,0.7272198,1
817
- 0,0.0008958892,0
818
- 0,0.0010322925,0
819
- 0,0.0010094311,0
820
- 0,0.0012807564,0
821
- 0,0.5271925,1
822
- 0,0.36517152,0
823
- 0,0.10721343,0
824
- 0,0.0009403063,0
825
- 0,0.00077919016,0
826
- 0,0.0009822751,0
827
- 0,0.00076234364,0
828
- 0,0.00078162784,0
829
- 1,0.5729158,1
830
- 0,0.00089957967,0
831
- 0,0.0009848445,0
832
- 1,0.551513,1
833
- 1,0.7289652,1
834
- 0,0.0010857919,0
835
- 1,0.5924267,1
836
- 0,0.0015767976,0
837
- 0,0.0008469038,0
838
- 1,0.90326667,1
839
- 0,0.0009163141,0
840
- 0,0.0011933714,0
841
- 0,0.0011287286,0
842
- 0,0.0008222916,0
843
- 0,0.008933486,0
844
- 0,0.0018526828,0
845
- 0,0.00112532,0
846
- 0,0.08725183,0
847
- 0,0.0011224038,0
848
- 0,0.0024273458,0
849
- 0,0.0020511525,0
850
- 0,0.0021996044,0
851
- 1,0.7084392,1
852
- 0,0.0013229746,0
853
- 0,0.017357908,0
854
- 0,0.002221878,0
855
- 0,0.00085381826,0
856
- 0,0.0045482507,0
857
- 0,0.27093408,0
858
- 0,0.50731814,1
859
- 0,0.0012390758,0
860
- 1,0.56386113,1
861
- 0,0.00089642877,0
862
- 0,0.0009294908,0
863
- 0,0.0012274805,0
864
- 0,0.0009345854,0
865
- 0,0.00086570746,0
866
- 0,0.0011756949,0
867
- 0,0.38236865,0
868
- 1,0.687024,1
869
- 0,0.23687929,0
870
- 0,0.0016748687,0
871
- 1,0.6843725,1
872
- 0,0.304691,0
873
- 0,0.0009893903,0
874
- 0,0.0012283546,0
875
- 0,0.0009146128,0
876
- 0,0.7573988,1
877
- 1,0.34269,0
878
- 0,0.0034435734,0
879
- 0,0.00087736914,0
880
- 0,0.0015209204,0
881
- 0,0.0009608211,0
882
- 0,0.0009280081,0
883
- 0,0.0012842439,0
884
- 1,0.6248447,1
885
- 1,0.62274516,1
886
- 0,0.00165578,0
887
- 0,0.0008775915,0
888
- 0,0.18800546,0
889
- 1,0.777882,1
890
- 1,0.76358753,1
891
- 0,0.00093880715,0
892
- 0,0.7536075,1
893
- 0,0.3016229,0
894
- 0,0.0011477552,0
895
- 0,0.0014952337,0
896
- 0,0.0009555025,0
897
- 0,0.15660593,0
898
- 0,0.0011227432,0
899
- 0,0.001997399,0
900
- 0,0.56990355,1
901
- 0,0.3734899,0
902
- 1,0.5575483,1
903
- 1,0.6860012,1
904
- 1,0.10437922,0
905
- 1,0.8180956,1
906
- 0,0.0011188495,0
907
- 0,0.0010619572,0
908
- 0,0.56458724,1
909
- 0,0.0010933846,0
910
- 0,0.0012181381,0
911
- 0,0.4674541,0
912
- 1,0.485787,0
913
- 0,0.0013055103,0
914
- 1,0.70587736,1
915
- 0,0.00079359673,0
916
- 1,0.80720824,1
917
- 0,0.0016512059,0
918
- 1,0.7673012,1
919
- 0,0.0010114614,0
920
- 0,0.0013267859,0
921
- 0,0.0008793849,0
922
- 0,0.0013299336,0
923
- 0,0.08014288,0
924
- 0,0.0035911172,0
925
- 0,0.0009812173,0
926
- 1,0.25576597,0
927
- 1,0.7145252,1
928
- 1,0.74173844,1
929
- 1,0.7591139,1
930
- 1,0.79900056,1
931
- 0,0.0010525746,0
932
- 0,0.0010317984,0
933
- 0,0.0009139964,0
934
- 0,0.0011968515,0
935
- 0,0.0010743748,0
936
- 0,0.46601886,0
937
- 0,0.0010757077,0
938
- 0,0.00852641,0
939
- 0,0.002048128,0
940
- 1,0.80355567,1
941
- 0,0.0014045321,0
942
- 0,0.0011115061,0
943
- 0,0.0008338689,0
944
- 1,0.5675804,1
945
- 0,0.0008722262,0
946
- 0,0.001032084,0
947
- 0,0.00090455357,0
948
- 0,0.000761143,0
949
- 1,0.25547284,0
950
- 0,0.72657347,1
951
- 0,0.00095886085,0
952
- 0,0.0007715649,0
953
- 0,0.00086148566,0
954
- 0,0.0017338935,0
955
- 0,0.0029073667,0
956
- 1,0.74649566,1
957
- 0,0.00082679826,0
958
- 0,0.00097232254,0
959
- 0,0.001773119,0
960
- 0,0.0009934949,0
961
- 1,0.8185504,1
962
- 0,0.0017876541,0
963
- 0,0.0008482355,0
964
- 0,0.00088651665,0
965
- 0,0.0028928719,0
966
- 0,0.000960697,0
967
- 0,0.6135191,1
968
- 0,0.0015025416,0
969
- 0,0.00096636615,0
970
- 1,0.27818596,0
971
- 0,0.0010354616,0
972
- 1,0.7740497,1
973
- 0,0.0010154156,0
974
- 0,0.0010276875,0
975
- 0,0.0013553945,0
976
- 0,0.64635444,1
977
- 0,0.36508304,0
978
- 0,0.001062856,0
979
- 0,0.00074448035,0
980
- 0,0.0013278496,0
981
- 0,0.43072903,0
982
- 0,0.3727711,0
983
- 0,0.1476106,0
984
- 1,0.7469999,1
985
- 1,0.5393194,1
986
- 1,0.62362605,1
987
- 0,0.4977242,0
988
- 0,0.0019504097,0
989
- 0,0.0011121453,0
990
- 0,0.00094129675,0
991
- 1,0.13999684,0
992
- 1,0.84094584,1
993
- 1,0.67023414,1
994
- 0,0.0009355351,0
995
- 0,0.0012011972,0
996
- 1,0.85856575,1
997
- 0,0.00095948775,0
998
- 0,0.00092585845,0
999
- 1,0.78414935,1
1000
- 0,0.2081838,0
1001
- 1,0.3134156,0
1002
- 0,0.5827518,1
1003
- 0,0.0011884035,0
1004
- 0,0.3416024,0
1005
- 0,0.3513188,0
1006
- 1,0.7360253,1
1007
- 0,0.14421782,0
1008
- 0,0.0008775974,0
1009
- 0,0.0010155869,0
1010
- 1,0.6316725,1
1011
- 0,0.0011462267,0
1012
- 0,0.00090081355,0
1013
- 0,0.46972373,0
1014
- 0,0.0010553041,0
1015
- 0,0.5809954,1
1016
- 0,0.0059841666,0
1017
- 1,0.6962589,1
1018
- 0,0.004652636,0
1019
- 0,0.0008641569,0
1020
- 1,0.80275476,1
1021
- 0,0.0008407392,0
1022
- 0,0.0010207775,0
1023
- 0,0.0011965961,0
1024
- 1,0.41581193,0
1025
- 0,0.002618945,0
1026
- 0,0.001120625,0
1027
- 0,0.00090287067,0
1028
- 1,0.6921951,1
1029
- 0,0.32014215,0
1030
- 1,0.44863924,0
1031
- 0,0.20511761,0
1032
- 0,0.0008183238,0
1033
- 0,0.27403337,0
1034
- 0,0.0014313993,0
1035
- 0,0.52280444,1
1036
- 0,0.00095016713,0
1037
- 0,0.001587931,0
1038
- 0,0.2349104,0
1039
- 1,0.65222037,1
1040
- 0,0.0015776413,0
1041
- 0,0.0012768807,0
1042
- 0,0.0011938995,0
1043
- 1,0.8248594,1
1044
- 1,0.5302688,1
1045
- 0,0.0017132758,0
1046
- 0,0.0008661427,0
1047
- 0,0.0013237718,0
1048
- 0,0.30642387,0
1049
- 0,0.0013227944,0
1050
- 0,0.0007931251,0
1051
- 0,0.000937015,0
1052
- 0,0.0011775857,0
1053
- 0,0.0031580946,0
1054
- 0,0.0017192552,0
1055
- 0,0.00090431585,0
1056
- 0,0.0008876776,0
1057
- 0,0.3237665,0
1058
- 0,0.0011375708,0
1059
- 0,0.713965,1
1060
- 1,0.6401749,1
1061
- 0,0.055340905,0
1062
- 0,0.0008713582,0
1063
- 0,0.0011798014,0
1064
- 0,0.19061498,0
1065
- 0,0.0008910244,0
1066
- 1,0.6702638,1
1067
- 0,0.00078732,0
1068
- 1,0.5069871,1
1069
- 0,0.0010917349,0
1070
- 1,0.30569232,0
1071
- 0,0.0010508688,0
1072
- 0,0.008350478,0
1073
- 0,0.1295904,0
1074
- 0,0.0012138723,0
1075
- 0,0.23459788,0
1076
- 0,0.00095988956,0
1077
- 0,0.0011302889,0
1078
- 0,0.41533685,0
1079
- 1,0.81494963,1
1080
- 1,0.20318276,0
1081
- 1,0.6849259,1
1082
- 0,0.0007971484,0
1083
- 0,0.0010026495,0
1084
- 1,0.53866345,1
1085
- 0,0.47900033,0
1086
- 0,0.0013153972,0
1087
- 0,0.00092683197,0
1088
- 0,0.00079303206,0
1089
- 0,0.0010194737,0
1090
- 0,0.00090722676,0
1091
- 0,0.0014398387,0
1092
- 0,0.0008348118,0
1093
- 1,0.8690826,1
1094
- 0,0.45386383,0
1095
- 0,0.0009348062,0
1096
- 1,0.7537117,1
1097
- 0,0.0009848943,0
1098
- 1,0.8337264,1
1099
- 0,0.0013707685,0
1100
- 1,0.64028686,1
1101
- 0,0.0009233346,0
1102
- 0,0.0011110144,0
1103
- 0,0.00091898517,0
1104
- 0,0.37525868,0
1105
- 0,0.0012479519,0
1106
- 0,0.0011017517,0
1107
- 0,0.23442647,0
1108
- 0,0.001286486,0
1109
- 0,0.0010251929,0
1110
- 0,0.0012779953,0
1111
- 0,0.10511962,0
1112
- 0,0.0013187885,0
1113
- 0,0.0008432262,0
1114
- 0,0.0023270152,0
1115
- 0,0.19635822,0
1116
- 0,0.42605725,0
1117
- 0,0.0026588596,0
1118
- 0,0.0010699191,0
1119
- 0,0.00083928637,0
1120
- 0,0.0010421542,0
1121
- 0,0.0010161884,0
1122
- 0,0.0007994666,0
1123
- 0,0.0011844927,0
1124
- 1,0.51526463,1
1125
- 1,0.8424703,1
1126
- 0,0.0008336365,0
1127
- 0,0.0008402345,0
1128
- 0,0.0010367532,0
1129
- 0,0.0008751524,0
1130
- 0,0.0013134012,0
1131
- 0,0.0008601877,0
1132
- 0,0.28373632,0
1133
- 0,0.0010632672,0
1134
- 0,0.001001677,0
1135
- 0,0.0009289228,0
1136
- 0,0.0010856837,0
1137
- 0,0.0009986022,0
1138
- 0,0.0011016888,0
1139
- 0,0.0009970806,0
1140
- 0,0.0014796017,0
1141
- 0,0.0009895341,0
1142
- 1,0.87620765,1
1143
- 0,0.13668354,0
1144
- 0,0.5543678,1
1145
- 0,0.0011205432,0
1146
- 1,0.60824937,1
1147
- 1,0.8209723,1
1148
- 1,0.77438575,1
1149
- 0,0.44236442,0
1150
- 1,0.5708855,1
1151
- 0,0.0014770482,0
1152
- 0,0.0011481309,0
1153
- 1,0.76328576,1
1154
- 0,0.0012447835,0
1155
- 0,0.6776131,1
1156
- 0,0.22794476,0
1157
- 0,0.0015239945,0
1158
- 0,0.0014258837,0
1159
- 0,0.2854073,0
1160
- 0,0.0008883935,0
1161
- 1,0.57807,1
1162
- 0,0.0011267756,0
1163
- 1,0.61485714,1
1164
- 0,0.0007958319,0
1165
- 0,0.0010823915,0
1166
- 0,0.2250206,0
1167
- 1,0.4204774,0
1168
- 0,0.00093722483,0
1169
- 0,0.0017883043,0
1170
- 0,0.0009139735,0
1171
- 0,0.00089569465,0
1172
- 0,0.0013786783,0
1173
- 0,0.63649786,1
1174
- 0,0.00097638293,0
1175
- 1,0.73162854,1
1176
- 0,0.00088065065,0
1177
- 0,0.0010577292,0
1178
- 0,0.0011432399,0
1179
- 1,0.6012633,1
1180
- 0,0.0011757269,0
1181
- 0,0.0008843888,0
1182
- 0,0.00094909506,0
1183
- 0,0.0009065131,0
1184
- 1,0.50341845,1
1185
- 1,0.3092106,0
1186
- 0,0.087408595,0
1187
- 1,0.47898576,0
1188
- 0,0.7805265,1
1189
- 0,0.55192417,1
1190
- 0,0.100362,0
1191
- 0,0.0009126828,0
1192
- 1,0.7031464,1
1193
- 0,0.0008378504,0
1194
- 0,0.000909575,0
1195
- 0,0.14698155,0
1196
- 0,0.000886812,0
1197
- 0,0.0034394178,0
1198
- 0,0.00083480857,0
1199
- 0,0.0009230623,0
1200
- 0,0.00086228485,0
1201
- 0,0.0012248036,0
1202
- 0,0.3685421,0
1203
- 1,0.75045246,1
1204
- 1,0.29140982,0
1205
- 1,0.824118,1
1206
- 0,0.0011165916,0
1207
- 0,0.0015328506,0
1208
- 0,0.49720347,0
1209
- 1,0.6922905,1
1210
- 0,0.0007836441,0
1211
- 0,0.000927602,0
1212
- 0,0.70492715,1
1213
- 0,0.0008786006,0
1214
- 0,0.7033895,1
1215
- 1,0.6138756,1
1216
- 1,0.5470745,1
1217
- 1,0.28883335,0
1218
- 0,0.001316554,0
1219
- 0,0.000900596,0
1220
- 1,0.77885973,1
1221
- 0,0.3792245,0
1222
- 0,0.0010777283,0
1223
- 0,0.022616526,0
1224
- 0,0.3294621,0
1225
- 0,0.00081920385,0
1226
- 0,0.0011605065,0
1227
- 0,0.2769352,0
1228
- 0,0.80774456,1
1229
- 0,0.000816923,0
1230
- 0,0.0010059879,0
1231
- 0,0.0009761908,0
1232
- 0,0.0019736588,0
1233
- 0,0.001108255,0
1234
- 0,0.00084299047,0
1235
- 0,0.0007932735,0
1236
- 0,0.0010224554,0
1237
- 0,0.0010019916,0
1238
- 0,0.0011584796,0
1239
- 0,0.00095932867,0
1240
- 0,0.0014852965,0
1241
- 0,0.0011517362,0
1242
- 0,0.0016003981,0
1243
- 0,0.5883149,1
1244
- 0,0.0011397423,0
1245
- 0,0.54703045,1
1246
- 0,0.46631202,0
1247
- 1,0.72806704,1
1248
- 1,0.66733813,1
1249
- 0,0.00082930306,0
1250
- 0,0.0013221847,0
1251
- 0,0.37714672,0
1252
- 1,0.6671186,1
1253
- 0,0.76171786,1
1254
- 1,0.84557354,1
1255
- 0,0.0009865002,0
1256
- 0,0.00078149413,0
1257
- 0,0.0020528194,0
1258
- 0,0.001968213,0
1259
- 1,0.29894271,0
1260
- 1,0.65170336,1
1261
- 0,0.00087412616,0
1262
- 0,0.0008334153,0
1263
- 0,0.002001055,0
1264
- 0,0.0010972196,0
1265
- 1,0.6604654,1
1266
- 1,0.75812054,1
1267
- 1,0.69461435,1
1268
- 0,0.20077878,0
1269
- 0,0.19034809,0
1270
- 0,0.001009536,0
1271
- 0,0.09553723,0
1272
- 1,0.4960136,0
1273
- 0,0.0012337598,0
1274
- 0,0.0030067663,0
1275
- 0,0.53967786,1
1276
- 0,0.0012488324,0
1277
- 0,0.001111368,0
1278
- 0,0.001234995,0
1279
- 0,0.0011818678,0
1280
- 0,0.42678392,0
1281
- 1,0.73771423,1
1282
- 0,0.7895419,1
1283
- 0,0.0012994151,0
1284
- 0,0.00084598584,0
1285
- 0,0.0007987745,0
1286
- 0,0.0012115806,0
1287
- 1,0.58688515,1
1288
- 0,0.00091692246,0
1289
- 0,0.0010023817,0
1290
- 1,0.7862743,1
1291
- 1,0.25347495,0
1292
- 0,0.3326281,0
1293
- 0,0.3226471,0
1294
- 0,0.6390405,1
1295
- 0,0.54111165,1
1296
- 0,0.001012588,0
1297
- 0,0.0009537402,0
1298
- 0,0.0008496503,0
1299
- 1,0.6967211,1
1300
- 0,0.0010966522,0
1301
- 0,0.0011798083,0
1302
- 0,0.6963768,1
1303
- 0,0.00095325924,0
1304
- 0,0.0011999392,0
1305
- 0,0.32601213,0
1306
- 1,0.5316113,1
1307
- 0,0.46846902,0
1308
- 0,0.00096525403,0
1309
- 0,0.0011292063,0
1310
- 0,0.001140362,0
1311
- 0,0.0012218289,0
1312
- 0,0.40930805,0
1313
- 0,0.0011080602,0
1314
- 1,0.5122479,1
1315
- 1,0.59998643,1
1316
- 0,0.061730716,0
1317
- 1,0.41931114,0
1318
- 0,0.16193198,0
1319
- 0,0.0007972308,0
1320
- 0,0.0009665351,0
1321
- 0,0.0010098865,0
1322
- 0,0.0020258357,0
1323
- 0,0.46376464,0
1324
- 0,0.28660434,0
1325
- 0,0.0010756479,0
1326
- 0,0.5611161,1
1327
- 0,0.1110538,0
1328
- 0,0.0015265195,0
1329
- 0,0.00088010135,0
1330
- 0,0.23574975,0
1331
- 0,0.42993984,0
1332
- 0,0.0012539547,0
1333
- 0,0.39793822,0
1334
- 0,0.0008850873,0
1335
- 0,0.0010175374,0
1336
- 0,0.57915115,1
1337
- 0,0.60442656,1
1338
- 0,0.0008875359,0
1339
- 0,0.0010697005,0
1340
- 0,0.0009562333,0
1341
- 0,0.0019253149,0
1342
- 1,0.77746564,1
1343
- 0,0.0015686869,0
1344
- 1,0.31335396,0
1345
- 0,0.000908781,0
1346
- 0,0.0013533514,0
1347
- 0,0.2515571,0
1348
- 0,0.00089592073,0
1349
- 1,0.25762314,0
1350
- 1,0.6580362,1
1351
- 0,0.000960752,0
1352
- 0,0.0010607035,0
1353
- 0,0.0077122697,0
1354
- 1,0.6530093,1
1355
- 0,0.00087614363,0
1356
- 0,0.007246284,0
1357
- 0,0.6812245,1
1358
- 0,0.23485817,0
1359
- 0,0.00086641597,0
1360
- 0,0.0012842317,0
1361
- 0,0.0007809932,0
1362
- 1,0.5749303,1
1363
- 0,0.0015573403,0
1364
- 0,0.13056204,0
1365
- 0,0.357255,0
1366
- 0,0.34036377,0
1367
- 0,0.3980148,0
1368
- 1,0.69733506,1
1369
- 0,0.25944367,0
1370
- 0,0.34873602,0
1371
- 0,0.0009726848,0
1372
- 1,0.49505424,0
1373
- 1,0.6892435,1
1374
- 1,0.7743485,1
1375
- 0,0.32794416,0
1376
- 1,0.22797777,0
1377
- 0,0.0009458529,0
1378
- 0,0.38203833,0
1379
- 0,0.0012890919,0
1380
- 0,0.000789744,0
1381
- 0,0.31899726,0
1382
- 0,0.6728214,1
1383
- 0,0.000999975,0
1384
- 0,0.00094006164,0
1385
- 0,0.00075543154,0
1386
- 0,0.0013109997,0
1387
- 0,0.26060212,0
1388
- 0,0.2962978,0
1389
- 1,0.81047857,1
1390
- 0,0.0009169193,0
1391
- 0,0.0009614863,0
1392
- 0,0.00097122806,0
1393
- 0,0.0015418291,0
1394
- 0,0.0010950873,0
1395
- 0,0.0043601934,0
1396
- 1,0.6357652,1
1397
- 0,0.001165279,0
1398
- 0,0.0009673975,0
1399
- 1,0.83262265,1
1400
- 0,0.0009282163,0
1401
- 0,0.70172244,1
1402
- 0,0.0009607166,0
1403
- 0,0.00084028306,0
1404
- 0,0.6203581,1
1405
- 0,0.2677468,0
1406
- 1,0.7861525,1
1407
- 1,0.76065445,1
1408
- 0,0.0022763049,0
1409
- 0,0.0008973102,0
1410
- 0,0.0008466766,0
1411
- 0,0.0009204096,0
1412
- 1,0.5665215,1
1413
- 0,0.50933987,1
1414
- 0,0.0011783128,0
1415
- 0,0.00090577826,0
1416
- 0,0.47160682,0
1417
- 1,0.86713326,1
1418
- 0,0.001341303,0
1419
- 1,0.16438311,0
1420
- 1,0.5899627,1
1421
- 0,0.0009171007,0
1422
- 0,0.0009108268,0
1423
- 0,0.00084698235,0
1424
- 0,0.0007781086,0
1425
- 0,0.0010233963,0
1426
- 0,0.24456206,0
1427
- 0,0.0014641931,0
1428
- 0,0.0007724079,0
1429
- 0,0.35753417,0
1430
- 0,0.000865038,0
1431
- 0,0.0031078586,0
1432
- 0,0.0008967957,0
1433
- 1,0.77871156,1
1434
- 1,0.58639294,1
1435
- 0,0.0026566426,0
1436
- 0,0.00082757784,0
1437
- 0,0.0011661336,0
1438
- 0,0.001019244,0
1439
- 1,0.7419938,1
1440
- 1,0.42898682,0
1441
- 1,0.76121277,1
1442
- 0,0.0012546297,0
1443
- 1,0.37173283,0
1444
- 1,0.60157835,1
1445
- 1,0.3550261,0
1446
- 0,0.00083142443,0
1447
- 0,0.010433526,0
1448
- 0,0.0008385298,0
1449
- 0,0.0010685887,0
1450
- 0,0.0014192335,0
1451
- 0,0.00084556115,0
1452
- 0,0.25132394,0
1453
- 0,0.00080387027,0
1454
- 0,0.0009537752,0
1455
- 1,0.77484345,1
1456
- 0,0.0010258998,0
1457
- 0,0.00085469737,0
1458
- 1,0.80244,1
1459
- 0,0.6002382,1
1460
- 0,0.00085503253,0
1461
- 0,0.00087335554,0
1462
- 0,0.1733697,0
1463
- 0,0.0013072107,0
1464
- 0,0.31264597,0
1465
- 0,0.82034665,1
1466
- 1,0.67145926,1
1467
- 1,0.5704474,1
1468
- 0,0.0019223319,0
1469
- 0,0.0009967473,0
1470
- 0,0.4709534,0
1471
- 0,0.0044812486,0
1472
- 0,0.0009170116,0
1473
- 1,0.7392076,1
1474
- 0,0.0061432202,0
1475
- 0,0.45998988,0
1476
- 0,0.0033909979,0
1477
- 0,0.0011331846,0
1478
- 0,0.0010110885,0
1479
- 0,0.0011058631,0
1480
- 1,0.41766933,0
1481
- 0,0.00071926217,0
1482
- 0,0.54065317,1
1483
- 1,0.34456867,0
1484
- 0,0.0010258704,0
1485
- 0,0.0016063552,0
1486
- 0,0.3136817,0
1487
- 0,0.0009535526,0
1488
- 0,0.0010021557,0
1489
- 0,0.0009705446,0
1490
- 0,0.14366324,0
1491
- 0,0.11577339,0
1492
- 1,0.5175177,1
1493
- 0,0.0010051089,0
1494
- 0,0.0010658188,0
1495
- 0,0.00097995,0
1496
- 0,0.0011440237,0
1497
- 1,0.54906136,1
1498
- 0,0.4807756,0
1499
- 0,0.300868,0
1500
- 0,0.0011075152,0
1501
- 0,0.00085573347,0
1502
- 0,0.00085721305,0
1503
- 0,0.0010270772,0
1504
- 1,0.75990576,1
1505
- 0,0.06632882,0
1506
- 0,0.001487759,0
1507
- 1,0.20113204,0
1508
- 0,0.0013309364,0
1509
- 0,0.000907346,0
1510
- 1,0.5900306,1
1511
- 0,0.000946925,0
1512
- 0,0.0010906772,0
1513
- 0,0.41585302,0
1514
- 1,0.78238595,1
1515
- 0,0.0009065691,0
1516
- 0,0.0008776255,0
1517
- 1,0.788677,1
1518
- 0,0.001570436,0
1519
- 0,0.22623111,0
1520
- 0,0.0011954956,0
1521
- 0,0.26859984,0
1522
- 0,0.0012529476,0
1523
- 0,0.00084351,0
1524
- 0,0.0009662851,0
1525
- 0,0.0008385861,0
1526
- 0,0.0011187536,0
1527
- 0,0.000936137,0
1528
- 0,0.0012308148,0
1529
- 0,0.00084265514,0
1530
- 1,0.55672204,1
1531
- 1,0.7771148,1
1532
- 0,0.0012370128,0
1533
- 0,0.0010973206,0
1534
- 0,0.00087856123,0
1535
- 0,0.11740843,0
1536
- 0,0.0010494886,0
1537
- 0,0.0016682858,0
1538
- 0,0.0012532771,0
1539
- 0,0.001405361,0
1540
- 0,0.20851627,0
1541
- 1,0.6311097,1
1542
- 0,0.0008749682,0
1543
- 1,0.6411818,1
1544
- 1,0.83963007,1
1545
- 0,0.0011985023,0
1546
- 0,0.0010454545,0
1547
- 0,0.7701166,1
1548
- 0,0.00076997356,0
1549
- 1,0.8383424,1
1550
- 1,0.6781555,1
1551
- 0,0.0036841896,0
1552
- 1,0.59758455,1
1553
- 0,0.043098766,0
1554
- 0,0.0008827312,0
1555
- 0,0.00080906873,0
1556
- 1,0.78783256,1
1557
- 0,0.0011804759,0
1558
- 0,0.46424973,0
1559
- 0,0.0017560824,0
1560
- 0,0.004443426,0
1561
- 0,0.29807636,0
1562
- 0,0.0010315041,0
1563
- 0,0.5789729,1
1564
- 0,0.00084546005,0
1565
- 0,0.0011440341,0
1566
- 1,0.81396484,1
1567
- 1,0.56533325,1
1568
- 0,0.0011488326,0
1569
- 0,0.0012227881,0
1570
- 0,0.25655642,0
1571
- 1,0.5899143,1
1572
- 0,0.54285944,1
1573
- 0,0.0012625692,0
1574
- 0,0.00080501626,0
1575
- 0,0.0011473355,0
1576
- 0,0.0011238786,0
1577
- 0,0.0021833822,0
1578
- 0,0.0008517226,0
1579
- 0,0.0009454461,0
1580
- 0,0.0010944246,0
1581
- 0,0.024547417,0
1582
- 0,0.0010403111,0
1583
- 0,0.26350185,0
1584
- 0,0.0010104905,0
1585
- 0,0.09447297,0
1586
- 0,0.000852578,0
1587
- 0,0.0012083028,0
1588
- 0,0.6784862,1
1589
- 0,0.0009658967,0
1590
- 0,0.0010201805,0
1591
- 0,0.16826008,0
1592
- 0,0.0008753944,0
1593
- 0,0.00078190427,0
1594
- 0,0.20378338,0
1595
- 1,0.6095833,1
1596
- 1,0.55670387,1
1597
- 0,0.47983488,0
1598
- 1,0.24339448,0
1599
- 0,0.0013973623,0
1600
- 0,0.0008691309,0
1601
- 1,0.8703108,1
1602
- 0,0.002405853,0
1603
- 0,0.0011524003,0
1604
- 0,0.55783266,1
1605
- 0,0.0012722318,0
1606
- 0,0.00088787306,0
1607
- 0,0.1720217,0
1608
- 0,0.0009373888,0
1609
- 0,0.0043997974,0
1610
- 1,0.34884313,0
1611
- 1,0.3523087,0
1612
- 0,0.65511626,1
1613
- 0,0.0014629874,0
1614
- 0,0.37225887,0
1615
- 0,0.0012105026,0
1616
- 0,0.0010079421,0
1617
- 0,0.0010033519,0
1618
- 0,0.0008274714,0
1619
- 0,0.00087697804,0
1620
- 0,0.20703182,0
1621
- 0,0.0011603661,0
1622
- 0,0.0014855879,0
1623
- 0,0.22130914,0
1624
- 0,0.00086291565,0
1625
- 0,0.0008673242,0
1626
- 0,0.0011818307,0
1627
- 0,0.00096120656,0
1628
- 0,0.000923244,0
1629
- 0,0.4318089,0
1630
- 1,0.31608683,0
1631
- 1,0.77528465,1
1632
- 0,0.0013540791,0
1633
- 0,0.0008699616,0
1634
- 0,0.00094812346,0
1635
- 0,0.51795197,1
1636
- 1,0.61414665,1
1637
- 1,0.7352273,1
1638
- 0,0.00086347247,0
1639
- 0,0.0008687025,0
1640
- 0,0.0011694061,0
1641
- 0,0.0011693755,0
1642
- 0,0.455578,0
1643
- 0,0.105311655,0
1644
- 0,0.0008285432,0
1645
- 0,0.0010142302,0
1646
- 1,0.19479476,0
1647
- 0,0.0008654564,0
1648
- 0,0.0011336721,0
1649
- 0,0.0011340285,0
1650
- 0,0.24410512,0
1651
- 0,0.00090666034,0
1652
- 0,0.0015996577,0
1653
- 1,0.73918,1
1654
- 0,0.000983292,0
1655
- 0,0.0013622475,0
1656
- 0,0.0012581018,0
1657
- 0,0.00080613553,0
1658
- 0,0.0009217467,0
1659
- 1,0.26258734,0
1660
- 0,0.0012133989,0
1661
- 0,0.001480057,0
1662
- 0,0.7632006,1
1663
- 1,0.5682584,1
1664
- 0,0.0014863011,0
1665
- 0,0.63781154,1
1666
- 0,0.0011002115,0
1667
- 0,0.00084043556,0
1668
- 0,0.0014428858,0
1669
- 0,0.0013104859,0
1670
- 1,0.76846975,1
1671
- 1,0.6673162,1
1672
- 0,0.0008704569,0
1673
- 0,0.0008219127,0
1674
- 0,0.65152436,1
1675
- 0,0.0008993478,0
1676
- 0,0.0008565709,0
1677
- 0,0.14742456,0
1678
- 0,0.057693627,0
1679
- 0,0.0008922193,0
1680
- 1,0.15329692,0
1681
- 1,0.65068716,1
1682
- 0,0.0009436021,0
1683
- 0,0.009623958,0
1684
- 0,0.14724772,0
1685
- 0,0.0014856155,0
1686
- 0,0.635374,1
1687
- 0,0.53666717,1
1688
- 0,0.24563298,0
1689
- 0,0.46256322,0
1690
- 0,0.0009238498,0
1691
- 0,0.0008537007,0
1692
- 1,0.7335384,1
1693
- 0,0.00082300097,0
1694
- 1,0.444842,0
1695
- 0,0.0010771926,0
1696
- 0,0.00084987056,0
1697
- 0,0.0020735206,0
1698
- 0,0.0008576367,0
1699
- 0,0.00083232333,0
1700
- 0,0.0011556697,0
1701
- 0,0.0015070596,0
1702
- 0,0.0008087288,0
1703
- 0,0.0013067058,0
1704
- 0,0.21971786,0
1705
- 0,0.00081399345,0
1706
- 1,0.5227279,1
1707
- 0,0.0012097977,0
1708
- 0,0.001634093,0
1709
- 1,0.8266393,1
1710
- 1,0.79590076,1
1711
- 0,0.2257322,0
1712
- 1,0.6879368,1
1713
- 0,0.0008921309,0
1714
- 0,0.000969393,0
1715
- 1,0.77856237,1
1716
- 0,0.00078926905,0
1717
- 0,0.001172239,0
1718
- 1,0.40544796,0
1719
- 1,0.69128567,1
1720
- 0,0.0058039543,0
1721
- 0,0.0010676429,0
1722
- 0,0.0009675725,0
1723
- 0,0.24802871,0
1724
- 0,0.00088985654,0
1725
- 0,0.00090884947,0
1726
- 0,0.48389488,0
1727
- 0,0.67111087,1
1728
- 0,0.0011216454,0
1729
- 0,0.000996518,0
1730
- 1,0.7353937,1
1731
- 1,0.6577438,1
1732
- 1,0.7053728,1
1733
- 1,0.78178835,1
1734
- 0,0.615978,1
1735
- 1,0.6272983,1
1736
- 0,0.004934533,0
1737
- 0,0.0024336318,0
1738
- 0,0.00089967996,0
1739
- 1,0.5925518,1
1740
- 1,0.7473727,1
1741
- 0,0.3549951,0
1742
- 0,0.57792014,1
1743
- 1,0.6967723,1
1744
- 0,0.6283576,1
1745
- 0,0.3178549,0
1746
- 1,0.2248403,0
1747
- 1,0.81427705,1
1748
- 0,0.0010010622,0
1749
- 0,0.0013560583,0
1750
- 0,0.0015472178,0
1751
- 0,0.0009804085,0
1752
- 0,0.0007600732,0
1753
- 1,0.5372315,1
1754
- 0,0.0010776622,0
1755
- 0,0.00083396706,0
1756
- 0,0.0013004588,0
1757
- 0,0.5804702,1
1758
- 0,0.35824117,0
1759
- 0,0.001130411,0
1760
- 1,0.45011675,0
1761
- 0,0.0011523701,0
1762
- 0,0.0009201427,0
1763
- 0,0.0009954533,0
1764
- 0,0.00096182036,0
1765
- 0,0.00093874236,0
1766
- 0,0.4260264,0
1767
- 0,0.0012323499,0
1768
- 0,0.0014474612,0
1769
- 0,0.0009775738,0
1770
- 0,0.2297708,0
1771
- 0,0.69282055,1
1772
- 0,0.0009934746,0
1773
- 0,0.36029232,0
1774
- 1,0.36951026,0
1775
- 0,0.40954018,0
1776
- 0,0.00085924973,0
1777
- 0,0.003017331,0
1778
- 0,0.0016876189,0
1779
- 0,0.0010288198,0
1780
- 1,0.7446905,1
1781
- 0,0.24530704,0
1782
- 0,0.0010668626,0
1783
- 0,0.0037525713,0
1784
- 0,0.001123924,0
1785
- 0,0.0013527669,0
1786
- 0,0.30088598,0
1787
- 0,0.22248338,0
1788
- 0,0.0012103878,0
1789
- 0,0.40475878,0
1790
- 0,0.0008372259,0
1791
- 0,0.46901873,0
1792
- 0,0.0012301363,0
1793
- 0,0.0011264655,0
1794
- 0,0.00089167035,0
1795
- 0,0.00086598546,0
1796
- 0,0.0007683753,0
1797
- 0,0.120619036,0
1798
- 1,0.31516576,0
1799
- 0,0.0010315306,0
1800
- 1,0.82660633,1
1801
- 1,0.16573146,0
1802
- 1,0.45056126,0
1803
- 0,0.012582662,0
1804
- 0,0.2679602,0
1805
- 0,0.0007925501,0
1806
- 0,0.0017378323,0
1807
- 0,0.0017654862,0
1808
- 1,0.65987283,1
1809
- 1,0.6453965,1
1810
- 1,0.553943,1
1811
- 0,0.34653622,0
1812
- 0,0.0009486267,0
1813
- 0,0.0010880433,0
1814
- 1,0.64795786,1
1815
- 1,0.8529027,1
1816
- 1,0.537922,1
1817
- 0,0.0017200281,0
1818
- 0,0.0008353453,0
1819
- 1,0.75242555,1
1820
- 0,0.0007492936,0
1821
- 0,0.38131416,0
1822
- 0,0.61270493,1
1823
- 0,0.3288878,0
1824
- 0,0.6910125,1
1825
- 1,0.75235885,1
1826
- 1,0.7090527,1
1827
- 0,0.00084375514,0
1828
- 0,0.7937191,1
1829
- 0,0.00093711726,0
1830
- 1,0.5996378,1
1831
- 0,0.0009235945,0
1832
- 0,0.39941287,0
1833
- 1,0.42442018,0
1834
- 0,0.0010583275,0
1835
- 0,0.60000926,1
1836
- 0,0.0010812476,0
1837
- 0,0.0009984957,0
1838
- 1,0.8072236,1
1839
- 1,0.4488358,0
1840
- 0,0.0015813457,0
1841
- 1,0.2241001,0
1842
- 1,0.75282294,1
1843
- 0,0.000868323,0
1844
- 1,0.6267692,1
1845
- 0,0.00089177827,0
1846
- 0,0.001014311,0
1847
- 0,0.0008509839,0
1848
- 0,0.5767851,1
1849
- 0,0.0007812928,0
1850
- 0,0.0012110077,0
1851
- 0,0.0009440347,0
1852
- 1,0.7179761,1
1853
- 1,0.4313237,0
1854
- 0,0.0010092129,0
1855
- 0,0.00078347983,0
1856
- 0,0.0011101945,0
1857
- 1,0.53682137,1
1858
- 0,0.0009936662,0
1859
- 0,0.0011565915,0
1860
- 0,0.0010442929,0
1861
- 1,0.39957625,0
1862
- 0,0.001340685,0
1863
- 1,0.5603316,1
1864
- 0,0.22706328,0
1865
- 0,0.0009566926,0
1866
- 0,0.43829724,0
1867
- 0,0.0008824684,0
1868
- 0,0.0011820873,0
1869
- 0,0.0008683048,0
1870
- 0,0.00074347574,0
1871
- 0,0.51406115,1
1872
- 0,0.0025957846,0
1873
- 0,0.0010757236,0
1874
- 1,0.59766775,1
1875
- 0,0.0009575653,0
1876
- 0,0.0012813427,0
1877
- 0,0.001484598,0
1878
- 0,0.8911086,1
1879
- 1,0.41940725,0
1880
- 0,0.104963206,0
1881
- 0,0.22381157,0
1882
- 0,0.50649196,1
1883
- 0,0.0010766535,0
1884
- 0,0.00096266886,0
1885
- 0,0.0012597787,0
1886
- 0,0.0008376636,0
1887
- 0,0.0021784822,0
1888
- 0,0.0014312923,0
1889
- 0,0.0009531075,0
1890
- 0,0.0009719756,0
1891
- 1,0.5082326,1
1892
- 1,0.5103978,1
1893
- 0,0.4345131,0
1894
- 0,0.0010050534,0
1895
- 0,0.0008835647,0
1896
- 0,0.0007883947,0
1897
- 0,0.0022930545,0
1898
- 0,0.0029679493,0
1899
- 0,0.0011517105,0
1900
- 0,0.30619615,0
1901
- 0,0.093977414,0
1902
- 0,0.00095720595,0
1903
- 0,0.0015321785,0
1904
- 1,0.63398594,1
1905
- 0,0.000823775,0
1906
- 0,0.00079622905,0
1907
- 0,0.0009634666,0
1908
- 0,0.49582902,0
1909
- 0,0.0009126799,0
1910
- 0,0.0009068175,0
1911
- 0,0.0008108485,0
1912
- 0,0.0016464454,0
1913
- 0,0.17892216,0
1914
- 0,0.0009123811,0
1915
- 0,0.5026859,1
1916
- 0,0.00095499237,0
1917
- 0,0.0007871654,0
1918
- 0,0.001076556,0
1919
- 0,0.0011965865,0
1920
- 1,0.34323403,0
1921
- 0,0.87479687,1
1922
- 0,0.0008861932,0
1923
- 0,0.35567224,0
1924
- 0,0.36455706,0
1925
- 0,0.0010463424,0
1926
- 0,0.13184956,0
1927
- 0,0.001133562,0
1928
- 0,0.2384513,0
1929
- 0,0.002471907,0
1930
- 0,0.5236164,1
1931
- 1,0.7201213,1
1932
- 0,0.20469072,0
1933
- 1,0.3963374,0
1934
- 1,0.52494746,1
1935
- 0,0.14653996,0
1936
- 0,0.0008450751,0
1937
- 0,0.075510696,0
1938
- 1,0.23577085,0
1939
- 1,0.2894605,0
1940
- 0,0.0015299466,0
1941
- 0,0.7472723,1
1942
- 1,0.71758413,1
1943
- 1,0.63296187,1
1944
- 0,0.0013299449,0
1945
- 0,0.777783,1
1946
- 1,0.6209929,1
1947
- 0,0.00093404716,0
1948
- 0,0.58170086,1
1949
- 0,0.7959799,1
1950
- 0,0.0010129189,0
1951
- 0,0.0009979403,0
1952
- 1,0.44537392,0
1953
- 0,0.561583,1
1954
- 0,0.000845328,0
1955
- 0,0.0010346215,0
1956
- 1,0.67326033,1
1957
- 0,0.0009218446,0
1958
- 0,0.0010009438,0
1959
- 1,0.7646678,1
1960
- 0,0.6851708,1
1961
- 0,0.34248728,0
1962
- 0,0.0010312623,0
1963
- 0,0.001760763,0
1964
- 0,0.35641682,0
1965
- 0,0.0016838545,0
1966
- 0,0.6281676,1
1967
- 1,0.4627605,0
1968
- 0,0.0015306249,0
1969
- 1,0.23714353,0
1970
- 1,0.71622694,1
1971
- 1,0.7319818,1
1972
- 0,0.00108557,0
1973
- 0,0.5537684,1
1974
- 0,0.0012903535,0
1975
- 0,0.0012236463,0
1976
- 0,0.0012290003,0
1977
- 0,0.0008847146,0
1978
- 0,0.17526501,0
1979
- 1,0.6066298,1
1980
- 1,0.6446647,1
1981
- 0,0.00080044393,0
1982
- 0,0.098615125,0
1983
- 1,0.5616812,1
1984
- 0,0.6933321,1
1985
- 0,0.0011550131,0
1986
- 0,0.0011100766,0
1987
- 0,0.5804321,1
1988
- 0,0.0010168053,0
1989
- 0,0.22010425,0
1990
- 1,0.32657132,0
1991
- 0,0.0009580052,0
1992
- 1,0.5661701,1
1993
- 0,0.8999228,1
1994
- 0,0.0010242854,0
1995
- 0,0.00079186546,0
1996
- 0,0.0008344046,0
1997
- 0,0.006115876,0
1998
- 0,0.6555151,1
1999
- 0,0.47155666,0
2000
- 0,0.0011539217,0
2001
- 0,0.42844838,0
2002
- 0,0.00095852744,0
2003
- 0,0.0011793413,0
2004
- 0,0.5727451,1
2005
- 0,0.0011716434,0
2006
- 0,0.017039847,0
2007
- 0,0.0011528861,0
2008
- 0,0.0008173667,0
2009
- 0,0.0068512554,0
2010
- 1,0.752859,1
2011
- 1,0.67884725,1
2012
- 0,0.002062399,0
2013
- 0,0.0008773194,0
2014
- 0,0.4187466,0
2015
- 0,0.0008783964,0
2016
- 0,0.0011804664,0
2017
- 0,0.00090095046,0
2018
- 0,0.0008461568,0
2019
- 0,0.00090098643,0
2020
- 1,0.4443772,0
2021
- 0,0.0012510206,0
2022
- 0,0.0010878003,0
2023
- 0,0.001125162,0
2024
- 0,0.0009025954,0
2025
- 0,0.00082608557,0
2026
- 0,0.0012391479,0
2027
- 0,0.0010274188,0
2028
- 0,0.00085046276,0
2029
- 0,0.00094802276,0
2030
- 1,0.20541348,0
2031
- 1,0.37556043,0
2032
- 0,0.40013596,0
2033
- 0,0.6528374,1
2034
- 0,0.6438984,1
2035
- 0,0.33941498,0
2036
- 1,0.6012913,1
2037
- 1,0.40081245,0
2038
- 0,0.0009239743,0
2039
- 0,0.00088827714,0
2040
- 1,0.69142634,1
2041
- 0,0.00095390016,0
2042
- 0,0.00091235427,0
2043
- 0,0.46778736,0
2044
- 1,0.6588788,1
2045
- 0,0.00085993443,0
2046
- 0,0.00095059595,0
2047
- 0,0.31993032,0
2048
- 1,0.31242043,0
2049
- 1,0.7491844,1
2050
- 0,0.00084422436,0
2051
- 1,0.81296134,1
2052
- 0,0.0009211382,0
2053
- 1,0.45493677,0
2054
- 0,0.0011309453,0
2055
- 1,0.27920976,0
2056
- 0,0.001766247,0
2057
- 1,0.6483464,1
2058
- 0,0.4439611,0
2059
- 0,0.28825438,0
2060
- 0,0.0013631671,0
2061
- 0,0.001132262,0
2062
- 0,0.0013900561,0
2063
- 0,0.0015789722,0
2064
- 0,0.0017141184,0
2065
- 1,0.69817054,1
2066
- 0,0.011650326,0
2067
- 0,0.001025334,0
2068
- 0,0.0011714353,0
2069
- 0,0.0008079835,0
2070
- 1,0.5759009,1
2071
- 0,0.22395323,0
2072
- 1,0.79228,1
2073
- 0,0.0011739928,0
2074
- 0,0.00091123086,0
2075
- 0,0.00080784655,0
2076
- 0,0.0010515592,0
2077
- 0,0.39967638,0
2078
- 0,0.0010145825,0
2079
- 1,0.5423705,1
2080
- 1,0.74459624,1
2081
- 0,0.00082461955,0
2082
- 0,0.0029421886,0
2083
- 0,0.43035188,0
2084
- 0,0.6877102,1
2085
- 0,0.0009364068,0
2086
- 0,0.0012012246,0
2087
- 0,0.000867792,0
2088
- 0,0.15176591,0
2089
- 1,0.77357423,1
2090
- 0,0.0034248335,0
2091
- 0,0.00086901913,0
2092
- 0,0.027418558,0
2093
- 0,0.00096116075,0
2094
- 1,0.6818173,1
2095
- 0,0.0009722299,0
2096
- 1,0.8720049,1
2097
- 0,0.00084275793,0
2098
- 0,0.015429355,0
2099
- 0,0.0009900979,0
2100
- 0,0.0008618162,0
2101
- 0,0.0010659914,0
2102
- 0,0.0011133268,0
2103
- 1,0.6155381,1
2104
- 0,0.0019113526,0
2105
- 0,0.0013318051,0
2106
- 0,0.0021403905,0
2107
- 0,0.0013350251,0
2108
- 0,0.0011662951,0
2109
- 0,0.0009040375,0
2110
- 0,0.0029542346,0
2111
- 0,0.0026244,0
2112
- 0,0.20788012,0
2113
- 0,0.5909254,1
2114
- 0,0.0010929306,0
2115
- 0,0.08327696,0
2116
- 0,0.001310451,0
2117
- 0,0.14910121,0
2118
- 0,0.0011818194,0
2119
- 0,0.001830041,0
2120
- 1,0.69931924,1
2121
- 0,0.0028769197,0
2122
- 0,0.38193005,0
2123
- 0,0.0017126591,0
2124
- 0,0.0008859742,0
2125
- 0,0.41827688,0
2126
- 0,0.0011043509,0
2127
- 0,0.27034986,0
2128
- 0,0.0010202782,0
2129
- 0,0.0010163564,0
2130
- 1,0.85359937,1
2131
- 0,0.001006123,0
2132
- 0,0.00084935484,0
2133
- 0,0.011224475,0
2134
- 0,0.0010811862,0
2135
- 0,0.0010717752,0
2136
- 0,0.3722477,0
2137
- 0,0.0010041799,0
2138
- 0,0.0022786073,0
2139
- 0,0.0010501673,0
2140
- 0,0.44608837,0
2141
- 0,0.00079554186,0
2142
- 0,0.49635226,0
2143
- 0,0.4600765,0
2144
- 1,0.31137487,0
2145
- 0,0.0009268139,0
2146
- 0,0.001196574,0
2147
- 0,0.001026193,0
2148
- 0,0.0009154746,0
2149
- 1,0.33101282,0
2150
- 0,0.0011979094,0
2151
- 0,0.0014616075,0
2152
- 0,0.0014142476,0
2153
- 1,0.8949496,1
2154
- 1,0.8674389,1
2155
- 0,0.0008275794,0
2156
- 0,0.00087104103,0
2157
- 0,0.0010423438,0
2158
- 0,0.31495747,0
2159
- 0,0.0014686183,0
2160
- 1,0.67044705,1
2161
- 0,0.2931136,0
2162
- 0,0.04735578,0
2163
- 0,0.09337471,0
2164
- 0,0.0009415873,0
2165
- 0,0.0010099063,0
2166
- 0,0.53362274,1
2167
- 0,0.00096574856,0
2168
- 0,0.0012844005,0
2169
- 0,0.00083572033,0
2170
- 1,0.8030804,1
2171
- 0,0.0010202461,0
2172
- 0,0.0013574356,0
2173
- 1,0.6834372,1
2174
- 0,0.09290696,0
2175
- 0,0.0009394532,0
2176
- 0,0.0009227664,0
2177
- 1,0.50783664,1
2178
- 1,0.8649702,1
2179
- 1,0.7761252,1
2180
- 0,0.0012978833,0
2181
- [rows 2182-3438 removed; each row is "true_label,predicted_probability,predicted_label", e.g. "0,0.002984829,0"]
 
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef9845cf64c142ff16fc915402953a1383e36ecb1c76b6174fae75c0dec59cd4
+ size 54904
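The three added lines are a Git LFS pointer: the CSV's actual contents now live in LFS storage, and the pointer records only the blob's SHA-256 digest and byte size. Below is a minimal sketch of how such a pointer could be checked against a locally downloaded copy of the file; the file paths are assumptions for illustration, not files from this repo.

# Sketch: validate a local data file against a Git LFS pointer file.
# Both paths used in the example call are placeholders.
import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_path: str) -> dict:
    """Parse the key/value lines of a Git LFS pointer (version, oid, size)."""
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

def matches_pointer(data_path: str, pointer_path: str) -> bool:
    """Return True if the file's sha256 digest and byte size match the pointer's oid/size."""
    fields = parse_lfs_pointer(pointer_path)
    blob = Path(data_path).read_bytes()
    oid = "sha256:" + hashlib.sha256(blob).hexdigest()
    return oid == fields["oid"] and len(blob) == int(fields["size"])

# Example usage (hypothetical paths):
# matches_pointer("val_predictions_binary.csv", "val_predictions_binary.csv.pointer")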
tokenizer/.ipynb_checkpoints/my_tokenizers-checkpoint.py ADDED
@@ -0,0 +1,398 @@
1
+ import collections
2
+ import logging
3
+ import os
4
+ import re
5
+ import codecs
6
+ import unicodedata
7
+ from typing import List, Optional
8
+ from transformers import PreTrainedTokenizer
9
+ from SmilesPE.tokenizer import SPE_Tokenizer
10
+
11
+ def load_vocab(vocab_file):
12
+ """Loads a vocabulary file into a dictionary."""
13
+ vocab = collections.OrderedDict()
14
+ with open(vocab_file, "r", encoding="utf-8") as reader:
15
+ tokens = reader.readlines()
16
+ for index, token in enumerate(tokens):
17
+ token = token.rstrip("\n")
18
+ vocab[token] = index
19
+ return vocab
20
+
21
+ class Atomwise_Tokenizer(object):
22
+ """Run atom-level SMILES tokenization"""
23
+
24
+ def __init__(self):
25
+ """ Constructs a atom-level Tokenizer.
26
+ """
27
+ # self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
28
+ self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
29
+
30
+ self.regex = re.compile(self.regex_pattern)
31
+
32
+ def tokenize(self, text):
33
+ """ Basic Tokenization of a SMILES.
34
+ """
35
+ tokens = [token for token in self.regex.findall(text)]
36
+ return tokens
37
+
38
+ class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
39
+ r"""
40
+ Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
41
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
42
+ should refer to the superclass for more information regarding methods.
43
+ Args:
44
+ vocab_file (:obj:`string`):
45
+ File containing the vocabulary.
46
+ spe_file (:obj:`string`):
47
+ File containing the trained SMILES Pair Encoding vocabulary.
48
+ unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
49
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
50
+ token instead.
51
+ sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
52
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
53
+ for sequence classification or for a text and a question for question answering.
54
+ It is also used as the last token of a sequence built with special tokens.
55
+ pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
56
+ The token used for padding, for example when batching sequences of different lengths.
57
+ cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
58
+ The classifier token which is used when doing sequence classification (classification of the whole
59
+ sequence instead of per-token classification). It is the first token of the sequence when built with
60
+ special tokens.
61
+ mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
62
+ The token used for masking values. This is the token used when training this model with masked language
63
+ modeling. This is the token which the model will try to predict.
64
+ """
65
+
66
+ def __init__(self, vocab_file, spe_file,
67
+ unk_token="[UNK]",
68
+ sep_token="[SEP]",
69
+ pad_token="[PAD]",
70
+ cls_token="[CLS]",
71
+ mask_token="[MASK]",
72
+ **kwargs):
73
+ if not os.path.isfile(vocab_file):
74
+ raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
75
+ if not os.path.isfile(spe_file):
76
+ raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))
77
+
78
+ self.vocab = load_vocab(vocab_file)
79
+ self.spe_vocab = open(spe_file, 'r', encoding='utf-8')
80
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
81
+ self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab)
82
+
83
+ super().__init__(
84
+ unk_token=unk_token,
85
+ sep_token=sep_token,
86
+ pad_token=pad_token,
87
+ cls_token=cls_token,
88
+ mask_token=mask_token,
89
+ **kwargs)
90
+
91
+ @property
92
+ def vocab_size(self):
93
+ return len(self.vocab)
94
+
95
+ def get_vocab(self):
96
+ return dict(self.vocab, **self.added_tokens_encoder)
97
+
98
+ def _tokenize(self, text):
99
+ return self.spe_tokenizer.tokenize(text).split(' ')
100
+
101
+ def _convert_token_to_id(self, token):
102
+ """ Converts a token (str) in an id using the vocab. """
103
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
104
+
105
+ def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
106
+ text = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
107
+ return self.convert_tokens_to_string(text)
108
+
109
+ def _convert_id_to_token(self, index):
110
+ """Converts an index (integer) in a token (str) using the vocab."""
111
+ return self.ids_to_tokens.get(index, self.unk_token)
112
+
113
+ def convert_tokens_to_string(self, tokens):
114
+ """ Converts a sequence of tokens (string) in a single string. """
115
+ out_string = " ".join(tokens).replace(" ##", "").strip()
116
+ return out_string
117
+
118
+ def build_inputs_with_special_tokens(
119
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
120
+ ) -> List[int]:
121
+ """
122
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks
123
+ by concatenating and adding special tokens.
124
+ A BERT sequence has the following format:
125
+ - single sequence: ``[CLS] X [SEP]``
126
+ - pair of sequences: ``[CLS] A [SEP] B [SEP]``
127
+ Args:
128
+ token_ids_0 (:obj:`List[int]`):
129
+ List of IDs to which the special tokens will be added
130
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
131
+ Optional second list of IDs for sequence pairs.
132
+ Returns:
133
+ :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
134
+ """
135
+ if token_ids_1 is None:
136
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
137
+ cls = [self.cls_token_id]
138
+ sep = [self.sep_token_id]
139
+ return cls + token_ids_0 + sep + token_ids_1 + sep
140
+
141
+ def get_special_tokens_mask(
142
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
143
+ ) -> List[int]:
144
+ """
145
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
146
+ special tokens using the tokenizer ``prepare_for_model`` method.
147
+ Args:
148
+ token_ids_0 (:obj:`List[int]`):
149
+ List of ids.
150
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
151
+ Optional second list of IDs for sequence pairs.
152
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
153
+ Set to True if the token list is already formatted with special tokens for the model
154
+ Returns:
155
+ :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
156
+ """
157
+
158
+ if already_has_special_tokens:
159
+ if token_ids_1 is not None:
160
+ raise ValueError(
161
+ "You should not supply a second sequence if the provided sequence of "
162
+ "ids is already formated with special tokens for the model."
163
+ )
164
+ return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
165
+
166
+ if token_ids_1 is not None:
167
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
168
+ return [1] + ([0] * len(token_ids_0)) + [1]
169
+
170
+ def create_token_type_ids_from_sequences(
171
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
172
+ ) -> List[int]:
173
+ """
174
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
175
+ A BERT sequence pair mask has the following format:
176
+ ::
177
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
178
+ | first sequence | second sequence |
179
+ if token_ids_1 is None, only returns the first portion of the mask (0's).
180
+ Args:
181
+ token_ids_0 (:obj:`List[int]`):
182
+ List of ids.
183
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
184
+ Optional second list of IDs for sequence pairs.
185
+ Returns:
186
+ :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
187
+ sequence(s).
188
+ """
189
+ sep = [self.sep_token_id]
190
+ cls = [self.cls_token_id]
191
+ if token_ids_1 is None:
192
+ return len(cls + token_ids_0 + sep) * [0]
193
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
194
+
195
+ def save_vocabulary(self, vocab_path):
196
+ """
197
+ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
198
+ Args:
199
+ vocab_path (:obj:`str`):
200
+ The directory in which to save the vocabulary.
201
+ Returns:
202
+ :obj:`Tuple(str)`: Paths to the files saved.
203
+ """
204
+ index = 0
205
+ if os.path.isdir(vocab_path):
206
+ vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
207
+ else:
208
+ vocab_file = vocab_path
209
+ with open(vocab_file, "w", encoding="utf-8") as writer:
210
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
211
+ if index != token_index:
212
+ logger.warning(
213
+ "Saving vocabulary to {}: vocabulary indices are not consecutive."
214
+ " Please check that the vocabulary is not corrupted!".format(vocab_file)
215
+ )
216
+ index = token_index
217
+ writer.write(token + "\n")
218
+ index += 1
219
+ return (vocab_file,)
220
+
221
+ class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
222
+ r"""
223
+ Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
224
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
225
+ should refer to the superclass for more information regarding methods.
226
+ Args:
227
+ vocab_file (:obj:`string`):
228
+ File containing the vocabulary.
229
+ unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
230
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
231
+ token instead.
232
+ sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
233
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
234
+ for sequence classification or for a text and a question for question answering.
235
+ It is also used as the last token of a sequence built with special tokens.
236
+ pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
237
+ The token used for padding, for example when batching sequences of different lengths.
238
+ cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
239
+ The classifier token which is used when doing sequence classification (classification of the whole
240
+ sequence instead of per-token classification). It is the first token of the sequence when built with
241
+ special tokens.
242
+ mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
243
+ The token used for masking values. This is the token used when training this model with masked language
244
+ modeling. This is the token which the model will try to predict.
245
+ """
246
+
247
+ def __init__(
248
+ self,
249
+ vocab_file,
250
+ unk_token="[UNK]",
251
+ sep_token="[SEP]",
252
+ pad_token="[PAD]",
253
+ cls_token="[CLS]",
254
+ mask_token="[MASK]",
255
+ **kwargs
256
+ ):
257
+ super().__init__(
258
+ unk_token=unk_token,
259
+ sep_token=sep_token,
260
+ pad_token=pad_token,
261
+ cls_token=cls_token,
262
+ mask_token=mask_token,
263
+ **kwargs,
264
+ )
265
+
266
+ if not os.path.isfile(vocab_file):
267
+ raise ValueError(
268
+ "Can't find a vocabulary file at path '{}'.".format(vocab_file)
269
+ )
270
+ self.vocab = load_vocab(vocab_file)
271
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
272
+ self.tokenizer = Atomwise_Tokenizer()
273
+
274
+ @property
275
+ def vocab_size(self):
276
+ return len(self.vocab)
277
+
278
+ def get_vocab(self):
279
+ return dict(self.vocab, **self.added_tokens_encoder)
280
+
281
+ def _tokenize(self, text):
282
+ return self.tokenizer.tokenize(text)
283
+
284
+ def _convert_token_to_id(self, token):
285
+ """ Converts a token (str) in an id using the vocab. """
286
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
287
+
288
+ def _convert_id_to_token(self, index):
289
+ """Converts an index (integer) in a token (str) using the vocab."""
290
+ return self.ids_to_tokens.get(index, self.unk_token)
291
+
292
+ def convert_tokens_to_string(self, tokens):
293
+ """ Converts a sequence of tokens (string) in a single string. """
294
+ out_string = " ".join(tokens).replace(" ##", "").strip()
295
+ return out_string
296
+
297
+ def build_inputs_with_special_tokens(
298
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
299
+ ) -> List[int]:
300
+ """
301
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks
302
+ by concatenating and adding special tokens.
303
+ A BERT sequence has the following format:
304
+ - single sequence: ``[CLS] X [SEP]``
305
+ - pair of sequences: ``[CLS] A [SEP] B [SEP]``
306
+ Args:
307
+ token_ids_0 (:obj:`List[int]`):
308
+ List of IDs to which the special tokens will be added
309
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
310
+ Optional second list of IDs for sequence pairs.
311
+ Returns:
312
+ :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
313
+ """
314
+ if token_ids_1 is None:
315
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
316
+ cls = [self.cls_token_id]
317
+ sep = [self.sep_token_id]
318
+ return cls + token_ids_0 + sep + token_ids_1 + sep
319
+
320
+ def get_special_tokens_mask(
321
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
322
+ ) -> List[int]:
323
+ """
324
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
325
+ special tokens using the tokenizer ``prepare_for_model`` method.
326
+ Args:
327
+ token_ids_0 (:obj:`List[int]`):
328
+ List of ids.
329
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
330
+ Optional second list of IDs for sequence pairs.
331
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
332
+ Set to True if the token list is already formatted with special tokens for the model
333
+ Returns:
334
+ :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
335
+ """
336
+
337
+ if already_has_special_tokens:
338
+ if token_ids_1 is not None:
339
+ raise ValueError(
340
+ "You should not supply a second sequence if the provided sequence of "
341
+ "ids is already formated with special tokens for the model."
342
+ )
343
+ return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
344
+
345
+ if token_ids_1 is not None:
346
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
347
+ return [1] + ([0] * len(token_ids_0)) + [1]
348
+
349
+ def create_token_type_ids_from_sequences(
350
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
351
+ ) -> List[int]:
352
+ """
353
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
354
+ A BERT sequence pair mask has the following format:
355
+ ::
356
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
357
+ | first sequence | second sequence |
358
+ if token_ids_1 is None, only returns the first portion of the mask (0's).
359
+ Args:
360
+ token_ids_0 (:obj:`List[int]`):
361
+ List of ids.
362
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
363
+ Optional second list of IDs for sequence pairs.
364
+ Returns:
365
+ :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
366
+ sequence(s).
367
+ """
368
+ sep = [self.sep_token_id]
369
+ cls = [self.cls_token_id]
370
+ if token_ids_1 is None:
371
+ return len(cls + token_ids_0 + sep) * [0]
372
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
373
+
374
+ def save_vocabulary(self, vocab_path):
375
+ """
376
+ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
377
+ Args:
378
+ vocab_path (:obj:`str`):
379
+ The directory in which to save the vocabulary.
380
+ Returns:
381
+ :obj:`Tuple(str)`: Paths to the files saved.
382
+ """
383
+ index = 0
384
+ if os.path.isdir(vocab_path):
385
+ vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
386
+ else:
387
+ vocab_file = vocab_path
388
+ with open(vocab_file, "w", encoding="utf-8") as writer:
389
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
390
+ if index != token_index:
391
+ logger.warning(
392
+ "Saving vocabulary to {}: vocabulary indices are not consecutive."
393
+ " Please check that the vocabulary is not corrupted!".format(vocab_file)
394
+ )
395
+ index = token_index
396
+ writer.write(token + "\n")
397
+ index += 1
398
+ return (vocab_file,)
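For reference, a minimal usage sketch of the SMILES_SPE_Tokenizer defined above. The vocabulary/SPE file names and the example SMILES are assumptions for illustration (not values taken from this commit), and behavior may vary slightly across transformers versions.

# Usage sketch (assumed file locations; point these at the actual vocab and SPE merge files).
from my_tokenizers import SMILES_SPE_Tokenizer

tokenizer = SMILES_SPE_Tokenizer(
    vocab_file="vocab.txt",        # assumed path: one token per line
    spe_file="spe_splits.txt",     # assumed path: trained SMILES Pair Encoding merges
)

smiles = "CC(=O)Oc1ccccc1C(=O)O"   # arbitrary example molecule (aspirin)
encoding = tokenizer(smiles)        # standard PreTrainedTokenizer call
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))  # [CLS] ... [SEP] around SPE tokens
print(tokenizer.decode(encoding["input_ids"]))                  # space-joined token string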
tokenizer/__pycache__/my_tokenizers.cpython-310.pyc ADDED
Binary file (15.5 kB).
 
tokenizer/my_tokenizers.py ADDED
@@ -0,0 +1,398 @@
1
+ import collections
2
+ import logging
3
+ import os
4
+ import re
5
+ import codecs
6
+ import unicodedata
7
+ from typing import List, Optional
8
+ from transformers import PreTrainedTokenizer
9
+ from SmilesPE.tokenizer import SPE_Tokenizer
10
+
11
+ def load_vocab(vocab_file):
12
+ """Loads a vocabulary file into a dictionary."""
13
+ vocab = collections.OrderedDict()
14
+ with open(vocab_file, "r", encoding="utf-8") as reader:
15
+ tokens = reader.readlines()
16
+ for index, token in enumerate(tokens):
17
+ token = token.rstrip("\n")
18
+ vocab[token] = index
19
+ return vocab
20
+
21
+ class Atomwise_Tokenizer(object):
22
+ """Run atom-level SMILES tokenization"""
23
+
24
+ def __init__(self):
25
+ """ Constructs a atom-level Tokenizer.
26
+ """
27
+ # self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
28
+ self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
29
+
30
+ self.regex = re.compile(self.regex_pattern)
31
+
32
+ def tokenize(self, text):
33
+ """ Basic Tokenization of a SMILES.
34
+ """
35
+ tokens = [token for token in self.regex.findall(text)]
36
+ return tokens
37
+
38
+ class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
39
+ r"""
40
+ Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
41
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
42
+ should refer to the superclass for more information regarding methods.
43
+ Args:
44
+ vocab_file (:obj:`string`):
45
+ File containing the vocabulary.
46
+ spe_file (:obj:`string`):
47
+ File containing the trained SMILES Pair Encoding vocabulary.
48
+ unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
49
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
50
+ token instead.
51
+ sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
52
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
53
+ for sequence classification or for a text and a question for question answering.
54
+ It is also used as the last token of a sequence built with special tokens.
55
+ pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
56
+ The token used for padding, for example when batching sequences of different lengths.
57
+ cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
58
+ The classifier token which is used when doing sequence classification (classification of the whole
59
+ sequence instead of per-token classification). It is the first token of the sequence when built with
60
+ special tokens.
61
+ mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
62
+ The token used for masking values. This is the token used when training this model with masked language
63
+ modeling. This is the token which the model will try to predict.
64
+ """
65
+
66
+ def __init__(self, vocab_file, spe_file,
67
+ unk_token="[UNK]",
68
+ sep_token="[SEP]",
69
+ pad_token="[PAD]",
70
+ cls_token="[CLS]",
71
+ mask_token="[MASK]",
72
+ **kwargs):
73
+ if not os.path.isfile(vocab_file):
74
+ raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
75
+ if not os.path.isfile(spe_file):
76
+ raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))
77
+
78
+ self.vocab = load_vocab(vocab_file)
79
+ self.spe_vocab = open(spe_file, 'r', encoding='utf-8')
80
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
81
+ self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab)
82
+
83
+ super().__init__(
84
+ unk_token=unk_token,
85
+ sep_token=sep_token,
86
+ pad_token=pad_token,
87
+ cls_token=cls_token,
88
+ mask_token=mask_token,
89
+ **kwargs)
90
+
91
+ @property
92
+ def vocab_size(self):
93
+ return len(self.vocab)
94
+
95
+ def get_vocab(self):
96
+ return dict(self.vocab, **self.added_tokens_encoder)
97
+
98
+ def _tokenize(self, text):
99
+ return self.spe_tokenizer.tokenize(text).split(' ')
100
+
101
+ def _convert_token_to_id(self, token):
102
+ """ Converts a token (str) in an id using the vocab. """
103
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
104
+
105
+ def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
106
+ text = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
107
+ return self.convert_tokens_to_string(text)
108
+
109
+ def _convert_id_to_token(self, index):
110
+ """Converts an index (integer) in a token (str) using the vocab."""
111
+ return self.ids_to_tokens.get(index, self.unk_token)
112
+
113
+ def convert_tokens_to_string(self, tokens):
114
+ """ Converts a sequence of tokens (string) in a single string. """
115
+ out_string = " ".join(tokens).replace(" ##", "").strip()
116
+ return out_string
117
+
118
+ def build_inputs_with_special_tokens(
119
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
120
+ ) -> List[int]:
121
+ """
122
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks
123
+ by concatenating and adding special tokens.
124
+ A BERT sequence has the following format:
125
+ - single sequence: ``[CLS] X [SEP]``
126
+ - pair of sequences: ``[CLS] A [SEP] B [SEP]``
127
+ Args:
128
+ token_ids_0 (:obj:`List[int]`):
129
+ List of IDs to which the special tokens will be added
130
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
131
+ Optional second list of IDs for sequence pairs.
132
+ Returns:
133
+ :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
134
+ """
135
+ if token_ids_1 is None:
136
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
137
+ cls = [self.cls_token_id]
138
+ sep = [self.sep_token_id]
139
+ return cls + token_ids_0 + sep + token_ids_1 + sep
140
+
141
+ def get_special_tokens_mask(
142
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
143
+ ) -> List[int]:
144
+ """
145
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
146
+ special tokens using the tokenizer ``prepare_for_model`` method.
147
+ Args:
148
+ token_ids_0 (:obj:`List[int]`):
149
+ List of ids.
150
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
151
+ Optional second list of IDs for sequence pairs.
152
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
153
+ Set to True if the token list is already formatted with special tokens for the model
154
+ Returns:
155
+ :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
156
+ """
157
+
158
+ if already_has_special_tokens:
159
+ if token_ids_1 is not None:
160
+ raise ValueError(
161
+ "You should not supply a second sequence if the provided sequence of "
162
+ "ids is already formatted with special tokens for the model."
163
+ )
164
+ return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
165
+
166
+ if token_ids_1 is not None:
167
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
168
+ return [1] + ([0] * len(token_ids_0)) + [1]
169
+
170
+ def create_token_type_ids_from_sequences(
171
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
172
+ ) -> List[int]:
173
+ """
174
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
175
+ A BERT sequence pair mask has the following format:
176
+ ::
177
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
178
+ | first sequence | second sequence |
179
+ if token_ids_1 is None, only returns the first portion of the mask (0's).
180
+ Args:
181
+ token_ids_0 (:obj:`List[int]`):
182
+ List of ids.
183
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
184
+ Optional second list of IDs for sequence pairs.
185
+ Returns:
186
+ :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
187
+ sequence(s).
188
+ """
189
+ sep = [self.sep_token_id]
190
+ cls = [self.cls_token_id]
191
+ if token_ids_1 is None:
192
+ return len(cls + token_ids_0 + sep) * [0]
193
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
194
+
195
+ def save_vocabulary(self, vocab_path):
196
+ """
197
+ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
198
+ Args:
199
+ vocab_path (:obj:`str`):
200
+ The directory in which to save the vocabulary.
201
+ Returns:
202
+ :obj:`Tuple(str)`: Paths to the files saved.
203
+ """
204
+ index = 0
205
+ if os.path.isdir(vocab_path):
206
+ vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
207
+ else:
208
+ vocab_file = vocab_path
209
+ with open(vocab_file, "w", encoding="utf-8") as writer:
210
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
211
+ if index != token_index:
212
+ logger.warning(
213
+ "Saving vocabulary to {}: vocabulary indices are not consecutive."
214
+ " Please check that the vocabulary is not corrupted!".format(vocab_file)
215
+ )
216
+ index = token_index
217
+ writer.write(token + "\n")
218
+ index += 1
219
+ return (vocab_file,)
220
+
221
+ class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
222
+ r"""
223
+ Constructs an atom-wise SMILES tokenizer, using the Atomwise_Tokenizer from the SmilesPE package (https://github.com/XinhaoLi74/SmilesPE).
224
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
225
+ should refer to the superclass for more information regarding methods.
226
+ Args:
227
+ vocab_file (:obj:`string`):
228
+ File containing the vocabulary.
229
+ unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
230
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
231
+ token instead.
232
+ sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
233
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
234
+ for sequence classification or for a text and a question for question answering.
235
+ It is also used as the last token of a sequence built with special tokens.
236
+ pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
237
+ The token used for padding, for example when batching sequences of different lengths.
238
+ cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
239
+ The classifier token which is used when doing sequence classification (classification of the whole
240
+ sequence instead of per-token classification). It is the first token of the sequence when built with
241
+ special tokens.
242
+ mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
243
+ The token used for masking values. This is the token used when training this model with masked language
244
+ modeling. This is the token which the model will try to predict.
245
+ """
246
+
247
+ def __init__(
248
+ self,
249
+ vocab_file,
250
+ unk_token="[UNK]",
251
+ sep_token="[SEP]",
252
+ pad_token="[PAD]",
253
+ cls_token="[CLS]",
254
+ mask_token="[MASK]",
255
+ **kwargs
256
+ ):
257
+ super().__init__(
258
+ unk_token=unk_token,
259
+ sep_token=sep_token,
260
+ pad_token=pad_token,
261
+ cls_token=cls_token,
262
+ mask_token=mask_token,
263
+ **kwargs,
264
+ )
265
+
266
+ if not os.path.isfile(vocab_file):
267
+ raise ValueError(
268
+ "Can't find a vocabulary file at path '{}'.".format(vocab_file)
269
+ )
270
+ self.vocab = load_vocab(vocab_file)
271
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
272
+ self.tokenizer = Atomwise_Tokenizer()
273
+
274
+ @property
275
+ def vocab_size(self):
276
+ return len(self.vocab)
277
+
278
+ def get_vocab(self):
279
+ return dict(self.vocab, **self.added_tokens_encoder)
280
+
281
+ def _tokenize(self, text):
282
+ return self.tokenizer.tokenize(text)
283
+
284
+ def _convert_token_to_id(self, token):
285
+ """ Converts a token (str) to an id using the vocab. """
286
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
287
+
288
+ def _convert_id_to_token(self, index):
289
+ """Converts an index (integer) to a token (str) using the vocab."""
290
+ return self.ids_to_tokens.get(index, self.unk_token)
291
+
292
+ def convert_tokens_to_string(self, tokens):
293
+ """ Converts a sequence of tokens (strings) into a single string. """
294
+ out_string = " ".join(tokens).replace(" ##", "").strip()
295
+ return out_string
296
+
297
+ def build_inputs_with_special_tokens(
298
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
299
+ ) -> List[int]:
300
+ """
301
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks
302
+ by concatenating and adding special tokens.
303
+ A BERT sequence has the following format:
304
+ - single sequence: ``[CLS] X [SEP]``
305
+ - pair of sequences: ``[CLS] A [SEP] B [SEP]``
306
+ Args:
307
+ token_ids_0 (:obj:`List[int]`):
308
+ List of IDs to which the special tokens will be added
309
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
310
+ Optional second list of IDs for sequence pairs.
311
+ Returns:
312
+ :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
313
+ """
314
+ if token_ids_1 is None:
315
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
316
+ cls = [self.cls_token_id]
317
+ sep = [self.sep_token_id]
318
+ return cls + token_ids_0 + sep + token_ids_1 + sep
319
+
320
+ def get_special_tokens_mask(
321
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
322
+ ) -> List[int]:
323
+ """
324
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
325
+ special tokens using the tokenizer ``prepare_for_model`` method.
326
+ Args:
327
+ token_ids_0 (:obj:`List[int]`):
328
+ List of ids.
329
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
330
+ Optional second list of IDs for sequence pairs.
331
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
332
+ Set to True if the token list is already formatted with special tokens for the model
333
+ Returns:
334
+ :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
335
+ """
336
+
337
+ if already_has_special_tokens:
338
+ if token_ids_1 is not None:
339
+ raise ValueError(
340
+ "You should not supply a second sequence if the provided sequence of "
341
+ "ids is already formatted with special tokens for the model."
342
+ )
343
+ return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
344
+
345
+ if token_ids_1 is not None:
346
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
347
+ return [1] + ([0] * len(token_ids_0)) + [1]
348
+
349
+ def create_token_type_ids_from_sequences(
350
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
351
+ ) -> List[int]:
352
+ """
353
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
354
+ A BERT sequence pair mask has the following format:
355
+ ::
356
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
357
+ | first sequence | second sequence |
358
+ if token_ids_1 is None, only returns the first portion of the mask (0's).
359
+ Args:
360
+ token_ids_0 (:obj:`List[int]`):
361
+ List of ids.
362
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
363
+ Optional second list of IDs for sequence pairs.
364
+ Returns:
365
+ :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
366
+ sequence(s).
367
+ """
368
+ sep = [self.sep_token_id]
369
+ cls = [self.cls_token_id]
370
+ if token_ids_1 is None:
371
+ return len(cls + token_ids_0 + sep) * [0]
372
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
373
+
374
+ def save_vocabulary(self, vocab_path):
375
+ """
376
+ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
377
+ Args:
378
+ vocab_path (:obj:`str`):
379
+ The directory in which to save the vocabulary.
380
+ Returns:
381
+ :obj:`Tuple(str)`: Paths to the files saved.
382
+ """
383
+ index = 0
384
+ if os.path.isdir(vocab_path):
385
+ vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
386
+ else:
387
+ vocab_file = vocab_path
388
+ with open(vocab_file, "w", encoding="utf-8") as writer:
389
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
390
+ if index != token_index:
391
+ logger.warning(
392
+ "Saving vocabulary to {}: vocabulary indices are not consecutive."
393
+ " Please check that the vocabulary is not corrupted!".format(vocab_file)
394
+ )
395
+ index = token_index
396
+ writer.write(token + "\n")
397
+ index += 1
398
+ return (vocab_file,)
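
Not part of the commit, for orientation only: a minimal usage sketch of the SPE tokenizer defined above, assuming the repository root is on PYTHONPATH, that torch/transformers/SmilesPE are installed, and using the new_vocab.txt / new_splits.txt files added below. The example SMILES string and max_length are illustrative.

# Minimal sketch (assumptions as stated above).
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer

tok = SMILES_SPE_Tokenizer(
    vocab_file="tokenizer/new_vocab.txt",
    spe_file="tokenizer/new_splits.txt",
)

smiles = "CC(=O)NCCc1ccccc1"  # illustrative input, not from the dataset
enc = tok(smiles, truncation=True, max_length=128, return_tensors="pt")

# ids -> tokens round trip; [CLS]/[SEP] are added by build_inputs_with_special_tokens
print(tok.convert_ids_to_tokens(enc["input_ids"][0].tolist()))
print(tok.decode(enc["input_ids"][0].tolist()))
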
tokenizer/new_splits.txt ADDED
@@ -0,0 +1,159 @@
1
+ c 1
2
+ c 2
3
+ c 3
4
+ c 4
5
+ c 5
6
+ c 6
7
+ c 7
8
+ c 8
9
+ c 9
10
+ ( c1
11
+ ( c2
12
+ c1 )
13
+ c2 )
14
+ n 1
15
+ n 2
16
+ n 3
17
+ n 4
18
+ n 5
19
+ n 6
20
+ n 7
21
+ n 8
22
+ n 9
23
+ ( n1
24
+ ( n2
25
+ n1 )
26
+ n2 )
27
+ O 1
28
+ O 2
29
+ O 3
30
+ O 4
31
+ O 5
32
+ O 6
33
+ O 7
34
+ O 8
35
+ O 9
36
+ ( O1
37
+ ( O2
38
+ O2 )
39
+ O2 )
40
+ = O
41
+ = C
42
+ = c
43
+ = N
44
+ = n
45
+ =C C
46
+ =C N
47
+ =C c
48
+ =c c
49
+ =N C
50
+ =N c
51
+ =n C
52
+ =n c
53
+ # N
54
+ # C
55
+ #N C
56
+ #C C
57
+ #C N
58
+ #N N
59
+ ( C
60
+ C )
61
+ ( O
62
+ O )
63
+ ( N
64
+ N )
65
+ Br c
66
+ ( =O
67
+ (=O )
68
+ C (=O)
69
+ C =O
70
+ C =N
71
+ C #N
72
+ C #C
73
+ C C
74
+ CC C
75
+ CC N
76
+ CC O
77
+ CC S
78
+ CC c
79
+ CC n
80
+ C N
81
+ CN C
82
+ CN c
83
+ C O
84
+ CO C
85
+ CO N
86
+ CO c
87
+ C S
88
+ CS C
89
+ CS S
90
+ CS c
91
+ C c
92
+ Cl c
93
+ C n
94
+ F c
95
+ N C
96
+ NC C
97
+ NC c
98
+ N N
99
+ N O
100
+ N c
101
+ N n
102
+ O C
103
+ OC C
104
+ OC O
105
+ OC c
106
+ O N
107
+ O O
108
+ O c
109
+ S C
110
+ SC C
111
+ SC c
112
+ S S
113
+ S c
114
+ c c
115
+ cc c
116
+ cc n
117
+ cc o
118
+ cc s
119
+ cc cc
120
+ c n
121
+ cn c
122
+ cn n
123
+ c o
124
+ co c
125
+ c s
126
+ cs c
127
+ cs n
128
+ n c
129
+ nc c
130
+ nc n
131
+ nc o
132
+ nc s
133
+ n n
134
+ nn c
135
+ nn n
136
+ n o
137
+ no c
138
+ no n
139
+ n s
140
+ ns c
141
+ ns n
142
+ o c
143
+ oc c
144
+ o n
145
+ s c
146
+ sc c
147
+ sc n
148
+ s n
149
+ N P
150
+ P N
151
+ C P
152
+ P C
153
+ N S
154
+ S N
155
+ C S
156
+ S C
157
+ S P
158
+ P S
159
+ C I
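
Note (not part of the commit): each line of new_splits.txt lists a pair of tokens to be merged, in priority order, in the style of a BPE merge table; this is the file that SPE_Tokenizer consumes via the open file handle in my_tokenizers.py above. A quick sketch of using it directly, assuming the SmilesPE package that my_tokenizers.py wraps:

# Sketch only; SPE_Tokenizer here is the SmilesPE class used by SMILES_SPE_Tokenizer above.
import codecs
from SmilesPE.tokenizer import SPE_Tokenizer

spe_vocab = codecs.open("tokenizer/new_splits.txt", "r", encoding="utf-8")
spe = SPE_Tokenizer(spe_vocab)

# tokenize() returns a space-separated string of SPE tokens (see _tokenize above)
print(spe.tokenize("CC(=O)NCCc1ccccc1").split(" "))
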
tokenizer/new_vocab.txt ADDED
@@ -0,0 +1,586 @@
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ #
7
+ %
8
+ (
9
+ )
10
+ +
11
+ -
12
+ /
13
+ 0
14
+ 1
15
+ 2
16
+ 3
17
+ 4
18
+ 5
19
+ 6
20
+ 7
21
+ 8
22
+ 9
23
+ =
24
+ @
25
+ A
26
+ B
27
+ Br
28
+ Brc
29
+ C
30
+ CC
31
+ CCC
32
+ CCN
33
+ CCO
34
+ CCS
35
+ CCc
36
+ CCn
37
+ CN
38
+ CNC
39
+ CNc
40
+ CO
41
+ COC
42
+ CON
43
+ COc
44
+ CS
45
+ CSC
46
+ CSS
47
+ CSc
48
+ Cc
49
+ Cl
50
+ Clc
51
+ Cn
52
+ F
53
+ Fc
54
+ H
55
+ I
56
+ K
57
+ L
58
+ M
59
+ N
60
+ NC
61
+ NCC
62
+ NCc
63
+ NN
64
+ NO
65
+ Nc
66
+ Nn
67
+ O
68
+ OC
69
+ OCC
70
+ OCO
71
+ OCc
72
+ ON
73
+ OO
74
+ Oc
75
+ P
76
+ R
77
+ S
78
+ SC
79
+ SCC
80
+ SCc
81
+ SS
82
+ Sc
83
+ T
84
+ X
85
+ Z
86
+ [
87
+ \\
88
+ (/
89
+ ]
90
+ a
91
+ b
92
+ c
93
+ cc
94
+ ccc
95
+ ccn
96
+ cco
97
+ ccs
98
+ cn
99
+ cnc
100
+ cnn
101
+ co
102
+ coc
103
+ cs
104
+ csc
105
+ csn
106
+ e
107
+ g
108
+ i
109
+ l
110
+ n
111
+ nc
112
+ ncc
113
+ ncn
114
+ nco
115
+ ncs
116
+ nn
117
+ nnc
118
+ nnn
119
+ no
120
+ noc
121
+ non
122
+ ns
123
+ nsc
124
+ nsn
125
+ o
126
+ oc
127
+ occ
128
+ on
129
+ p
130
+ r
131
+ s
132
+ sc
133
+ scc
134
+ scn
135
+ sn
136
+ t
137
+ c1
138
+ c2
139
+ c3
140
+ c4
141
+ c5
142
+ c6
143
+ c7
144
+ c8
145
+ c9
146
+ n1
147
+ n2
148
+ n3
149
+ n4
150
+ n5
151
+ n6
152
+ n7
153
+ n8
154
+ n9
155
+ O1
156
+ O2
157
+ O3
158
+ O4
159
+ O5
160
+ O6
161
+ O7
162
+ O8
163
+ O9
164
+ (c1
165
+ (c2
166
+ c1)
167
+ c2)
168
+ (n1
169
+ (n2
170
+ n1)
171
+ n2)
172
+ (O1
173
+ (O2
174
+ O2)
175
+ =O
176
+ =C
177
+ =c
178
+ =N
179
+ =n
180
+ =CC
181
+ =CN
182
+ =Cc
183
+ =cc
184
+ =NC
185
+ =Nc
186
+ =nC
187
+ =nc
188
+ #C
189
+ #CC
190
+ #CN
191
+ #N
192
+ #NC
193
+ #NN
194
+ (C
195
+ C)
196
+ (O
197
+ O)
198
+ (N
199
+ N)
200
+ NP
201
+ PN
202
+ CP
203
+ PC
204
+ NS
205
+ SN
206
+ SP
207
+ PS
208
+ C(=O)
209
+ (/Br)
210
+ (/C#N)
211
+ (/C)
212
+ (/C=N)
213
+ (/C=O)
214
+ (/CBr)
215
+ (/CC)
216
+ (/CCC)
217
+ (/CCF)
218
+ (/CCN)
219
+ (/CCO)
220
+ (/CCl)
221
+ (/CI)
222
+ (/CN)
223
+ (/CO)
224
+ (/CS)
225
+ (/Cl)
226
+ (/F)
227
+ (/I)
228
+ (/N)
229
+ (/NC)
230
+ (/NCC)
231
+ (/NO)
232
+ (/O)
233
+ (/OC)
234
+ (/OCC)
235
+ (/S)
236
+ (/SC)
237
+ (=C)
238
+ (=C/C)
239
+ (=C/F)
240
+ (=C/I)
241
+ (=C/N)
242
+ (=C/O)
243
+ (=CBr)
244
+ (=CC)
245
+ (=CCF)
246
+ (=CCN)
247
+ (=CCO)
248
+ (=CCl)
249
+ (=CF)
250
+ (=CI)
251
+ (=CN)
252
+ (=CO)
253
+ (=C\\C)
254
+ (=C\\F)
255
+ (=C\\I)
256
+ (=C\\N)
257
+ (=C\\O)
258
+ (=N)
259
+ (=N/C)
260
+ (=N/N)
261
+ (=N/O)
262
+ (=NBr)
263
+ (=NC)
264
+ (=NCC)
265
+ (=NCl)
266
+ (=NN)
267
+ (=NO)
268
+ (=NOC)
269
+ (=N\\C)
270
+ (=N\\N)
271
+ (=N\\O)
272
+ (=O)
273
+ (=S)
274
+ (B)
275
+ (Br)
276
+ (C#C)
277
+ (C#CC)
278
+ (C#CI)
279
+ (C#CO)
280
+ (C#N)
281
+ (C#SN)
282
+ (C)
283
+ (C=C)
284
+ (C=CF)
285
+ (C=CI)
286
+ (C=N)
287
+ (C=NN)
288
+ (C=NO)
289
+ (C=O)
290
+ (C=S)
291
+ (CBr)
292
+ (CC#C)
293
+ (CC#N)
294
+ (CC)
295
+ (CC=C)
296
+ (CC=O)
297
+ (CCBr)
298
+ (CCC)
299
+ (CCCC)
300
+ (CCCF)
301
+ (CCCI)
302
+ (CCCN)
303
+ (CCCO)
304
+ (CCCS)
305
+ (CCCl)
306
+ (CCF)
307
+ (CCI)
308
+ (CCN)
309
+ (CCNC)
310
+ (CCNN)
311
+ (CCNO)
312
+ (CCO)
313
+ (CCOC)
314
+ (CCON)
315
+ (CCS)
316
+ (CCSC)
317
+ (CCl)
318
+ (CF)
319
+ (CI)
320
+ (CN)
321
+ (CN=O)
322
+ (CNC)
323
+ (CNCC)
324
+ (CNCO)
325
+ (CNN)
326
+ (CNNC)
327
+ (CNO)
328
+ (CNOC)
329
+ (CO)
330
+ (COC)
331
+ (COCC)
332
+ (COCI)
333
+ (COCN)
334
+ (COCO)
335
+ (COF)
336
+ (CON)
337
+ (COO)
338
+ (CS)
339
+ (CSC)
340
+ (CSCC)
341
+ (CSCF)
342
+ (CSO)
343
+ (Cl)
344
+ (F)
345
+ (I)
346
+ (N)
347
+ (N=N)
348
+ (N=NO)
349
+ (N=O)
350
+ (N=S)
351
+ (NBr)
352
+ (NC#N)
353
+ (NC)
354
+ (NC=N)
355
+ (NC=O)
356
+ (NC=S)
357
+ (NCBr)
358
+ (NCC)
359
+ (NCCC)
360
+ (NCCF)
361
+ (NCCN)
362
+ (NCCO)
363
+ (NCCS)
364
+ (NCCl)
365
+ (NCNC)
366
+ (NCO)
367
+ (NCS)
368
+ (NCl)
369
+ (NN)
370
+ (NN=O)
371
+ (NNC)
372
+ (NO)
373
+ (NOC)
374
+ (O)
375
+ (OC#N)
376
+ (OC)
377
+ (OC=C)
378
+ (OC=O)
379
+ (OC=S)
380
+ (OCBr)
381
+ (OCC)
382
+ (OCCC)
383
+ (OCCF)
384
+ (OCCI)
385
+ (OCCN)
386
+ (OCCO)
387
+ (OCCS)
388
+ (OCCl)
389
+ (OCF)
390
+ (OCI)
391
+ (OCO)
392
+ (OCOC)
393
+ (OCON)
394
+ (OCSC)
395
+ (OCl)
396
+ (OI)
397
+ (ON)
398
+ (OO)
399
+ (OOC)
400
+ (OOCC)
401
+ (OOSN)
402
+ (OSC)
403
+ (P)
404
+ (S)
405
+ (SC#N)
406
+ (SC)
407
+ (SCC)
408
+ (SCCC)
409
+ (SCCF)
410
+ (SCCN)
411
+ (SCCO)
412
+ (SCCS)
413
+ (SCCl)
414
+ (SCF)
415
+ (SCN)
416
+ (SCOC)
417
+ (SCSC)
418
+ (SCl)
419
+ (SI)
420
+ (SN)
421
+ (SN=O)
422
+ (SO)
423
+ (SOC)
424
+ (SOOO)
425
+ (SS)
426
+ (SSC)
427
+ (SSCC)
428
+ ([At])
429
+ ([O-])
430
+ ([O])
431
+ ([S-])
432
+ (\\Br)
433
+ (\\C#N)
434
+ (\\C)
435
+ (\\C=N)
436
+ (\\C=O)
437
+ (\\CBr)
438
+ (\\CC)
439
+ (\\CCC)
440
+ (\\CCO)
441
+ (\\CCl)
442
+ (\\CF)
443
+ (\\CN)
444
+ (\\CNC)
445
+ (\\CO)
446
+ (\\COC)
447
+ (\\Cl)
448
+ (\\F)
449
+ (\\I)
450
+ (\\N)
451
+ (\\NC)
452
+ (\\NCC)
453
+ (\\NN)
454
+ (\\NO)
455
+ (\\NOC)
456
+ (\\O)
457
+ (\\OC)
458
+ (\\OCC)
459
+ (\\ON)
460
+ (\\S)
461
+ (\\SC)
462
+ (\\SCC)
463
+ [Ag+]
464
+ [Ag-4]
465
+ [Ag]
466
+ [Al-3]
467
+ [Al]
468
+ [As+]
469
+ [AsH3]
470
+ [AsH]
471
+ [As]
472
+ [At]
473
+ [B-]
474
+ [B@-]
475
+ [B@@-]
476
+ [BH-]
477
+ [BH2-]
478
+ [BH3-]
479
+ [B]
480
+ [Ba]
481
+ [Br+2]
482
+ [BrH]
483
+ [Br]
484
+ [C+]
485
+ [C-]
486
+ [C@@H]
487
+ [C@@]
488
+ [C@H]
489
+ [C@]
490
+ [CH-]
491
+ [CH2]
492
+ [CH3]
493
+ [CH]
494
+ [C]
495
+ [CaH2]
496
+ [Ca]
497
+ [Cl+2]
498
+ [Cl+3]
499
+ [Cl+]
500
+ [Cs]
501
+ [FH]
502
+ [F]
503
+ [H]
504
+ [He]
505
+ [I+2]
506
+ [I+3]
507
+ [I+]
508
+ [IH]
509
+ [I]
510
+ [K]
511
+ [Kr]
512
+ [Li+]
513
+ [LiH]
514
+ [MgH2]
515
+ [Mg]
516
+ [N+]
517
+ [N-]
518
+ [N@+]
519
+ [N@@+]
520
+ [N@@]
521
+ [N@]
522
+ [NH+]
523
+ [NH-]
524
+ [NH2+]
525
+ [NH3]
526
+ [NH]
527
+ [N]
528
+ [Na]
529
+ [O+]
530
+ [O-]
531
+ [OH+]
532
+ [OH2]
533
+ [OH]
534
+ [O]
535
+ [P+]
536
+ [P@+]
537
+ [P@@+]
538
+ [P@@]
539
+ [P@]
540
+ [PH2]
541
+ [PH]
542
+ [P]
543
+ [Ra]
544
+ [Rb]
545
+ [S+]
546
+ [S-]
547
+ [S@+]
548
+ [S@@+]
549
+ [S@@]
550
+ [S@]
551
+ [SH+]
552
+ [SH2]
553
+ [SH]
554
+ [S]
555
+ [Se+]
556
+ [Se-2]
557
+ [SeH2]
558
+ [SeH]
559
+ [Se]
560
+ [Si@]
561
+ [SiH2]
562
+ [SiH]
563
+ [Si]
564
+ [SrH2]
565
+ [TeH]
566
+ [Te]
567
+ [Xe]
568
+ [Zn+2]
569
+ [Zn-2]
570
+ [Zn]
571
+ [b-]
572
+ [c+]
573
+ [c-]
574
+ [cH-]
575
+ [cH]
576
+ [c]
577
+ [n+]
578
+ [n-]
579
+ [nH]
580
+ [n]
581
+ [o+]
582
+ [s+]
583
+ [se+]
584
+ [se]
585
+ [te+]
586
+ [te]
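
Note (not part of the commit): with the usual BERT-style load_vocab used by the tokenizers above, each token's id is its 0-based line position in this file, so the special tokens [PAD]..[MASK] land on ids 0-4 and the vocabulary size is 586. A tiny sketch of that mapping; load_vocab_sketch is a hypothetical stand-in, not the helper from the commit.

import collections

def load_vocab_sketch(vocab_file):
    # Hypothetical helper for illustration: id = 0-based line index.
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.rstrip("\n")] = index
    return vocab

vocab = load_vocab_sketch("tokenizer/new_vocab.txt")
print(vocab["[PAD]"], vocab["[MASK]"], len(vocab))  # 0 4 586 for the file above
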
training_classifiers/.gitignore ADDED
File without changes
training_classifiers/.ipynb_checkpoints/binding_affinity_iptm-checkpoint.py ADDED
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ extract_iptm_affinity_csv_all.py
4
+
5
+ Writes:
6
+ - out_dir/wt_iptm_affinity_all.csv
7
+ - out_dir/smiles_iptm_affinity_all.csv
8
+
9
+ Also prints:
10
+ - N
11
+ - Spearman rho (affinity vs iptm)
12
+ - Pearson r (affinity vs iptm)
13
+ """
14
+
15
+ from pathlib import Path
16
+ import numpy as np
17
+ import pandas as pd
18
+
19
+
20
+ def corr_stats(df: pd.DataFrame, x: str, y: str):
21
+ # pandas handles NaNs if we already dropped them; still be safe
22
+ xx = pd.to_numeric(df[x], errors="coerce")
23
+ yy = pd.to_numeric(df[y], errors="coerce")
24
+ m = xx.notna() & yy.notna()
25
+ xx = xx[m]
26
+ yy = yy[m]
27
+ n = int(m.sum())
28
+
29
+ # Pearson r
30
+ pearson_r = float(xx.corr(yy, method="pearson")) if n > 1 else float("nan")
31
+ # Spearman rho
32
+ spearman_rho = float(xx.corr(yy, method="spearman")) if n > 1 else float("nan")
33
+
34
+ return {"n": n, "pearson_r": pearson_r, "spearman_rho": spearman_rho}
35
+
36
+
37
+ def clean_one(
38
+ in_csv: Path,
39
+ out_csv: Path,
40
+ iptm_col: str,
41
+ affinity_col: str = "affinity",
42
+ keep_cols=(),
43
+ ):
44
+ df = pd.read_csv(in_csv)
45
+
46
+ # affinity + iptm must exist
47
+ need = [affinity_col, iptm_col]
48
+ missing = [c for c in need if c not in df.columns]
49
+ if missing:
50
+ raise ValueError(f"{in_csv} missing columns: {missing}. Found: {list(df.columns)}")
51
+
52
+ # coerce numeric
53
+ df[affinity_col] = pd.to_numeric(df[affinity_col], errors="coerce")
54
+ df[iptm_col] = pd.to_numeric(df[iptm_col], errors="coerce")
55
+
56
+ # drop NaNs in either
57
+ df = df.dropna(subset=[affinity_col, iptm_col]).reset_index(drop=True)
58
+
59
+ # output cols (standardize names)
60
+ out = pd.DataFrame({
61
+ "affinity": df[affinity_col].astype(float),
62
+ "iptm": df[iptm_col].astype(float),
63
+ })
64
+
65
+ # keep split if present (handy for coloring later, but not used for corr)
66
+ if "split" in df.columns:
67
+ out.insert(0, "split", df["split"].astype(str))
68
+
69
+ # optional extras for labeling/debug
70
+ for c in keep_cols:
71
+ if c in df.columns:
72
+ out[c] = df[c]
73
+
74
+ out_csv.parent.mkdir(parents=True, exist_ok=True)
75
+ out.to_csv(out_csv, index=False)
76
+
77
+ stats = corr_stats(out, "iptm", "affinity")
78
+ print(f"[write] {out_csv}")
79
+ print(f" N={stats['n']} | Pearson r={stats['pearson_r']:.4f} | Spearman rho={stats['spearman_rho']:.4f}")
80
+
81
+ # also save stats json next to csv
82
+ stats_path = out_csv.with_suffix(".stats.json")
83
+ with open(stats_path, "w") as f:
84
+ import json
85
+ json.dump(
86
+ {
87
+ "input_csv": str(in_csv),
88
+ "output_csv": str(out_csv),
89
+ "iptm_col": iptm_col,
90
+ "affinity_col": affinity_col,
91
+ **stats,
92
+ },
93
+ f,
94
+ indent=2,
95
+ )
96
+
97
+
98
+ def main():
99
+ import argparse
100
+ ap = argparse.ArgumentParser()
101
+ ap.add_argument("--wt_meta_csv", type=str, required=True)
102
+ ap.add_argument("--smiles_meta_csv", type=str, required=True)
103
+ ap.add_argument("--out_dir", type=str, required=True)
104
+
105
+ ap.add_argument("--wt_iptm_col", type=str, default="wt_iptm_score")
106
+ ap.add_argument("--smiles_iptm_col", type=str, default="smiles_iptm_score")
107
+ ap.add_argument("--affinity_col", type=str, default="affinity")
108
+ args = ap.parse_args()
109
+
110
+ out_dir = Path(args.out_dir)
111
+
112
+ clean_one(
113
+ Path(args.wt_meta_csv),
114
+ out_dir / "wt_iptm_affinity_all.csv",
115
+ iptm_col=args.wt_iptm_col,
116
+ affinity_col=args.affinity_col,
117
+ keep_cols=("seq1", "seq2", "Fasta2SMILES", "REACT_SMILES"),
118
+ )
119
+
120
+ clean_one(
121
+ Path(args.smiles_meta_csv),
122
+ out_dir / "smiles_iptm_affinity_all.csv",
123
+ iptm_col=args.smiles_iptm_col,
124
+ affinity_col=args.affinity_col,
125
+ keep_cols=("seq1", "seq2", "Fasta2SMILES", "REACT_SMILES", "smiles_sequence"),
126
+ )
127
+
128
+ print(f"\n[DONE] CSVs + stats JSONs in: {out_dir}")
129
+
130
+
131
+ if __name__ == "__main__":
132
+ main()
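
For reference (not part of the commit): when run as a script this takes --wt_meta_csv, --smiles_meta_csv and --out_dir, plus the optional column-name flags defined in main(). The correlation it reports reduces to pandas' pairwise corr after numeric coercion; a minimal sketch with toy numbers:

import pandas as pd

# Toy values, not real data; mirrors what corr_stats() computes.
toy = pd.DataFrame({
    "affinity": [6.2, 7.5, 8.1, 9.3, 5.9],
    "iptm":     [0.41, 0.55, 0.62, 0.80, 0.38],
})

xx = pd.to_numeric(toy["iptm"], errors="coerce")
yy = pd.to_numeric(toy["affinity"], errors="coerce")
print("Pearson r:", xx.corr(yy, method="pearson"))
print("Spearman rho:", xx.corr(yy, method="spearman"))
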
training_classifiers/.ipynb_checkpoints/binding_affinity_split-checkpoint.py ADDED
@@ -0,0 +1,847 @@
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import math
4
+ from pathlib import Path
5
+ import sys
6
+ from contextlib import contextmanager
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import torch
11
+
12
+ # tqdm is optional; we’ll disable it by default in notebooks
13
+ from tqdm import tqdm
14
+
15
+ sys.path.append("/vast/projects/pranam/lab/yz927/projects/Classifier_Weight")
16
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
17
+
18
+ from datasets import Dataset, DatasetDict, Features, Value, Sequence as HFSequence
19
+ from transformers import AutoTokenizer, EsmModel, AutoModelForMaskedLM
20
+
21
+ # -------------------------
22
+ # Config
23
+ # -------------------------
24
+ CSV_PATH = Path("/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/c-binding_with_openfold_scores.csv")
25
+
26
+ OUT_ROOT = Path(
27
+ "/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/training_data_cleaned/binding_affinity"
28
+ )
29
+
30
+ # WT (seq) embedding model
31
+ WT_MODEL_NAME = "facebook/esm2_t33_650M_UR50D"
32
+ WT_MAX_LEN = 1022
33
+ WT_BATCH = 32
34
+
35
+ # SMILES embedding model + tokenizer
36
+ SMI_MODEL_NAME = "aaronfeller/PeptideCLM-23M-all"
37
+ TOKENIZER_VOCAB = "/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/tokenizer/new_vocab.txt"
38
+ TOKENIZER_SPLITS = "/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/tokenizer/new_splits.txt"
39
+ SMI_MAX_LEN = 768
40
+ SMI_BATCH = 128
41
+
42
+ # Split config
43
+ TRAIN_FRAC = 0.80
44
+ RANDOM_SEED = 1986
45
+ AFFINITY_Q_BINS = 30
46
+
47
+ # Columns expected in CSV
48
+ COL_SEQ1 = "seq1"
49
+ COL_SEQ2 = "seq2"
50
+ COL_AFF = "affinity"
51
+ COL_F2S = "Fasta2SMILES"
52
+ COL_REACT = "REACT_SMILES"
53
+ COL_WT_IPTM = "wt_iptm_score"
54
+ COL_SMI_IPTM = "smiles_iptm_score"
55
+
56
+ # Device
57
+ DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
58
+
59
+ # -------------------------
60
+ # Quiet / notebook-safe output controls
61
+ # -------------------------
62
+ QUIET = True # suppress most prints
63
+ USE_TQDM = False # disable tqdm bars (recommended in Jupyter to avoid crashing)
64
+ LOG_FILE = None # optionally: OUT_ROOT / "build.log"
65
+
66
+ def log(msg: str):
67
+ if LOG_FILE is not None:
68
+ Path(LOG_FILE).parent.mkdir(parents=True, exist_ok=True)
69
+ with open(LOG_FILE, "a") as f:
70
+ f.write(msg.rstrip() + "\n")
71
+ if not QUIET:
72
+ print(msg)
73
+
74
+ def pbar(it, **kwargs):
75
+ return tqdm(it, **kwargs) if USE_TQDM else it
76
+
77
+ @contextmanager
78
+ def section(title: str):
79
+ log(f"\n=== {title} ===")
80
+ yield
81
+ log(f"=== done: {title} ===")
82
+
83
+
84
+ # -------------------------
85
+ # Helpers
86
+ # -------------------------
87
+ def has_uaa(seq: str) -> bool:
88
+ return "X" in str(seq).upper()
89
+
90
+ def affinity_to_class(a: float) -> str:
91
+ # High: >= 9 ; Moderate: [7, 9) ; Low: < 7
92
+ if a >= 9.0:
93
+ return "High"
94
+ elif a >= 7.0:
95
+ return "Moderate"
96
+ else:
97
+ return "Low"
98
+
99
+ def make_distribution_matched_split(df: pd.DataFrame) -> pd.DataFrame:
100
+ df = df.copy()
101
+
102
+ df[COL_AFF] = pd.to_numeric(df[COL_AFF], errors="coerce")
103
+ df = df.dropna(subset=[COL_AFF]).reset_index(drop=True)
104
+
105
+ df["affinity_class"] = df[COL_AFF].apply(affinity_to_class)
106
+
107
+ try:
108
+ df["aff_bin"] = pd.qcut(df[COL_AFF], q=AFFINITY_Q_BINS, duplicates="drop")
109
+ strat_col = "aff_bin"
110
+ except Exception:
111
+ df["aff_bin"] = df["affinity_class"]
112
+ strat_col = "aff_bin"
113
+
114
+ rng = np.random.RandomState(RANDOM_SEED)
115
+
116
+ df["split"] = None
117
+ for _, g in df.groupby(strat_col, observed=True):
118
+ idx = g.index.to_numpy()
119
+ rng.shuffle(idx)
120
+ n_train = int(math.floor(len(idx) * TRAIN_FRAC))
121
+ df.loc[idx[:n_train], "split"] = "train"
122
+ df.loc[idx[n_train:], "split"] = "val"
123
+
124
+ df["split"] = df["split"].fillna("train")
125
+ return df
126
+
127
+ def _summ(x):
128
+ x = np.asarray(x, dtype=float)
129
+ x = x[~np.isnan(x)]
130
+ if len(x) == 0:
131
+ return {"n": 0, "mean": np.nan, "std": np.nan, "p50": np.nan, "p95": np.nan}
132
+ return {
133
+ "n": int(len(x)),
134
+ "mean": float(np.mean(x)),
135
+ "std": float(np.std(x)),
136
+ "p50": float(np.quantile(x, 0.50)),
137
+ "p95": float(np.quantile(x, 0.95)),
138
+ }
139
+
140
+ def _len_stats(seqs):
141
+ lens = np.asarray([len(str(s)) for s in seqs], dtype=float)
142
+ if len(lens) == 0:
143
+ return {"n": 0, "mean": np.nan, "std": np.nan, "p50": np.nan, "p95": np.nan}
144
+ return {
145
+ "n": int(len(lens)),
146
+ "mean": float(lens.mean()),
147
+ "std": float(lens.std()),
148
+ "p50": float(np.quantile(lens, 0.50)),
149
+ "p95": float(np.quantile(lens, 0.95)),
150
+ }
151
+
152
+ def verify_split_before_embedding(
153
+ df2: pd.DataFrame,
154
+ affinity_col: str,
155
+ split_col: str,
156
+ seq_col: str,
157
+ iptm_col: str,
158
+ aff_class_col: str = "affinity_class",
159
+ aff_bins: int = 30,
160
+ save_report_prefix: str | None = None,
161
+ verbose: bool = False,
162
+ ):
163
+ """
164
+ Notebook-safe: by default prints only ONE line via `log()`.
165
+ Optionally writes CSV reports (stats + class proportions).
166
+ """
167
+ df2 = df2.copy()
168
+ df2[affinity_col] = pd.to_numeric(df2[affinity_col], errors="coerce")
169
+ df2[iptm_col] = pd.to_numeric(df2[iptm_col], errors="coerce")
170
+
171
+ assert split_col in df2.columns, f"Missing split col: {split_col}"
172
+ assert set(df2[split_col].dropna().unique()).issubset({"train", "val"}), f"Unexpected split values: {df2[split_col].unique()}"
173
+ assert df2[affinity_col].notna().any(), "No valid affinity values after coercion."
174
+
175
+ try:
176
+ df2["_aff_bin_dbg"] = pd.qcut(df2[affinity_col], q=aff_bins, duplicates="drop")
177
+ except Exception:
178
+ df2["_aff_bin_dbg"] = df2[aff_class_col].astype(str)
179
+
180
+ tr = df2[df2[split_col] == "train"].reset_index(drop=True)
181
+ va = df2[df2[split_col] == "val"].reset_index(drop=True)
182
+
183
+ tr_aff = _summ(tr[affinity_col].to_numpy())
184
+ va_aff = _summ(va[affinity_col].to_numpy())
185
+ tr_len = _len_stats(tr[seq_col].tolist())
186
+ va_len = _len_stats(va[seq_col].tolist())
187
+
188
+ # bin drift
189
+ bin_ct = (
190
+ df2.groupby([split_col, "_aff_bin_dbg"])
191
+ .size()
192
+ .groupby(level=0)
193
+ .apply(lambda s: s / s.sum())
194
+ )
195
+ tr_bins = bin_ct.loc["train"]
196
+ va_bins = bin_ct.loc["val"]
197
+ all_bins = tr_bins.index.union(va_bins.index)
198
+ tr_bins = tr_bins.reindex(all_bins, fill_value=0.0)
199
+ va_bins = va_bins.reindex(all_bins, fill_value=0.0)
200
+ max_bin_diff = float(np.max(np.abs(tr_bins.values - va_bins.values)))
201
+
202
+ msg = (
203
+ f"[split-check] rows={len(df2)} train={len(tr)} val={len(va)} | "
204
+ f"aff(mean±std) train={tr_aff['mean']:.3f}±{tr_aff['std']:.3f} val={va_aff['mean']:.3f}±{va_aff['std']:.3f} | "
205
+ f"len(p50/p95) train={tr_len['p50']:.1f}/{tr_len['p95']:.1f} val={va_len['p50']:.1f}/{va_len['p95']:.1f} | "
206
+ f"max_bin_diff={max_bin_diff:.4f}"
207
+ )
208
+ log(msg)
209
+
210
+ if verbose and (not QUIET):
211
+ class_ct = df2.groupby([split_col, aff_class_col]).size().unstack(fill_value=0)
212
+ class_prop = class_ct.div(class_ct.sum(axis=1), axis=0)
213
+ print("\n[verbose] affinity_class counts:\n", class_ct)
214
+ print("\n[verbose] affinity_class proportions:\n", class_prop.round(4))
215
+
216
+ if save_report_prefix is not None:
217
+ out = Path(save_report_prefix)
218
+ out.parent.mkdir(parents=True, exist_ok=True)
219
+
220
+ stats_df = pd.DataFrame([
221
+ {"split": "train", **{f"aff_{k}": v for k, v in tr_aff.items()}, **{f"len_{k}": v for k, v in tr_len.items()}},
222
+ {"split": "val", **{f"aff_{k}": v for k, v in va_aff.items()}, **{f"len_{k}": v for k, v in va_len.items()}},
223
+ ])
224
+ class_ct = df2.groupby([split_col, aff_class_col]).size().unstack(fill_value=0)
225
+ class_prop = class_ct.div(class_ct.sum(axis=1), axis=0).reset_index()
226
+
227
+ stats_df.to_csv(out.with_suffix(".stats.csv"), index=False)
228
+ class_prop.to_csv(out.with_suffix(".class_prop.csv"), index=False)
229
+
230
+
231
+ # -------------------------
232
+ # WT pooled (ESM2)
233
+ # -------------------------
234
+ @torch.no_grad()
235
+ def wt_pooled_embeddings(seqs, tokenizer, model, batch_size=32, max_length=1022):
236
+ embs = []
237
+ for i in pbar(range(0, len(seqs), batch_size)):
238
+ batch = seqs[i:i + batch_size]
239
+ inputs = tokenizer(
240
+ batch,
241
+ padding=True,
242
+ truncation=True,
243
+ max_length=max_length,
244
+ return_tensors="pt",
245
+ )
246
+ inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
247
+ out = model(**inputs)
248
+ h = out.last_hidden_state # (B, L, H)
249
+
250
+ attn = inputs["attention_mask"].unsqueeze(-1) # (B, L, 1)
251
+ summed = (h * attn).sum(dim=1) # (B, H)
252
+ denom = attn.sum(dim=1).clamp(min=1e-9) # (B, 1)
253
+ pooled = (summed / denom).detach().cpu().numpy()
254
+ embs.append(pooled)
255
+
256
+ return np.vstack(embs)
257
+
258
+
259
+ # -------------------------
260
+ # WT unpooled (ESM2)
261
+ # -------------------------
262
+ @torch.no_grad()
263
+ def wt_unpooled_one(seq, tokenizer, model, cls_id, eos_id, max_length=1022):
264
+ tok = tokenizer(seq, padding=False, truncation=True, max_length=max_length, return_tensors="pt")
265
+ tok = {k: v.to(DEVICE) for k, v in tok.items()}
266
+ out = model(**tok)
267
+ h = out.last_hidden_state[0] # (L, H)
268
+ attn = tok["attention_mask"][0].bool() # (L,)
269
+ ids = tok["input_ids"][0]
270
+
271
+ keep = attn.clone()
272
+ if cls_id is not None:
273
+ keep &= (ids != cls_id)
274
+ if eos_id is not None:
275
+ keep &= (ids != eos_id)
276
+
277
+ return h[keep].detach().cpu().to(torch.float16).numpy()
278
+
279
+ def build_wt_unpooled_dataset(df_split: pd.DataFrame, out_dir: Path, tokenizer, model):
280
+ """
281
+ Expects df_split to have:
282
+ - target_sequence (seq1)
283
+ - sequence (binder seq2; WT binder)
284
+ - label, affinity_class, COL_AFF, COL_WT_IPTM
285
+ Saves a dataset where each row contains BOTH:
286
+ - target_embedding (Lt,H), target_attention_mask, target_length
287
+ - binder_embedding (Lb,H), binder_attention_mask, binder_length
288
+ """
289
+ cls_id = tokenizer.cls_token_id
290
+ eos_id = tokenizer.eos_token_id
291
+ H = model.config.hidden_size
292
+
293
+ features = Features({
294
+ "target_sequence": Value("string"),
295
+ "sequence": Value("string"),
296
+ "label": Value("float32"),
297
+ "affinity": Value("float32"),
298
+ "affinity_class": Value("string"),
299
+
300
+ "target_embedding": HFSequence(HFSequence(Value("float16"), length=H)),
301
+ "target_attention_mask": HFSequence(Value("int8")),
302
+ "target_length": Value("int64"),
303
+
304
+ "binder_embedding": HFSequence(HFSequence(Value("float16"), length=H)),
305
+ "binder_attention_mask": HFSequence(Value("int8")),
306
+ "binder_length": Value("int64"),
307
+
308
+ COL_WT_IPTM: Value("float32"),
309
+ COL_AFF: Value("float32"),
310
+ })
311
+
312
+ def gen_rows(df: pd.DataFrame):
313
+ for r in pbar(df.itertuples(index=False), total=len(df)):
314
+ tgt = str(getattr(r, "target_sequence")).strip()
315
+ bnd = str(getattr(r, "sequence")).strip()
316
+
317
+ y = float(getattr(r, "label"))
318
+ aff = float(getattr(r, COL_AFF))
319
+ acls = str(getattr(r, "affinity_class"))
320
+
321
+ iptm = getattr(r, COL_WT_IPTM)
322
+ iptm = float(iptm) if pd.notna(iptm) else np.nan
323
+
324
+ # token embeddings for target + binder (both ESM)
325
+ t_emb = wt_unpooled_one(tgt, tokenizer, model, cls_id, eos_id, max_length=WT_MAX_LEN) # (Lt,H)
326
+ b_emb = wt_unpooled_one(bnd, tokenizer, model, cls_id, eos_id, max_length=WT_MAX_LEN) # (Lb,H)
327
+
328
+ t_list = t_emb.tolist()
329
+ b_list = b_emb.tolist()
330
+ Lt = len(t_list)
331
+ Lb = len(b_list)
332
+
333
+ yield {
334
+ "target_sequence": tgt,
335
+ "sequence": bnd,
336
+ "label": np.float32(y),
337
+ "affinity": np.float32(aff),
338
+ "affinity_class": acls,
339
+
340
+ "target_embedding": t_list,
341
+ "target_attention_mask": [1] * Lt,
342
+ "target_length": int(Lt),
343
+
344
+ "binder_embedding": b_list,
345
+ "binder_attention_mask": [1] * Lb,
346
+ "binder_length": int(Lb),
347
+
348
+ COL_WT_IPTM: np.float32(iptm) if not np.isnan(iptm) else np.float32(np.nan),
349
+ COL_AFF: np.float32(aff),
350
+ }
351
+
352
+ out_dir.mkdir(parents=True, exist_ok=True)
353
+ ds = Dataset.from_generator(lambda: gen_rows(df_split), features=features)
354
+ ds.save_to_disk(str(out_dir), max_shard_size="1GB")
355
+ return ds
356
+
357
+ def build_smiles_unpooled_paired_dataset(df_split: pd.DataFrame, out_dir: Path, wt_tokenizer, wt_model_unpooled,
358
+ smi_tok, smi_roformer):
359
+ """
360
+ df_split must have:
361
+ - target_sequence (seq1)
362
+ - sequence (binder smiles string)
363
+ - label, affinity_class, COL_AFF, COL_SMI_IPTM
364
+ Saves rows with:
365
+ target_embedding (Lt,Ht) from ESM
366
+ binder_embedding (Lb,Hb) from PeptideCLM
367
+ """
368
+ cls_id = wt_tokenizer.cls_token_id
369
+ eos_id = wt_tokenizer.eos_token_id
370
+ Ht = wt_model_unpooled.config.hidden_size
371
+
372
+ # Infer Hb from one forward pass? easiest: run one mini batch outside in main if you want.
373
+ # Here: we’ll infer from model config if available.
374
+ Hb = getattr(smi_roformer.config, "hidden_size", None)
375
+ if Hb is None:
376
+ Hb = getattr(smi_roformer.config, "dim", None)
377
+ if Hb is None:
378
+ raise ValueError("Cannot infer Hb from smi_roformer config; print(smi_roformer.config) and set Hb manually.")
379
+
380
+ features = Features({
381
+ "target_sequence": Value("string"),
382
+ "sequence": Value("string"),
383
+ "label": Value("float32"),
384
+ "affinity": Value("float32"),
385
+ "affinity_class": Value("string"),
386
+
387
+ "target_embedding": HFSequence(HFSequence(Value("float16"), length=Ht)),
388
+ "target_attention_mask": HFSequence(Value("int8")),
389
+ "target_length": Value("int64"),
390
+
391
+ "binder_embedding": HFSequence(HFSequence(Value("float16"), length=Hb)),
392
+ "binder_attention_mask": HFSequence(Value("int8")),
393
+ "binder_length": Value("int64"),
394
+
395
+ COL_SMI_IPTM: Value("float32"),
396
+ COL_AFF: Value("float32"),
397
+ })
398
+
399
+ def gen_rows(df: pd.DataFrame):
400
+ for r in pbar(df.itertuples(index=False), total=len(df)):
401
+ tgt = str(getattr(r, "target_sequence")).strip()
402
+ bnd = str(getattr(r, "sequence")).strip()
403
+
404
+ y = float(getattr(r, "label"))
405
+ aff = float(getattr(r, COL_AFF))
406
+ acls = str(getattr(r, "affinity_class"))
407
+
408
+ iptm = getattr(r, COL_SMI_IPTM)
409
+ iptm = float(iptm) if pd.notna(iptm) else np.nan
410
+
411
+ # target token embeddings (ESM)
412
+ t_emb = wt_unpooled_one(tgt, wt_tokenizer, wt_model_unpooled, cls_id, eos_id, max_length=WT_MAX_LEN)
413
+ t_list = t_emb.tolist()
414
+ Lt = len(t_list)
415
+
416
+ # binder token embeddings (PeptideCLM) — single-item batch
417
+ _, tok_list, mask_list, lengths = smiles_embed_batch_return_both(
418
+ [bnd], smi_tok, smi_roformer, max_length=SMI_MAX_LEN
419
+ )
420
+ b_emb = tok_list[0] # np.float16 (Lb, Hb)
421
+ b_list = b_emb.tolist()
422
+ Lb = int(lengths[0])
423
+ b_mask = mask_list[0].astype(np.int8).tolist()
424
+
425
+ yield {
426
+ "target_sequence": tgt,
427
+ "sequence": bnd,
428
+ "label": np.float32(y),
429
+ "affinity": np.float32(aff),
430
+ "affinity_class": acls,
431
+
432
+ "target_embedding": t_list,
433
+ "target_attention_mask": [1] * Lt,
434
+ "target_length": int(Lt),
435
+
436
+ "binder_embedding": b_list,
437
+ "binder_attention_mask": [int(x) for x in b_mask],
438
+ "binder_length": int(Lb),
439
+
440
+ COL_SMI_IPTM: np.float32(iptm) if not np.isnan(iptm) else np.float32(np.nan),
441
+ COL_AFF: np.float32(aff),
442
+ }
443
+
444
+ out_dir.mkdir(parents=True, exist_ok=True)
445
+ ds = Dataset.from_generator(lambda: gen_rows(df_split), features=features)
446
+ ds.save_to_disk(str(out_dir), max_shard_size="1GB")
447
+ return ds
448
+
449
+
450
+ # -------------------------
451
+ # SMILES pooled + unpooled (PeptideCLM)
452
+ # -------------------------
453
+ def get_special_ids(tokenizer_obj):
454
+ cand = [
455
+ getattr(tokenizer_obj, "pad_token_id", None),
456
+ getattr(tokenizer_obj, "cls_token_id", None),
457
+ getattr(tokenizer_obj, "sep_token_id", None),
458
+ getattr(tokenizer_obj, "bos_token_id", None),
459
+ getattr(tokenizer_obj, "eos_token_id", None),
460
+ getattr(tokenizer_obj, "mask_token_id", None),
461
+ ]
462
+ return sorted({x for x in cand if x is not None})
463
+
464
+ @torch.no_grad()
465
+ def smiles_embed_batch_return_both(batch_sequences, tokenizer_obj, model_roformer, max_length):
466
+ tok = tokenizer_obj(
467
+ batch_sequences,
468
+ return_tensors="pt",
469
+ padding=True,
470
+ truncation=True,
471
+ max_length=max_length,
472
+ )
473
+ input_ids = tok["input_ids"].to(DEVICE)
474
+ attention_mask = tok["attention_mask"].to(DEVICE)
475
+
476
+ outputs = model_roformer(input_ids=input_ids, attention_mask=attention_mask)
477
+ last_hidden = outputs.last_hidden_state # (B, L, H)
478
+
479
+ special_ids = get_special_ids(tokenizer_obj)
480
+ valid = attention_mask.bool()
481
+ if len(special_ids) > 0:
482
+ sid = torch.tensor(special_ids, device=DEVICE, dtype=torch.long)
483
+ if hasattr(torch, "isin"):
484
+ valid = valid & (~torch.isin(input_ids, sid))
485
+ else:
486
+ m = torch.zeros_like(valid)
487
+ for s in special_ids:
488
+ m |= (input_ids == s)
489
+ valid = valid & (~m)
490
+
491
+ valid_f = valid.unsqueeze(-1).float()
492
+ summed = torch.sum(last_hidden * valid_f, dim=1)
493
+ denom = torch.clamp(valid_f.sum(dim=1), min=1e-9)
494
+ pooled = (summed / denom).detach().cpu().numpy()
495
+
496
+ token_emb_list, mask_list, lengths = [], [], []
497
+ for b in range(last_hidden.shape[0]):
498
+ emb = last_hidden[b, valid[b]] # (Li, H)
499
+ token_emb_list.append(emb.detach().cpu().to(torch.float16).numpy())
500
+ li = emb.shape[0]
501
+ lengths.append(int(li))
502
+ mask_list.append(np.ones((li,), dtype=np.int8))
503
+
504
+ return pooled, token_emb_list, mask_list, lengths
505
+
506
+ def smiles_generate_embeddings_batched_both(seqs, tokenizer_obj, model_roformer, batch_size, max_length):
507
+ pooled_all = []
508
+ token_emb_all = []
509
+ mask_all = []
510
+ lengths_all = []
511
+
512
+ for i in pbar(range(0, len(seqs), batch_size)):
513
+ batch = seqs[i:i + batch_size]
514
+ pooled, tok_list, m_list, lens = smiles_embed_batch_return_both(
515
+ batch, tokenizer_obj, model_roformer, max_length
516
+ )
517
+ pooled_all.append(pooled)
518
+ token_emb_all.extend(tok_list)
519
+ mask_all.extend(m_list)
520
+ lengths_all.extend(lens)
521
+
522
+ return np.vstack(pooled_all), token_emb_all, mask_all, lengths_all
523
+
524
+ # -------------------------
525
+ # Target embedding cache (NO extra ESM runs)
526
+ # We will compute target pooled embeddings ONCE from WT view, then reuse for SMILES.
527
+ # -------------------------
528
+ def build_target_cache_from_wt_view(wt_view_train: pd.DataFrame, wt_view_val: pd.DataFrame):
529
+ wt_tok = AutoTokenizer.from_pretrained(WT_MODEL_NAME)
530
+ wt_model = EsmModel.from_pretrained(WT_MODEL_NAME).to(DEVICE).eval()
531
+
532
+ # compute target pooled embeddings once
533
+ tgt_wt_train = wt_view_train["target_sequence"].astype(str).tolist()
534
+ tgt_wt_val = wt_view_val["target_sequence"].astype(str).tolist()
535
+
536
+ wt_train_tgt_emb = wt_pooled_embeddings(
537
+ tgt_wt_train, wt_tok, wt_model, batch_size=WT_BATCH, max_length=WT_MAX_LEN
538
+ )
539
+ wt_val_tgt_emb = wt_pooled_embeddings(
540
+ tgt_wt_val, wt_tok, wt_model, batch_size=WT_BATCH, max_length=WT_MAX_LEN
541
+ )
542
+
543
+ # build dict: target_sequence -> embedding (float32 array)
544
+ # if duplicates exist, last wins; you can add checks if needed
545
+ train_map = {s: e for s, e in zip(tgt_wt_train, wt_train_tgt_emb)}
546
+ val_map = {s: e for s, e in zip(tgt_wt_val, wt_val_tgt_emb)}
547
+ return wt_tok, wt_model, wt_train_tgt_emb, wt_val_tgt_emb, train_map, val_map
548
+ # -------------------------
549
+ # Main
550
+ # -------------------------
551
+ def main():
552
+ log(f"[INFO] DEVICE: {DEVICE}")
553
+ OUT_ROOT.mkdir(parents=True, exist_ok=True)
554
+
555
+ # 1) Load
556
+ with section("load csv + dedup"):
557
+ df = pd.read_csv(CSV_PATH)
558
+ for c in [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT]:
559
+ if c in df.columns:
560
+ df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
561
+
562
+ # Dedup on the full identity tuple you want
563
+ DEDUP_COLS = [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT]
564
+ df = df.drop_duplicates(subset=DEDUP_COLS).reset_index(drop=True)
565
+
566
+ print("Rows after dedup on", DEDUP_COLS, ":", len(df))
567
+
568
+ need = [COL_SEQ1, COL_SEQ2, COL_AFF, COL_F2S, COL_REACT, COL_WT_IPTM, COL_SMI_IPTM]
569
+ missing = [c for c in need if c not in df.columns]
570
+ if missing:
571
+ raise ValueError(f"Missing required columns: {missing}")
572
+
573
+ # numeric affinity for both branches
574
+ df[COL_AFF] = pd.to_numeric(df[COL_AFF], errors="coerce")
575
+
576
+ # 2) Build WT subset + SMILES subset separately (NO global dropping)
577
+ with section("prepare wt/smiles subsets"):
578
+ # WT: requires a canonical peptide sequence (no X) + affinity
579
+ df_wt = df.copy()
580
+ df_wt["wt_sequence"] = df_wt[COL_SEQ2].astype(str).str.strip()
581
+ df_wt = df_wt.dropna(subset=[COL_AFF]).reset_index(drop=True)
582
+ df_wt = df_wt[df_wt["wt_sequence"].notna() & (df_wt["wt_sequence"] != "")]
583
+ df_wt = df_wt[~df_wt["wt_sequence"].str.contains("X", case=False, na=False)].reset_index(drop=True)
584
+
585
+ # SMILES: requires affinity + a usable picked SMILES (UAA->REACT, else->Fasta2SMILES)
586
+ df_smi = df.copy()
587
+ df_smi = df_smi.dropna(subset=[COL_AFF]).reset_index(drop=True)
588
+ df_smi = df_smi[
589
+ pd.to_numeric(df_smi[COL_SMI_IPTM], errors="coerce").notna()
590
+ ].reset_index(drop=True) # an empty iptm indicates something is wrong with the SMILES sequence
591
+
592
+ is_uaa = df_smi[COL_SEQ2].astype(str).str.contains("X", case=False, na=False)
593
+ df_smi["smiles_sequence"] = np.where(is_uaa, df_smi[COL_REACT], df_smi[COL_F2S])
594
+ df_smi["smiles_sequence"] = df_smi["smiles_sequence"].astype(str).str.strip()
595
+ df_smi = df_smi[df_smi["smiles_sequence"].notna() & (df_smi["smiles_sequence"] != "")]
596
+ df_smi = df_smi[~df_smi["smiles_sequence"].isin(["nan", "None"])].reset_index(drop=True)
597
+
598
+ log(f"[counts] WT rows={len(df_wt)} | SMILES rows={len(df_smi)} (after per-branch filtering)")
599
+
600
+ # 3) Split separately (different sizes and memberships are expected)
601
+ with section("split wt and smiles separately"):
602
+ df_wt2 = make_distribution_matched_split(df_wt)
603
+ df_smi2 = make_distribution_matched_split(df_smi)
604
+
605
+ # save split tables
606
+ wt_split_csv = OUT_ROOT / "binding_affinity_wt_meta_with_split.csv"
607
+ smi_split_csv = OUT_ROOT / "binding_affinity_smiles_meta_with_split.csv"
608
+ df_wt2.to_csv(wt_split_csv, index=False)
609
+ df_smi2.to_csv(smi_split_csv, index=False)
610
+ log(f"Saved WT split meta: {wt_split_csv}")
611
+ log(f"Saved SMILES split meta: {smi_split_csv}")
612
+
613
+ # lightweight double-check (one-line)
614
+ verify_split_before_embedding(
615
+ df2=df_wt2,
616
+ affinity_col=COL_AFF,
617
+ split_col="split",
618
+ seq_col="wt_sequence",
619
+ iptm_col=COL_WT_IPTM,
620
+ aff_class_col="affinity_class",
621
+ aff_bins=AFFINITY_Q_BINS,
622
+ save_report_prefix=str(OUT_ROOT / "wt_split_doublecheck_report"),
623
+ verbose=False,
624
+ )
625
+ verify_split_before_embedding(
626
+ df2=df_smi2,
627
+ affinity_col=COL_AFF,
628
+ split_col="split",
629
+ seq_col="smiles_sequence",
630
+ iptm_col=COL_SMI_IPTM,
631
+ aff_class_col="affinity_class",
632
+ aff_bins=AFFINITY_Q_BINS,
633
+ save_report_prefix=str(OUT_ROOT / "smiles_split_doublecheck_report"),
634
+ verbose=False,
635
+ )
636
+
637
+ # Prepare split views
638
+ def prep_view(df_in: pd.DataFrame, binder_seq_col: str, iptm_col: str) -> pd.DataFrame:
639
+ out = df_in.copy()
640
+ out["target_sequence"] = out[COL_SEQ1].astype(str).str.strip() # <-- NEW
641
+ out["sequence"] = out[binder_seq_col].astype(str).str.strip() # binder
642
+ out["label"] = pd.to_numeric(out[COL_AFF], errors="coerce")
643
+ out[iptm_col] = pd.to_numeric(out[iptm_col], errors="coerce")
644
+ out[COL_AFF] = pd.to_numeric(out[COL_AFF], errors="coerce")
645
+ out = out.dropna(subset=["target_sequence", "sequence", "label"]).reset_index(drop=True)
646
+ return out[["target_sequence", "sequence", "label", "split", iptm_col, COL_AFF, "affinity_class"]]
647
+
648
+ wt_view = prep_view(df_wt2, "wt_sequence", COL_WT_IPTM)
649
+ smi_view = prep_view(df_smi2, "smiles_sequence", COL_SMI_IPTM)
650
+
651
+ # -------------------------
652
+ # Split views
653
+ # -------------------------
654
+ wt_train = wt_view[wt_view["split"] == "train"].reset_index(drop=True)
655
+ wt_val = wt_view[wt_view["split"] == "val"].reset_index(drop=True)
656
+ smi_train = smi_view[smi_view["split"] == "train"].reset_index(drop=True)
657
+ smi_val = smi_view[smi_view["split"] == "val"].reset_index(drop=True)
658
+
659
+
660
+ # =========================
661
+ # TARGET pooled embeddings (ESM) — SEPARATE per branch
662
+ # =========================
663
+ with section("TARGET pooled embeddings (ESM) — WT + SMILES separately"):
664
+ wt_tok = AutoTokenizer.from_pretrained(WT_MODEL_NAME)
665
+ wt_esm = EsmModel.from_pretrained(WT_MODEL_NAME).to(DEVICE).eval()
666
+
667
+ # ---- WT targets ----
668
+ wt_train_tgt_emb = wt_pooled_embeddings(
669
+ wt_train["target_sequence"].astype(str).str.strip().tolist(),
670
+ wt_tok, wt_esm,
671
+ batch_size=WT_BATCH,
672
+ max_length=WT_MAX_LEN,
673
+ ).astype(np.float32)
674
+
675
+ wt_val_tgt_emb = wt_pooled_embeddings(
676
+ wt_val["target_sequence"].astype(str).str.strip().tolist(),
677
+ wt_tok, wt_esm,
678
+ batch_size=WT_BATCH,
679
+ max_length=WT_MAX_LEN,
680
+ ).astype(np.float32)
681
+
682
+ # ---- SMILES targets (independent; may include UAA-only targets) ----
683
+ smi_train_tgt_emb = wt_pooled_embeddings(
684
+ smi_train["target_sequence"].astype(str).str.strip().tolist(),
685
+ wt_tok, wt_esm,
686
+ batch_size=WT_BATCH,
687
+ max_length=WT_MAX_LEN,
688
+ ).astype(np.float32)
689
+
690
+ smi_val_tgt_emb = wt_pooled_embeddings(
691
+ smi_val["target_sequence"].astype(str).str.strip().tolist(),
692
+ wt_tok, wt_esm,
693
+ batch_size=WT_BATCH,
694
+ max_length=WT_MAX_LEN,
695
+ ).astype(np.float32)
696
+
697
+
698
+ # =========================
699
+ # WT pooled binder embeddings (binder = WT peptide)
700
+ # =========================
701
+ with section("WT pooled binder embeddings + save"):
702
+ wt_train_emb = wt_pooled_embeddings(
703
+ wt_train["sequence"].astype(str).str.strip().tolist(),
704
+ wt_tok, wt_esm,
705
+ batch_size=WT_BATCH,
706
+ max_length=WT_MAX_LEN,
707
+ ).astype(np.float32)
708
+
709
+ wt_val_emb = wt_pooled_embeddings(
710
+ wt_val["sequence"].astype(str).str.strip().tolist(),
711
+ wt_tok, wt_esm,
712
+ batch_size=WT_BATCH,
713
+ max_length=WT_MAX_LEN,
714
+ ).astype(np.float32)
715
+
716
+ wt_train_ds = Dataset.from_dict({
717
+ "target_sequence": wt_train["target_sequence"].tolist(),
718
+ "sequence": wt_train["sequence"].tolist(),
719
+ "label": wt_train["label"].astype(float).tolist(),
720
+ "target_embedding": wt_train_tgt_emb,
721
+ "embedding": wt_train_emb,
722
+ COL_WT_IPTM: wt_train[COL_WT_IPTM].astype(float).tolist(),
723
+ COL_AFF: wt_train[COL_AFF].astype(float).tolist(),
724
+ "affinity_class": wt_train["affinity_class"].tolist(),
725
+ })
726
+
727
+ wt_val_ds = Dataset.from_dict({
728
+ "target_sequence": wt_val["target_sequence"].tolist(),
729
+ "sequence": wt_val["sequence"].tolist(),
730
+ "label": wt_val["label"].astype(float).tolist(),
731
+ "target_embedding": wt_val_tgt_emb,
732
+ "embedding": wt_val_emb,
733
+ COL_WT_IPTM: wt_val[COL_WT_IPTM].astype(float).tolist(),
734
+ COL_AFF: wt_val[COL_AFF].astype(float).tolist(),
735
+ "affinity_class": wt_val["affinity_class"].tolist(),
736
+ })
737
+
738
+ wt_pooled_dd = DatasetDict({"train": wt_train_ds, "val": wt_val_ds})
739
+ wt_pooled_out = OUT_ROOT / "pair_wt_wt_pooled"
740
+ wt_pooled_dd.save_to_disk(str(wt_pooled_out))
741
+ log(f"Saved WT pooled -> {wt_pooled_out}")
742
+
743
+
744
+ # =========================
745
+ # SMILES pooled binder embeddings (binder = SMILES via PeptideCLM)
746
+ # =========================
747
+ with section("SMILES pooled binder embeddings + save"):
748
+ smi_tok = SMILES_SPE_Tokenizer(TOKENIZER_VOCAB, TOKENIZER_SPLITS)
749
+ smi_roformer = (
750
+ AutoModelForMaskedLM
751
+ .from_pretrained(SMI_MODEL_NAME)
752
+ .roformer
753
+ .to(DEVICE)
754
+ .eval()
755
+ )
756
+
757
+ smi_train_pooled, _, _, _ = smiles_generate_embeddings_batched_both(
758
+ smi_train["sequence"].astype(str).str.strip().tolist(),
759
+ smi_tok, smi_roformer,
760
+ batch_size=SMI_BATCH,
761
+ max_length=SMI_MAX_LEN,
762
+ )
763
+
764
+ smi_val_pooled, _, _, _ = smiles_generate_embeddings_batched_both(
765
+ smi_val["sequence"].astype(str).str.strip().tolist(),
766
+ smi_tok, smi_roformer,
767
+ batch_size=SMI_BATCH,
768
+ max_length=SMI_MAX_LEN,
769
+ )
770
+
771
+ smi_train_ds = Dataset.from_dict({
772
+ "target_sequence": smi_train["target_sequence"].tolist(),
773
+ "sequence": smi_train["sequence"].tolist(),
774
+ "label": smi_train["label"].astype(float).tolist(),
775
+ "target_embedding": smi_train_tgt_emb,
776
+ "embedding": smi_train_pooled.astype(np.float32),
777
+ COL_SMI_IPTM: smi_train[COL_SMI_IPTM].astype(float).tolist(),
778
+ COL_AFF: smi_train[COL_AFF].astype(float).tolist(),
779
+ "affinity_class": smi_train["affinity_class"].tolist(),
780
+ })
781
+
782
+ smi_val_ds = Dataset.from_dict({
783
+ "target_sequence": smi_val["target_sequence"].tolist(),
784
+ "sequence": smi_val["sequence"].tolist(),
785
+ "label": smi_val["label"].astype(float).tolist(),
786
+ "target_embedding": smi_val_tgt_emb,
787
+ "embedding": smi_val_pooled.astype(np.float32),
788
+ COL_SMI_IPTM: smi_val[COL_SMI_IPTM].astype(float).tolist(),
789
+ COL_AFF: smi_val[COL_AFF].astype(float).tolist(),
790
+ "affinity_class": smi_val["affinity_class"].tolist(),
791
+ })
792
+
793
+ smi_pooled_dd = DatasetDict({"train": smi_train_ds, "val": smi_val_ds})
794
+ smi_pooled_out = OUT_ROOT / "pair_wt_smiles_pooled"
795
+ smi_pooled_dd.save_to_disk(str(smi_pooled_out))
796
+ log(f"Saved SMILES pooled -> {smi_pooled_out}")
797
+
798
+
799
+ # =========================
800
+ # WT unpooled paired (ESM target + ESM binder) + save
801
+ # =========================
802
+ with section("WT unpooled paired embeddings + save"):
803
+ wt_tok_unpooled = wt_tok # reuse tokenizer
804
+ wt_esm_unpooled = wt_esm # reuse model
805
+
806
+ wt_unpooled_out = OUT_ROOT / "pair_wt_wt_unpooled"
807
+ wt_unpooled_dd = DatasetDict({
808
+ "train": build_wt_unpooled_dataset(wt_train, wt_unpooled_out / "train",
809
+ wt_tok_unpooled, wt_esm_unpooled),
810
+ "val": build_wt_unpooled_dataset(wt_val, wt_unpooled_out / "val",
811
+ wt_tok_unpooled, wt_esm_unpooled),
812
+ })
813
+ # (Optional) also save as DatasetDict root if you want a single load_from_disk path:
814
+ wt_unpooled_dd.save_to_disk(str(wt_unpooled_out))
815
+ log(f"Saved WT unpooled -> {wt_unpooled_out}")
816
+
817
+
818
+ # =========================
819
+ # SMILES unpooled paired (ESM target + PeptideCLM binder) + save
820
+ # =========================
821
+ with section("SMILES unpooled paired embeddings + save"):
822
+ # reuse already-loaded smi_tok/smi_roformer from pooled section if still in scope;
823
+ # otherwise re-init here:
824
+ # smi_tok = SMILES_SPE_Tokenizer(TOKENIZER_VOCAB, TOKENIZER_SPLITS)
825
+ # smi_roformer = AutoModelForMaskedLM.from_pretrained(SMI_MODEL_NAME).roformer.to(DEVICE).eval()
826
+
827
+ smi_unpooled_out = OUT_ROOT / "pair_wt_smiles_unpooled"
828
+ smi_unpooled_dd = DatasetDict({
829
+ "train": build_smiles_unpooled_paired_dataset(
830
+ smi_train, smi_unpooled_out / "train",
831
+ wt_tok, wt_esm,
832
+ smi_tok, smi_roformer
833
+ ),
834
+ "val": build_smiles_unpooled_paired_dataset(
835
+ smi_val, smi_unpooled_out / "val",
836
+ wt_tok, wt_esm,
837
+ smi_tok, smi_roformer
838
+ ),
839
+ })
840
+ smi_unpooled_dd.save_to_disk(str(smi_unpooled_out))
841
+ log(f"Saved SMILES unpooled -> {smi_unpooled_out}")
842
+
843
+ log(f"\n[DONE] All datasets saved under: {OUT_ROOT}")
844
+
845
+
846
+ if __name__ == "__main__":
847
+ main()
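
For reference, a minimal sketch of loading one of the DatasetDicts saved above back for inspection; the local root path is an assumption, and the column list is abbreviated:

from datasets import load_from_disk

# Load the pooled WT/WT pair dataset written under OUT_ROOT (placeholder root used here).
dd = load_from_disk("training_data_cleaned/pair_wt_wt_pooled")
print(dd)                           # DatasetDict with "train" and "val" splits
print(dd["train"].column_names)     # target_sequence, sequence, label, target_embedding, embedding, ...
row = dd["train"][0]
print(len(row["target_embedding"]), len(row["embedding"]))  # pooled target / binder vector sizes
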
training_classifiers/.ipynb_checkpoints/binding_training-checkpoint.py ADDED
@@ -0,0 +1,414 @@
1
+ import os, json
2
+ from pathlib import Path
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.utils.data import DataLoader
7
+ import optuna
8
+ from datasets import load_from_disk, DatasetDict
9
+ from scipy.stats import spearmanr
10
+ from lightning.pytorch import seed_everything
11
+ seed_everything(1986)
12
+
13
+ DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
14
+
15
+
16
+ def safe_spearmanr(y_true: np.ndarray, y_pred: np.ndarray) -> float:
17
+ rho = spearmanr(y_true, y_pred).correlation
18
+ if rho is None or np.isnan(rho):
19
+ return 0.0
20
+ return float(rho)
21
+
22
+
23
+ # -----------------------------
24
+ # Affinity class thresholds (final spec)
25
+ # High >= 9 ; Moderate 7 <= y < 9 ; Low < 7
26
+ # 0=High, 1=Moderate, 2=Low
27
+ # -----------------------------
28
+ def affinity_to_class_tensor(y: torch.Tensor) -> torch.Tensor:
29
+ high = y >= 9.0
30
+ low = y < 7.0
31
+ mid = ~(high | low)
32
+ cls = torch.zeros_like(y, dtype=torch.long)
33
+ cls[mid] = 1
34
+ cls[low] = 2
35
+ return cls
36
+
37
+
38
+ # -----------------------------
39
+ # Load paired DatasetDict
40
+ # -----------------------------
41
+ def load_split_paired(path: str):
42
+ dd = load_from_disk(path)
43
+ if not isinstance(dd, DatasetDict):
44
+ raise ValueError(f"Expected DatasetDict at {path}")
45
+ if "train" not in dd or "val" not in dd:
46
+ raise ValueError(f"DatasetDict missing train/val at {path}")
47
+ return dd["train"], dd["val"]
48
+
49
+
50
+ # -----------------------------
51
+ # Collate: pooled paired
52
+ # -----------------------------
53
+ def collate_pair_pooled(batch):
54
+ Pt = torch.tensor([x["target_embedding"] for x in batch], dtype=torch.float32) # (B,Ht)
55
+ Pb = torch.tensor([x["binder_embedding"] for x in batch], dtype=torch.float32) # (B,Hb)
56
+ y = torch.tensor([float(x["label"]) for x in batch], dtype=torch.float32)
57
+ return Pt, Pb, y
58
+
59
+
60
+ # -----------------------------
61
+ # Collate: unpooled paired
62
+ # -----------------------------
63
+ def collate_pair_unpooled(batch):
64
+ B = len(batch)
65
+ Ht = len(batch[0]["target_embedding"][0])
66
+ Hb = len(batch[0]["binder_embedding"][0])
67
+ Lt_max = max(int(x["target_length"]) for x in batch)
68
+ Lb_max = max(int(x["binder_length"]) for x in batch)
69
+
70
+ Pt = torch.zeros(B, Lt_max, Ht, dtype=torch.float32)
71
+ Pb = torch.zeros(B, Lb_max, Hb, dtype=torch.float32)
72
+ Mt = torch.zeros(B, Lt_max, dtype=torch.bool)
73
+ Mb = torch.zeros(B, Lb_max, dtype=torch.bool)
74
+ y = torch.tensor([float(x["label"]) for x in batch], dtype=torch.float32)
75
+
76
+ for i, x in enumerate(batch):
77
+ t = torch.tensor(x["target_embedding"], dtype=torch.float32)
78
+ b = torch.tensor(x["binder_embedding"], dtype=torch.float32)
79
+ lt, lb = t.shape[0], b.shape[0]
80
+ Pt[i, :lt] = t
81
+ Pb[i, :lb] = b
82
+ Mt[i, :lt] = torch.tensor(x["target_attention_mask"][:lt], dtype=torch.bool)
83
+ Mb[i, :lb] = torch.tensor(x["binder_attention_mask"][:lb], dtype=torch.bool)
84
+
85
+ return Pt, Mt, Pb, Mb, y
86
+
87
+
88
+ # -----------------------------
89
+ # Cross-attention models
90
+ # -----------------------------
91
+ class CrossAttnPooled(nn.Module):
92
+ """
93
+ pooled vectors -> treat as single-token sequences for cross attention
94
+ """
95
+ def __init__(self, Ht, Hb, hidden=512, n_heads=8, n_layers=3, dropout=0.1):
96
+ super().__init__()
97
+ self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
98
+ self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
99
+
100
+ self.layers = nn.ModuleList([])
101
+ for _ in range(n_layers):
102
+ self.layers.append(nn.ModuleDict({
103
+ "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
104
+ "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
105
+ "n1t": nn.LayerNorm(hidden),
106
+ "n2t": nn.LayerNorm(hidden),
107
+ "n1b": nn.LayerNorm(hidden),
108
+ "n2b": nn.LayerNorm(hidden),
109
+ "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
110
+ "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
111
+ }))
112
+
113
+ self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
114
+ self.reg = nn.Linear(hidden, 1)
115
+ self.cls = nn.Linear(hidden, 3)
116
+
117
+ def forward(self, t_vec, b_vec):
118
+ # (B,Ht),(B,Hb)
119
+ t = self.t_proj(t_vec).unsqueeze(0) # (1,B,H)
120
+ b = self.b_proj(b_vec).unsqueeze(0) # (1,B,H)
121
+
122
+ for L in self.layers:
123
+ t_attn, _ = L["attn_tb"](t, b, b)
124
+ t = L["n1t"]((t + t_attn).transpose(0,1)).transpose(0,1)
125
+ t = L["n2t"]((t + L["fft"](t)).transpose(0,1)).transpose(0,1)
126
+
127
+ b_attn, _ = L["attn_bt"](b, t, t)
128
+ b = L["n1b"]((b + b_attn).transpose(0,1)).transpose(0,1)
129
+ b = L["n2b"]((b + L["ffb"](b)).transpose(0,1)).transpose(0,1)
130
+
131
+ t0 = t[0]
132
+ b0 = b[0]
133
+ z = torch.cat([t0, b0], dim=-1)
134
+ h = self.shared(z)
135
+ return self.reg(h).squeeze(-1), self.cls(h)
136
+
137
+
138
+ class CrossAttnUnpooled(nn.Module):
139
+ """
140
+ token sequences with masks; alternating cross attention.
141
+ """
142
+ def __init__(self, Ht, Hb, hidden=512, n_heads=8, n_layers=3, dropout=0.1):
143
+ super().__init__()
144
+ self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
145
+ self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
146
+
147
+ self.layers = nn.ModuleList([])
148
+ for _ in range(n_layers):
149
+ self.layers.append(nn.ModuleDict({
150
+ "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
151
+ "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
152
+ "n1t": nn.LayerNorm(hidden),
153
+ "n2t": nn.LayerNorm(hidden),
154
+ "n1b": nn.LayerNorm(hidden),
155
+ "n2b": nn.LayerNorm(hidden),
156
+ "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
157
+ "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
158
+ }))
159
+
160
+ self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
161
+ self.reg = nn.Linear(hidden, 1)
162
+ self.cls = nn.Linear(hidden, 3)
163
+
164
+ def masked_mean(self, X, M):
165
+ Mf = M.unsqueeze(-1).float()
166
+ denom = Mf.sum(dim=1).clamp(min=1.0)
167
+ return (X * Mf).sum(dim=1) / denom
168
+
169
+ def forward(self, T, Mt, B, Mb):
170
+ # T:(B,Lt,Ht), Mt:(B,Lt) ; B:(B,Lb,Hb), Mb:(B,Lb)
171
+ T = self.t_proj(T)
172
+ Bx = self.b_proj(B)
173
+
174
+ kp_t = ~Mt # key_padding_mask True = pad
175
+ kp_b = ~Mb
176
+
177
+ for L in self.layers:
178
+ # T attends to B
179
+ T_attn, _ = L["attn_tb"](T, Bx, Bx, key_padding_mask=kp_b)
180
+ T = L["n1t"](T + T_attn)
181
+ T = L["n2t"](T + L["fft"](T))
182
+
183
+ # B attends to T
184
+ B_attn, _ = L["attn_bt"](Bx, T, T, key_padding_mask=kp_t)
185
+ Bx = L["n1b"](Bx + B_attn)
186
+ Bx = L["n2b"](Bx + L["ffb"](Bx))
187
+
188
+ t_pool = self.masked_mean(T, Mt)
189
+ b_pool = self.masked_mean(Bx, Mb)
190
+ z = torch.cat([t_pool, b_pool], dim=-1)
191
+ h = self.shared(z)
192
+ return self.reg(h).squeeze(-1), self.cls(h)
193
+
194
+
195
+ # -----------------------------
196
+ # Train/eval
197
+ # -----------------------------
198
+ @torch.no_grad()
199
+ def eval_spearman_pooled(model, loader):
200
+ model.eval()
201
+ ys, ps = [], []
202
+ for t, b, y in loader:
203
+ t = t.to(DEVICE, non_blocking=True)
204
+ b = b.to(DEVICE, non_blocking=True)
205
+ pred, _ = model(t, b)
206
+ ys.append(y.numpy())
207
+ ps.append(pred.detach().cpu().numpy())
208
+ return safe_spearmanr(np.concatenate(ys), np.concatenate(ps))
209
+
210
+ @torch.no_grad()
211
+ def eval_spearman_unpooled(model, loader):
212
+ model.eval()
213
+ ys, ps = [], []
214
+ for T, Mt, B, Mb, y in loader:
215
+ T = T.to(DEVICE, non_blocking=True)
216
+ Mt = Mt.to(DEVICE, non_blocking=True)
217
+ B = B.to(DEVICE, non_blocking=True)
218
+ Mb = Mb.to(DEVICE, non_blocking=True)
219
+ pred, _ = model(T, Mt, B, Mb)
220
+ ys.append(y.numpy())
221
+ ps.append(pred.detach().cpu().numpy())
222
+ return safe_spearmanr(np.concatenate(ys), np.concatenate(ps))
223
+
224
+ def train_one_epoch_pooled(model, loader, opt, loss_reg, loss_cls, cls_w=1.0, clip=1.0):
225
+ model.train()
226
+ for t, b, y in loader:
227
+ t = t.to(DEVICE, non_blocking=True)
228
+ b = b.to(DEVICE, non_blocking=True)
229
+ y = y.to(DEVICE, non_blocking=True)
230
+ y_cls = affinity_to_class_tensor(y)
231
+
232
+ opt.zero_grad(set_to_none=True)
233
+ pred, logits = model(t, b)
234
+ L = loss_reg(pred, y) + cls_w * loss_cls(logits, y_cls)
235
+ L.backward()
236
+ if clip is not None:
237
+ torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
238
+ opt.step()
239
+
240
+ def train_one_epoch_unpooled(model, loader, opt, loss_reg, loss_cls, cls_w=1.0, clip=1.0):
241
+ model.train()
242
+ for T, Mt, B, Mb, y in loader:
243
+ T = T.to(DEVICE, non_blocking=True)
244
+ Mt = Mt.to(DEVICE, non_blocking=True)
245
+ B = B.to(DEVICE, non_blocking=True)
246
+ Mb = Mb.to(DEVICE, non_blocking=True)
247
+ y = y.to(DEVICE, non_blocking=True)
248
+ y_cls = affinity_to_class_tensor(y)
249
+
250
+ opt.zero_grad(set_to_none=True)
251
+ pred, logits = model(T, Mt, B, Mb)
252
+ L = loss_reg(pred, y) + cls_w * loss_cls(logits, y_cls)
253
+ L.backward()
254
+ if clip is not None:
255
+ torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
256
+ opt.step()
257
+
258
+
259
+ # -----------------------------
260
+ # Optuna objective
261
+ # -----------------------------
262
+ def objective_crossattn(trial: optuna.Trial, mode: str, train_ds, val_ds) -> float:
263
+ lr = trial.suggest_float("lr", 1e-5, 3e-3, log=True)
264
+ wd = trial.suggest_float("weight_decay", 1e-10, 1e-2, log=True)
265
+ dropout = trial.suggest_float("dropout", 0.0, 0.4)
266
+ hidden = trial.suggest_categorical("hidden_dim", [256, 384, 512, 768])
267
+ n_heads = trial.suggest_categorical("n_heads", [4, 8])
268
+ n_layers = trial.suggest_int("n_layers", 1, 4)
269
+ cls_w = trial.suggest_float("cls_weight", 0.1, 2.0, log=True)
270
+ batch = trial.suggest_categorical("batch_size", [16, 32, 64, 128])
271
+
272
+ # infer dims from first row
273
+ if mode == "pooled":
274
+ Ht = len(train_ds[0]["target_embedding"])
275
+ Hb = len(train_ds[0]["binder_embedding"])
276
+ collate = collate_pair_pooled
277
+ model = CrossAttnPooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
278
+ train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
279
+ val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, num_workers=4, pin_memory=True, collate_fn=collate)
280
+ eval_fn = eval_spearman_pooled
281
+ train_fn = train_one_epoch_pooled
282
+
283
+ else:
284
+ Ht = len(train_ds[0]["target_embedding"][0])
285
+ Hb = len(train_ds[0]["binder_embedding"][0])
286
+ collate = collate_pair_unpooled
287
+ model = CrossAttnUnpooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
288
+ train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
289
+ val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, num_workers=4, pin_memory=True, collate_fn=collate)
290
+ eval_fn = eval_spearman_unpooled
291
+ train_fn = train_one_epoch_unpooled
292
+
293
+ opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
294
+ loss_reg = nn.MSELoss()
295
+ loss_cls = nn.CrossEntropyLoss()
296
+
297
+ best = -1e9
298
+ bad = 0
299
+ patience = 10
300
+
301
+ for ep in range(1, 61):
302
+ train_fn(model, train_loader, opt, loss_reg, loss_cls, cls_w=cls_w)
303
+ rho = eval_fn(model, val_loader)
304
+
305
+ trial.report(rho, ep)
306
+ if trial.should_prune():
307
+ raise optuna.TrialPruned()
308
+
309
+ if rho > best + 1e-6:
310
+ best = rho
311
+ bad = 0
312
+ else:
313
+ bad += 1
314
+ if bad >= patience:
315
+ break
316
+
317
+ return float(best)
318
+
319
+
320
+ # -----------------------------
321
+ # Run: optuna + refit best
322
+ # -----------------------------
323
+ def run(dataset_path: str, out_dir: str, mode: str, n_trials: int = 50):
324
+ out_dir = Path(out_dir)
325
+ out_dir.mkdir(parents=True, exist_ok=True)
326
+
327
+ train_ds, val_ds = load_split_paired(dataset_path)
328
+ print(f"[Data] Train={len(train_ds)} Val={len(val_ds)} | mode={mode}")
329
+
330
+ study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
331
+ study.optimize(lambda t: objective_crossattn(t, mode, train_ds, val_ds), n_trials=n_trials)
332
+
333
+ study.trials_dataframe().to_csv(out_dir / "optuna_trials.csv", index=False)
334
+ best = study.best_trial
335
+ best_params = dict(best.params)
336
+
337
+ # refit longer
338
+ lr = float(best_params["lr"])
339
+ wd = float(best_params["weight_decay"])
340
+ dropout = float(best_params["dropout"])
341
+ hidden = int(best_params["hidden_dim"])
342
+ n_heads = int(best_params["n_heads"])
343
+ n_layers = int(best_params["n_layers"])
344
+ cls_w = float(best_params["cls_weight"])
345
+ batch = int(best_params["batch_size"])
346
+
347
+ loss_reg = nn.MSELoss()
348
+ loss_cls = nn.CrossEntropyLoss()
349
+
350
+ if mode == "pooled":
351
+ Ht = len(train_ds[0]["target_embedding"])
352
+ Hb = len(train_ds[0]["binder_embedding"])
353
+ model = CrossAttnPooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
354
+ collate = collate_pair_pooled
355
+ train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
356
+ val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, num_workers=4, pin_memory=True, collate_fn=collate)
357
+ eval_fn = eval_spearman_pooled
358
+ train_fn = train_one_epoch_pooled
359
+ else:
360
+ Ht = len(train_ds[0]["target_embedding"][0])
361
+ Hb = len(train_ds[0]["binder_embedding"][0])
362
+ model = CrossAttnUnpooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
363
+ collate = collate_pair_unpooled
364
+ train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
365
+ val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, num_workers=4, pin_memory=True, collate_fn=collate)
366
+ eval_fn = eval_spearman_unpooled
367
+ train_fn = train_one_epoch_unpooled
368
+
369
+ opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
370
+
371
+ best_rho = -1e9
372
+ bad = 0
373
+ patience = 20
374
+ best_state = None
375
+
376
+ for ep in range(1, 201):
377
+ train_fn(model, train_loader, opt, loss_reg, loss_cls, cls_w=cls_w)
378
+ rho = eval_fn(model, val_loader)
379
+
380
+ if rho > best_rho + 1e-6:
381
+ best_rho = rho
382
+ bad = 0
383
+ best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
384
+ else:
385
+ bad += 1
386
+ if bad >= patience:
387
+ break
388
+
389
+ if best_state is not None:
390
+ model.load_state_dict(best_state)
391
+
392
+ # save
393
+ torch.save({"mode": mode, "best_params": best_params, "state_dict": model.state_dict()}, out_dir / "best_model.pt")
394
+ with open(out_dir / "best_params.json", "w") as f:
395
+ json.dump(best_params, f, indent=2)
396
+
397
+ print(f"[DONE] {out_dir} | best_optuna_rho={study.best_value:.4f} | refit_best_rho={best_rho:.4f}")
398
+
399
+
400
+ if __name__ == "__main__":
401
+ import argparse
402
+ ap = argparse.ArgumentParser()
403
+ ap.add_argument("--dataset_path", type=str, required=True, help="Paired DatasetDict path (pair_*)")
404
+ ap.add_argument("--mode", type=str, choices=["pooled", "unpooled"], required=True)
405
+ ap.add_argument("--out_dir", type=str, required=True)
406
+ ap.add_argument("--n_trials", type=int, default=50)
407
+ args = ap.parse_args()
408
+
409
+ run(
410
+ dataset_path=args.dataset_path,
411
+ out_dir=args.out_dir,
412
+ mode=args.mode,
413
+ n_trials=args.n_trials,
414
+ )
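
A quick worked example of the class mapping implemented by affinity_to_class_tensor above (0=High, 1=Moderate, 2=Low); the input values are illustrative only:

import torch

y = torch.tensor([9.4, 8.0, 6.5, 9.0, 7.0])
high = y >= 9.0
low = y < 7.0
cls = torch.zeros_like(y, dtype=torch.long)
cls[~(high | low)] = 1   # 7 <= y < 9 -> Moderate
cls[low] = 2             # y < 7      -> Low
print(cls.tolist())      # [0, 1, 2, 0, 1]
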
training_classifiers/.ipynb_checkpoints/binding_wt-checkpoint.bash ADDED
@@ -0,0 +1,31 @@
1
+ #!/bin/bash
2
+ #SBATCH --job-name=b-data
3
+ #SBATCH --partition=dgx-b200
4
+ #SBATCH --gpus=1
5
+ #SBATCH --cpus-per-task=10
6
+ #SBATCH --mem=100G
7
+ #SBATCH --time=48:00:00
8
+ #SBATCH --output=%x_%j.out
9
+
10
+ HOME_LOC=/vast/projects/pranam/lab/yz927
11
+ SCRIPT_LOC=$HOME_LOC/projects/Classifier_Weight/training_classifiers
12
+ DATA_LOC=$HOME_LOC/projects/Classifier_Weight/training_data_cleaned
13
+ OBJECTIVE='binding_affinity'
14
+ WT='smiles' #wt/smiles
15
+ STATUS='pooled' #pooled/unpooled
16
+ DATA_FILE="pair_wt_${WT}_${STATUS}"
17
+ LOG_LOC=$SCRIPT_LOC
18
+ DATE=$(date +%m_%d)
19
+ SPECIAL_PREFIX="binding_affinity_data_generation"
20
+
21
+ # Create log directory if it doesn't exist
22
+ mkdir -p $LOG_LOC
23
+
24
+ cd $SCRIPT_LOC
25
+ source /vast/projects/pranam/lab/shared/miniconda3/etc/profile.d/conda.sh
26
+ conda activate /vast/projects/pranam/lab/shared/miniconda3/envs/metal
27
+
28
+ python -u binding_affinity_split.py > "${LOG_LOC}/${DATE}_${SPECIAL_PREFIX}.log" 2>&1
29
+
30
+ echo "Script completed at $(date)"
31
+ conda deactivate
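
The WT/STATUS knobs above select one of the four paired datasets written by binding_affinity_split.py; a small sketch of how the corresponding binding_training.py runs could be enumerated (output directory names are placeholders):

import itertools

for wt, status in itertools.product(["wt", "smiles"], ["pooled", "unpooled"]):
    data = f"pair_wt_{wt}_{status}"      # dataset name saved by binding_affinity_split.py
    out = f"wt_{wt}_{status}"            # placeholder out_dir
    print(f"python binding_training.py --dataset_path {data} --mode {status} --out_dir {out}")
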
training_classifiers/.ipynb_checkpoints/finetune_boost-checkpoint.py ADDED
@@ -0,0 +1,508 @@
1
+ #!/usr/bin/env python3
2
+ # finetune_xgb_halflife_cv_optuna.py
3
+
4
+ import os
5
+ import json
6
+ import math
7
+ import hashlib
8
+ from dataclasses import dataclass
9
+ from typing import Dict, Any, Optional, Tuple, List
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ import optuna
14
+
15
+ from sklearn.model_selection import KFold
16
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
17
+ from scipy.stats import spearmanr
18
+
19
+ import torch
20
+ from transformers import AutoTokenizer, AutoModel
21
+
22
+ import xgboost as xgb
23
+
24
+
25
+ # -----------------------------
26
+ # Repro
27
+ # -----------------------------
28
+ SEED = 1986
29
+ np.random.seed(SEED)
30
+ torch.manual_seed(SEED)
31
+
32
+
33
+ # -----------------------------
34
+ # Metrics (mirrors your stability script style)
35
+ # -----------------------------
36
+ def safe_spearmanr(y_true: np.ndarray, y_pred: np.ndarray) -> float:
37
+ rho = spearmanr(y_true, y_pred).correlation
38
+ if rho is None or np.isnan(rho):
39
+ return 0.0
40
+ return float(rho)
41
+
42
+ def eval_regression(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
43
+ rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
44
+ mae = float(mean_absolute_error(y_true, y_pred))
45
+ r2 = float(r2_score(y_true, y_pred))
46
+ rho = float(safe_spearmanr(y_true, y_pred))
47
+ return {"rmse": rmse, "mae": mae, "r2": r2, "spearman_rho": rho}
48
+
49
+
50
+ # -----------------------------
51
+ # ESM-2 embeddings (cached)
52
+ # -----------------------------
53
+ @dataclass
54
+ class ESMEmbedderConfig:
55
+ model_name: str = "facebook/esm2_t33_650M_UR50D"
56
+ batch_size: int = 8
57
+ max_length: int = 1024 # truncate very long proteins
58
+ fp16: bool = True
59
+
60
+ class ESM2Embedder:
61
+ """
62
+ Mean-pooled last hidden state (excluding special tokens) -> (H,) per sequence.
63
+ """
64
+ def __init__(self, cfg: ESMEmbedderConfig, device: str = "cuda"):
65
+ self.cfg = cfg
66
+ self.device = device if (device == "cuda" and torch.cuda.is_available()) else "cpu"
67
+ self.tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, do_lower_case=False)
68
+ self.model = AutoModel.from_pretrained(cfg.model_name)
69
+ self.model.eval()
70
+ self.model.to(self.device)
71
+
72
+ # Turn off gradients
73
+ for p in self.model.parameters():
74
+ p.requires_grad = False
75
+
76
+ @torch.inference_mode()
77
+ def embed(self, seqs: List[str]) -> np.ndarray:
78
+ out = []
79
+ bs = self.cfg.batch_size
80
+
81
+ use_amp = (self.cfg.fp16 and self.device == "cuda")
82
+ autocast = torch.cuda.amp.autocast if use_amp else torch.cpu.amp.autocast # safe fallback
83
+
84
+ for i in range(0, len(seqs), bs):
85
+ batch = [s.strip().upper() for s in seqs[i:i+bs]]
86
+ toks = self.tokenizer(
87
+ batch,
88
+ return_tensors="pt",
89
+ padding=True,
90
+ truncation=True,
91
+ max_length=self.cfg.max_length,
92
+ add_special_tokens=True,
93
+ )
94
+ toks = {k: v.to(self.device) for k, v in toks.items()}
95
+ attn = toks["attention_mask"] # (B, L)
96
+
97
+ with autocast(enabled=use_amp):
98
+ h = self.model(**toks).last_hidden_state # (B, L, H)
99
+
100
+ # mask out special tokens: first token is <cls>; last non-pad token is usually <eos>
101
+ mask = attn.clone()
102
+ mask[:, 0] = 0
103
+ lengths = attn.sum(dim=1) # includes special tokens
104
+ # zero out last real token position per sequence
105
+ eos_pos = (lengths - 1).clamp(min=0)
106
+ mask[torch.arange(mask.size(0), device=mask.device), eos_pos] = 0
107
+
108
+ denom = mask.sum(dim=1).clamp(min=1).unsqueeze(-1) # (B,1)
109
+ pooled = (h * mask.unsqueeze(-1)).sum(dim=1) / denom # (B,H)
110
+ out.append(pooled.float().detach().cpu().numpy())
111
+
112
+ return np.concatenate(out, axis=0).astype(np.float32)
113
+
114
+
115
+ def dataset_fingerprint(seqs: List[str], y: np.ndarray, extra: str = "") -> str:
116
+ h = hashlib.sha256()
117
+ for s in seqs:
118
+ h.update(s.encode("utf-8"))
119
+ h.update(b"\n")
120
+ h.update(np.asarray(y, dtype=np.float32).tobytes())
121
+ h.update(extra.encode("utf-8"))
122
+ return h.hexdigest()[:16]
123
+
124
+
125
+ def load_or_compute_embeddings(
126
+ df: pd.DataFrame,
127
+ out_dir: str,
128
+ embed_cfg: ESMEmbedderConfig,
129
+ device: str,
130
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
131
+ os.makedirs(out_dir, exist_ok=True)
132
+
133
+ seqs = df["sequence"].astype(str).tolist()
134
+ y = df["half_life_hours"].astype(float).to_numpy(dtype=np.float32)
135
+
136
+ fp = dataset_fingerprint(seqs, y, extra=f"{embed_cfg.model_name}|{embed_cfg.max_length}")
137
+ emb_path = os.path.join(out_dir, f"esm2_embeddings_{fp}.npy")
138
+ meta_path = os.path.join(out_dir, f"esm2_embeddings_{fp}.json")
139
+
140
+ if os.path.exists(emb_path) and os.path.exists(meta_path):
141
+ X = np.load(emb_path).astype(np.float32)
142
+ return X, y, np.asarray(seqs)
143
+
144
+ embedder = ESM2Embedder(embed_cfg, device=device)
145
+ X = embedder.embed(seqs) # (N,H)
146
+
147
+ np.save(emb_path, X)
148
+ with open(meta_path, "w") as f:
149
+ json.dump(
150
+ {
151
+ "fingerprint": fp,
152
+ "model_name": embed_cfg.model_name,
153
+ "max_length": embed_cfg.max_length,
154
+ "n": len(seqs),
155
+ "dim": int(X.shape[1]),
156
+ },
157
+ f,
158
+ indent=2,
159
+ )
160
+ return X, y, np.asarray(seqs)
161
+
162
+
163
+ # -----------------------------
164
+ # XGBoost training (supports "finetune" via xgb_model)
165
+ # -----------------------------
166
+ def train_xgb_reg(
167
+ X_train: np.ndarray,
168
+ y_train: np.ndarray,
169
+ X_val: np.ndarray,
170
+ y_val: np.ndarray,
171
+ params: Dict[str, Any],
172
+ base_model_json: Optional[str] = None,
173
+ ) -> Tuple[xgb.Booster, np.ndarray, np.ndarray, int]:
174
+ dtrain = xgb.DMatrix(X_train, label=y_train)
175
+ dval = xgb.DMatrix(X_val, label=y_val)
176
+
177
+ num_boost_round = int(params.pop("num_boost_round"))
178
+ early_stopping_rounds = int(params.pop("early_stopping_rounds"))
179
+
180
+ # Important: load a fresh base model each fold (avoid leakage)
181
+ xgb_model = None
182
+ if base_model_json is not None:
183
+ booster0 = xgb.Booster()
184
+ booster0.load_model(base_model_json)
185
+ xgb_model = booster0
186
+
187
+ booster = xgb.train(
188
+ params=params,
189
+ dtrain=dtrain,
190
+ num_boost_round=num_boost_round,
191
+ evals=[(dval, "val")],
192
+ early_stopping_rounds=early_stopping_rounds,
193
+ verbose_eval=False,
194
+ xgb_model=xgb_model, # <-- "finetune": continue boosting from base model
195
+ )
196
+
197
+ p_train = booster.predict(dtrain)
198
+ p_val = booster.predict(dval)
199
+ best_iter = int(getattr(booster, "best_iteration", num_boost_round - 1))
200
+ return booster, p_train, p_val, best_iter
201
+
202
+
203
+ # -----------------------------
204
+ # Optuna objective: 5-fold mean Spearman rho
205
+ # -----------------------------
206
+ def make_cv_objective(
207
+ X: np.ndarray,
208
+ y: np.ndarray,
209
+ n_splits: int,
210
+ device: str,
211
+ base_model_json: Optional[str],
212
+ target_transform: str,
213
+ ):
214
+ kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
215
+
216
+ # Optional target transform (sometimes helps with heavy-tailed half-life)
217
+ if target_transform == "log1p":
218
+ y_used = np.log1p(np.clip(y, a_min=0.0, a_max=None)).astype(np.float32)
219
+ elif target_transform == "none":
220
+ y_used = y.astype(np.float32)
221
+ else:
222
+ raise ValueError(f"Unknown target_transform: {target_transform}")
223
+
224
+ def objective(trial: optuna.Trial) -> float:
225
+ # Hyperparameter ranges patterned after the stability script
226
+ params = {
227
+ "objective": "reg:squarederror",
228
+ "eval_metric": "rmse",
229
+
230
+ "lambda": trial.suggest_float("lambda", 1e-10, 100.0, log=True),
231
+ "alpha": trial.suggest_float("alpha", 1e-10, 100.0, log=True),
232
+ "gamma": trial.suggest_float("gamma", 0.0, 10.0),
233
+
234
+ "max_depth": trial.suggest_int("max_depth", 2, 12),
235
+ "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 200.0, log=True),
236
+ "subsample": trial.suggest_float("subsample", 0.5, 1.0),
237
+ "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
238
+
239
+ "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),
240
+
241
+ "tree_method": "hist",
242
+ "device": "cuda" if (device == "cuda" and torch.cuda.is_available()) else "cpu",
243
+ }
244
+ params["num_boost_round"] = trial.suggest_int("num_boost_round", 30, 1500)
245
+ params["early_stopping_rounds"] = trial.suggest_int("early_stopping_rounds", 10, 150)
246
+
247
+ fold_metrics = []
248
+ fold_best_iters = []
249
+
250
+ for fold, (tr_idx, va_idx) in enumerate(kf.split(X), start=1):
251
+ Xtr, ytr = X[tr_idx], y_used[tr_idx]
252
+ Xva, yva = X[va_idx], y_used[va_idx]
253
+
254
+ _, _, p_va, best_iter = train_xgb_reg(
255
+ Xtr, ytr, Xva, yva, params.copy(),
256
+ base_model_json=base_model_json,
257
+ )
258
+
259
+ m = eval_regression(yva, p_va)
260
+ fold_metrics.append(m)
261
+ fold_best_iters.append(best_iter)
262
+
263
+ mean_rho = float(np.mean([m["spearman_rho"] for m in fold_metrics]))
264
+ mean_rmse = float(np.mean([m["rmse"] for m in fold_metrics]))
265
+ mean_mae = float(np.mean([m["mae"] for m in fold_metrics]))
266
+ mean_r2 = float(np.mean([m["r2"] for m in fold_metrics]))
267
+ mean_best_iter = float(np.mean(fold_best_iters))
268
+
269
+ trial.set_user_attr("cv_spearman_rho", mean_rho)
270
+ trial.set_user_attr("cv_rmse", mean_rmse)
271
+ trial.set_user_attr("cv_mae", mean_mae)
272
+ trial.set_user_attr("cv_r2", mean_r2)
273
+ trial.set_user_attr("cv_mean_best_iter", mean_best_iter)
274
+
275
+ # maximize Spearman rho (same objective as the stability workflow)
276
+ return mean_rho
277
+
278
+ return objective
279
+
280
+
281
+ def refit_and_save(
282
+ X: np.ndarray,
283
+ y: np.ndarray,
284
+ seqs: np.ndarray,
285
+ out_dir: str,
286
+ best_params: Dict[str, Any],
287
+ n_splits: int,
288
+ device: str,
289
+ base_model_json: Optional[str],
290
+ target_transform: str,
291
+ ):
292
+ os.makedirs(out_dir, exist_ok=True)
293
+
294
+ # Transform target consistently
295
+ if target_transform == "log1p":
296
+ y_used = np.log1p(np.clip(y, a_min=0.0, a_max=None)).astype(np.float32)
297
+ else:
298
+ y_used = y.astype(np.float32)
299
+
300
+ kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
301
+
302
+ # 1) get OOF preds + average best_iteration
303
+ oof_pred = np.zeros_like(y_used, dtype=np.float32)
304
+ best_iters = []
305
+ fold_rows = []
306
+
307
+ for fold, (tr_idx, va_idx) in enumerate(kf.split(X), start=1):
308
+ Xtr, ytr = X[tr_idx], y_used[tr_idx]
309
+ Xva, yva = X[va_idx], y_used[va_idx]
310
+
311
+ _, _, p_va, best_iter = train_xgb_reg(
312
+ Xtr, ytr, Xva, yva, best_params.copy(),
313
+ base_model_json=base_model_json,
314
+ )
315
+ oof_pred[va_idx] = p_va.astype(np.float32)
316
+ best_iters.append(best_iter)
317
+
318
+ m = eval_regression(yva, p_va)
319
+ fold_rows.append({"fold": fold, **m, "best_iter": int(best_iter)})
320
+
321
+ fold_df = pd.DataFrame(fold_rows)
322
+ fold_df.to_csv(os.path.join(out_dir, "cv_fold_metrics.csv"), index=False)
323
+
324
+ cv_metrics = eval_regression(y_used, oof_pred)
325
+ with open(os.path.join(out_dir, "cv_oof_summary.json"), "w") as f:
326
+ json.dump(cv_metrics, f, indent=2)
327
+
328
+ oof_df = pd.DataFrame({
329
+ "sequence": seqs,
330
+ "y_true_used": y_used.astype(float),
331
+ "y_pred_oof": oof_pred.astype(float),
332
+ "residual": (y_used - oof_pred).astype(float),
333
+ })
334
+ oof_df.to_csv(os.path.join(out_dir, "cv_oof_predictions.csv"), index=False)
335
+
336
+ mean_best_iter = int(round(float(np.mean(best_iters))))
337
+ final_rounds = max(mean_best_iter + 1, 10)
338
+
339
+ # 2) train final model on ALL data (no early stopping here; use final_rounds)
340
+ dtrain_all = xgb.DMatrix(X, label=y_used)
341
+
342
+ xgb_model = None
343
+ if base_model_json is not None:
344
+ booster0 = xgb.Booster()
345
+ booster0.load_model(base_model_json)
346
+ xgb_model = booster0
347
+
348
+ final_params = best_params.copy()
349
+ final_params.pop("early_stopping_rounds", None)
+ final_params.pop("num_boost_round", None) # refit uses final_rounds (CV-averaged best iteration) instead
350
+ final_params["device"] = "cuda" if (device == "cuda" and torch.cuda.is_available()) else "cpu"
351
+
352
+ booster = xgb.train(
353
+ params=final_params,
354
+ dtrain=dtrain_all,
355
+ num_boost_round=final_rounds, # CV-averaged best iteration computed above
356
+ evals=[],
357
+ verbose_eval=False,
358
+ xgb_model=xgb_model,
359
+ )
360
+
361
+ model_path = os.path.join(out_dir, "best_model_finetuned.json")
362
+ booster.save_model(model_path)
363
+
364
+ with open(os.path.join(out_dir, "final_training_notes.json"), "w") as f:
365
+ json.dump(
366
+ {
367
+ "target_transform": target_transform,
368
+ "final_rounds_used": int(final_rounds),
369
+ "cv_oof_metrics_on_used_target": cv_metrics,
370
+ "model_path": model_path,
371
+ },
372
+ f,
373
+ indent=2,
374
+ )
375
+
376
+ print("=" * 72)
377
+ print("[Final] CV OOF metrics (on transformed target if enabled):")
378
+ print(json.dumps(cv_metrics, indent=2))
379
+ print(f"[Final] Saved finetuned model -> {model_path}")
380
+ print("=" * 72)
381
+
382
+
383
+ def main():
384
+ import argparse
385
+
386
+ parser = argparse.ArgumentParser()
387
+ parser.add_argument("--csv_path", type=str, default="/scratch/pranamlab/tong/data/halflife/wt_halflife_merged_dedup.csv")
388
+ parser.add_argument("--out_dir", type=str, default="/scratch/pranamlab/tong/PeptiVerse/src/halflife/finetune_stability_xgb")
389
+
390
+ # If provided, we will "finetune" by continuing boosting from this model
391
+ parser.add_argument("--base_model_json", type=str, default='/scratch/pranamlab/tong/PeptiVerse/src/stability/xgboost/best_model.json', help="Path to an existing XGBoost .json model to continue training from")
392
+
393
+ # ESM embedding config
394
+ parser.add_argument("--esm_model", type=str, default="facebook/esm2_t33_650M_UR50D")
395
+ parser.add_argument("--esm_batch_size", type=int, default=8)
396
+ parser.add_argument("--esm_max_length", type=int, default=1024)
397
+ parser.add_argument("--no_fp16", action="store_true")
398
+
399
+ # Training config
400
+ parser.add_argument("--n_trials", type=int, default=200)
401
+ parser.add_argument("--n_splits", type=int, default=5)
402
+ parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"])
403
+ parser.add_argument("--target_transform", type=str, default="none", choices=["none", "log1p"])
404
+
405
+ args = parser.parse_args()
406
+ os.makedirs(args.out_dir, exist_ok=True)
407
+
408
+ # Load data
409
+ df = pd.read_csv(args.csv_path)
410
+ if "sequence" not in df.columns or "half_life_hours" not in df.columns:
411
+ raise ValueError("CSV must contain columns: sequence, half_life_hours")
412
+
413
+ df = df.dropna(subset=["sequence", "half_life_hours"]).copy()
414
+ df["sequence"] = df["sequence"].astype(str).str.strip()
415
+ df = df[df["sequence"].str.len() > 0]
416
+ df = df.drop_duplicates(subset=["sequence"], keep="first").reset_index(drop=True)
417
+
418
+ print(f"[Data] N={len(df)} from {args.csv_path}")
419
+
420
+ # Embeddings (cached)
421
+ embed_cfg = ESMEmbedderConfig(
422
+ model_name=args.esm_model,
423
+ batch_size=args.esm_batch_size,
424
+ max_length=args.esm_max_length,
425
+ fp16=(not args.no_fp16),
426
+ )
427
+ X, y, seqs = load_or_compute_embeddings(df, args.out_dir, embed_cfg, device=args.device)
428
+ print(f"[Embeddings] X={X.shape} (float32)")
429
+
430
+ # Optuna study
431
+ sampler = optuna.samplers.TPESampler(seed=SEED)
432
+ study = optuna.create_study(
433
+ direction="maximize", # maximize mean CV Spearman rho, as in the stability script
434
+ sampler=sampler,
435
+ pruner=optuna.pruners.MedianPruner(),
436
+ )
437
+
438
+ objective = make_cv_objective(
439
+ X=X,
440
+ y=y,
441
+ n_splits=args.n_splits,
442
+ device=args.device,
443
+ base_model_json=args.base_model_json,
444
+ target_transform=args.target_transform,
445
+ )
446
+ study.optimize(objective, n_trials=args.n_trials)
447
+
448
+ # Save trials
449
+ trials_df = study.trials_dataframe()
450
+ trials_df.to_csv(os.path.join(args.out_dir, "study_trials.csv"), index=False)
451
+
452
+ best = study.best_trial
453
+ best_params = dict(best.params)
454
+
455
+ # Build full param dict for refit
456
+ best_xgb_params = {
457
+ "objective": "reg:squarederror",
458
+ "eval_metric": "rmse",
459
+ "lambda": best_params["lambda"],
460
+ "alpha": best_params["alpha"],
461
+ "gamma": best_params["gamma"],
462
+ "max_depth": best_params["max_depth"],
463
+ "min_child_weight": best_params["min_child_weight"],
464
+ "subsample": best_params["subsample"],
465
+ "colsample_bytree": best_params["colsample_bytree"],
466
+ "learning_rate": best_params["learning_rate"],
467
+ "tree_method": "hist",
468
+ "device": "cuda" if (args.device == "cuda" and torch.cuda.is_available()) else "cpu",
469
+ "num_boost_round": best_params["num_boost_round"],
470
+ "early_stopping_rounds": best_params["early_stopping_rounds"],
471
+ }
472
+
473
+ # Summary
474
+ summary = {
475
+ "best_trial_number": int(best.number),
476
+ "best_value_cv_spearman_rho": float(best.value),
477
+ "best_user_attrs": best.user_attrs,
478
+ "best_params": best_params,
479
+ "best_xgb_params_full": best_xgb_params,
480
+ "base_model_json": args.base_model_json,
481
+ "target_transform": args.target_transform,
482
+ "esm_model": args.esm_model,
483
+ "esm_max_length": args.esm_max_length,
484
+ }
485
+ with open(os.path.join(args.out_dir, "optimization_summary.json"), "w") as f:
486
+ json.dump(summary, f, indent=2)
487
+
488
+ print("=" * 72)
489
+ print("[Optuna] Best CV Spearman rho:", float(best.value))
490
+ print("[Optuna] Best params:\n", json.dumps(best_params, indent=2))
491
+ print("=" * 72)
492
+
493
+ # Refit + save final finetuned model + OOF predictions
494
+ refit_and_save(
495
+ X=X,
496
+ y=y,
497
+ seqs=seqs,
498
+ out_dir=args.out_dir,
499
+ best_params=best_xgb_params,
500
+ n_splits=args.n_splits,
501
+ device=args.device,
502
+ base_model_json=args.base_model_json,
503
+ target_transform=args.target_transform,
504
+ )
505
+
506
+
507
+ if __name__ == "__main__":
508
+ main()
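
When --target_transform log1p is used, the saved model predicts on the transformed scale; a minimal sketch of mapping predictions back to hours (the model path and input array are placeholders):

import numpy as np
import xgboost as xgb

booster = xgb.Booster()
booster.load_model("finetune_stability_xgb/best_model_finetuned.json")  # placeholder out_dir

X_new = np.random.rand(4, 1280).astype(np.float32)   # stand-in for ESM-2 650M mean-pooled embeddings (dim 1280)
pred = booster.predict(xgb.DMatrix(X_new))            # on the log1p(half_life_hours) scale under this transform
half_life_hours = np.expm1(pred)                      # inverse of the np.log1p applied during training
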
training_classifiers/.ipynb_checkpoints/generate_binding_val-checkpoint.py ADDED
@@ -0,0 +1,309 @@
1
+ #!/usr/bin/env python3
2
+ # export_val_preds_csv.py
3
+
4
+ import argparse
5
+ from pathlib import Path
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.utils.data import DataLoader
10
+ from datasets import load_from_disk, DatasetDict
11
+
12
+ # -----------------------------
13
+ # Repro / device
14
+ # -----------------------------
15
+ def seed_all(seed=1986):
16
+ import random
17
+ random.seed(seed)
18
+ np.random.seed(seed)
19
+ torch.manual_seed(seed)
20
+ torch.cuda.manual_seed_all(seed)
21
+
22
+ seed_all(1986)
23
+ DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
24
+
25
+
26
+ # -----------------------------
27
+ # Load paired DatasetDict
28
+ # -----------------------------
29
+ def load_split_paired(path: str):
30
+ dd = load_from_disk(path)
31
+ if not isinstance(dd, DatasetDict):
32
+ raise ValueError(f"Expected DatasetDict at {path}")
33
+ if "train" not in dd or "val" not in dd:
34
+ raise ValueError(f"DatasetDict missing train/val at {path}")
35
+ return dd["train"], dd["val"]
36
+
37
+
38
+ # -----------------------------
39
+ # Collate fns (same as yours)
40
+ # -----------------------------
41
+ def collate_pair_pooled(batch):
42
+ Pt = torch.tensor([x["target_embedding"] for x in batch], dtype=torch.float32)
43
+ Pb = torch.tensor([x["binder_embedding"] for x in batch], dtype=torch.float32)
44
+ y = torch.tensor([float(x["label"]) for x in batch], dtype=torch.float32)
45
+ return Pt, Pb, y
46
+
47
+ def collate_pair_unpooled(batch):
48
+ B = len(batch)
49
+ Ht = len(batch[0]["target_embedding"][0])
50
+ Hb = len(batch[0]["binder_embedding"][0])
51
+ Lt_max = max(int(x["target_length"]) for x in batch)
52
+ Lb_max = max(int(x["binder_length"]) for x in batch)
53
+
54
+ Pt = torch.zeros(B, Lt_max, Ht, dtype=torch.float32)
55
+ Pb = torch.zeros(B, Lb_max, Hb, dtype=torch.float32)
56
+ Mt = torch.zeros(B, Lt_max, dtype=torch.bool)
57
+ Mb = torch.zeros(B, Lb_max, dtype=torch.bool)
58
+ y = torch.tensor([float(x["label"]) for x in batch], dtype=torch.float32)
59
+
60
+ for i, x in enumerate(batch):
61
+ t = torch.tensor(x["target_embedding"], dtype=torch.float32)
62
+ b = torch.tensor(x["binder_embedding"], dtype=torch.float32)
63
+ lt, lb = t.shape[0], b.shape[0]
64
+ Pt[i, :lt] = t
65
+ Pb[i, :lb] = b
66
+ Mt[i, :lt] = torch.tensor(x["target_attention_mask"][:lt], dtype=torch.bool)
67
+ Mb[i, :lb] = torch.tensor(x["binder_attention_mask"][:lb], dtype=torch.bool)
68
+
69
+ return Pt, Mt, Pb, Mb, y
70
+
71
+
72
+ # -----------------------------
73
+ # Models (same as yours)
74
+ # -----------------------------
75
+ class CrossAttnPooled(nn.Module):
76
+ def __init__(self, Ht, Hb, hidden=512, n_heads=8, n_layers=3, dropout=0.1):
77
+ super().__init__()
78
+ self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
79
+ self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
80
+
81
+ self.layers = nn.ModuleList([])
82
+ for _ in range(n_layers):
83
+ self.layers.append(nn.ModuleDict({
84
+ "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
85
+ "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
86
+ "n1t": nn.LayerNorm(hidden),
87
+ "n2t": nn.LayerNorm(hidden),
88
+ "n1b": nn.LayerNorm(hidden),
89
+ "n2b": nn.LayerNorm(hidden),
90
+ "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
91
+ "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
92
+ }))
93
+
94
+ self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
95
+ self.reg = nn.Linear(hidden, 1)
96
+ self.cls = nn.Linear(hidden, 3)
97
+
98
+ def forward(self, t_vec, b_vec):
99
+ t = self.t_proj(t_vec).unsqueeze(0) # (1,B,H)
100
+ b = self.b_proj(b_vec).unsqueeze(0) # (1,B,H)
101
+
102
+ for L in self.layers:
103
+ t_attn, _ = L["attn_tb"](t, b, b)
104
+ t = L["n1t"]((t + t_attn).transpose(0,1)).transpose(0,1)
105
+ t = L["n2t"]((t + L["fft"](t)).transpose(0,1)).transpose(0,1)
106
+
107
+ b_attn, _ = L["attn_bt"](b, t, t)
108
+ b = L["n1b"]((b + b_attn).transpose(0,1)).transpose(0,1)
109
+ b = L["n2b"]((b + L["ffb"](b)).transpose(0,1)).transpose(0,1)
110
+
111
+ z = torch.cat([t[0], b[0]], dim=-1)
112
+ h = self.shared(z)
113
+ return self.reg(h).squeeze(-1), self.cls(h)
114
+
115
+
116
+ class CrossAttnUnpooled(nn.Module):
117
+ def __init__(self, Ht, Hb, hidden=512, n_heads=8, n_layers=3, dropout=0.1):
118
+ super().__init__()
119
+ self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
120
+ self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
121
+
122
+ self.layers = nn.ModuleList([])
123
+ for _ in range(n_layers):
124
+ self.layers.append(nn.ModuleDict({
125
+ "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
126
+ "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
127
+ "n1t": nn.LayerNorm(hidden),
128
+ "n2t": nn.LayerNorm(hidden),
129
+ "n1b": nn.LayerNorm(hidden),
130
+ "n2b": nn.LayerNorm(hidden),
131
+ "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
132
+ "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
133
+ }))
134
+
135
+ self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
136
+ self.reg = nn.Linear(hidden, 1)
137
+ self.cls = nn.Linear(hidden, 3)
138
+
139
+ def masked_mean(self, X, M):
140
+ Mf = M.unsqueeze(-1).float()
141
+ denom = Mf.sum(dim=1).clamp(min=1.0)
142
+ return (X * Mf).sum(dim=1) / denom
143
+
144
+ def forward(self, T, Mt, B, Mb):
145
+ T = self.t_proj(T)
146
+ Bx = self.b_proj(B)
147
+
148
+ kp_t = ~Mt
149
+ kp_b = ~Mb
150
+
151
+ for L in self.layers:
152
+ T_attn, _ = L["attn_tb"](T, Bx, Bx, key_padding_mask=kp_b)
153
+ T = L["n1t"](T + T_attn)
154
+ T = L["n2t"](T + L["fft"](T))
155
+
156
+ B_attn, _ = L["attn_bt"](Bx, T, T, key_padding_mask=kp_t)
157
+ Bx = L["n1b"](Bx + B_attn)
158
+ Bx = L["n2b"](Bx + L["ffb"](Bx))
159
+
160
+ t_pool = self.masked_mean(T, Mt)
161
+ b_pool = self.masked_mean(Bx, Mb)
162
+ z = torch.cat([t_pool, b_pool], dim=-1)
163
+ h = self.shared(z)
164
+ return self.reg(h).squeeze(-1), self.cls(h)
165
+
166
+
167
+ # -----------------------------
168
+ # Helpers
169
+ # -----------------------------
170
+ def softmax_np(logits: np.ndarray) -> np.ndarray:
171
+ x = logits - logits.max(axis=1, keepdims=True)
172
+ ex = np.exp(x)
173
+ return ex / ex.sum(axis=1, keepdims=True)
174
+
175
+ def expected_score_from_probs(probs: np.ndarray, class_centers=(9.5, 8.0, 6.0)) -> np.ndarray:
176
+ centers = np.asarray(class_centers, dtype=np.float32)[None, :] # (1,3)
177
+ return (probs * centers).sum(axis=1)
178
+
179
+ def load_checkpoint(ckpt_path: str, mode: str, train_ds):
180
+ ckpt = torch.load(ckpt_path, map_location="cpu")
181
+ params = ckpt.get("best_params", {})
182
+
183
+ hidden = int(params.get("hidden_dim", 512))
184
+ n_heads = int(params.get("n_heads", 8))
185
+ n_layers = int(params.get("n_layers", 3))
186
+ dropout = float(params.get("dropout", 0.1))
187
+
188
+ if mode == "pooled":
189
+ Ht = len(train_ds[0]["target_embedding"])
190
+ Hb = len(train_ds[0]["binder_embedding"])
191
+ model = CrossAttnPooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout)
192
+ else:
193
+ Ht = len(train_ds[0]["target_embedding"][0])
194
+ Hb = len(train_ds[0]["binder_embedding"][0])
195
+ model = CrossAttnUnpooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout)
196
+
197
+ model.load_state_dict(ckpt["state_dict"], strict=True)
198
+ model.to(DEVICE).eval()
199
+ return model
200
+
201
+
202
+ @torch.no_grad()
203
+ def export_val_preds_csv(dataset_path: str, ckpt_path: str, mode: str,
204
+ out_csv: str, batch_size: int, num_workers: int,
205
+ class_centers=(9.5, 8.0, 6.0)):
206
+ train_ds, val_ds = load_split_paired(dataset_path)
207
+ model = load_checkpoint(ckpt_path, mode, train_ds)
208
+
209
+ if mode == "pooled":
210
+ loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False,
211
+ num_workers=num_workers, pin_memory=True,
212
+ collate_fn=collate_pair_pooled)
213
+ y_all, pred_reg_all, logits_all = [], [], []
214
+ for t, b, y in loader:
215
+ t = t.to(DEVICE, non_blocking=True)
216
+ b = b.to(DEVICE, non_blocking=True)
217
+ pred_reg, logits = model(t, b)
218
+ y_all.append(y.numpy())
219
+ pred_reg_all.append(pred_reg.detach().cpu().numpy())
220
+ logits_all.append(logits.detach().cpu().numpy())
221
+
222
+ else:
223
+ loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False,
224
+ num_workers=num_workers, pin_memory=True,
225
+ collate_fn=collate_pair_unpooled)
226
+ y_all, pred_reg_all, logits_all = [], [], []
227
+ for T, Mt, B, Mb, y in loader:
228
+ T = T.to(DEVICE, non_blocking=True)
229
+ Mt = Mt.to(DEVICE, non_blocking=True)
230
+ B = B.to(DEVICE, non_blocking=True)
231
+ Mb = Mb.to(DEVICE, non_blocking=True)
232
+ pred_reg, logits = model(T, Mt, B, Mb)
233
+ y_all.append(y.numpy())
234
+ pred_reg_all.append(pred_reg.detach().cpu().numpy())
235
+ logits_all.append(logits.detach().cpu().numpy())
236
+
237
+ y_true = np.concatenate(y_all)
238
+ y_pred_reg = np.concatenate(pred_reg_all)
239
+ logits = np.concatenate(logits_all)
240
+
241
+ probs = softmax_np(logits) # (N,3)
242
+ y_pred_cls_score = expected_score_from_probs(probs, class_centers=class_centers)
243
+
244
+ # Build CSV rows
245
+ out = Path(out_csv)
246
+ out.parent.mkdir(parents=True, exist_ok=True)
247
+
248
+ header = [
249
+ "split", "mode",
250
+ "y_true",
251
+ "y_pred_reg",
252
+ "p_high", "p_moderate", "p_low",
253
+ "y_pred_cls_score",
254
+ "center_high", "center_moderate", "center_low",
255
+ ]
256
+
257
+ centers = list(class_centers)
258
+ rows = np.column_stack([
259
+ y_true,
260
+ y_pred_reg,
261
+ probs[:, 0], probs[:, 1], probs[:, 2],
262
+ y_pred_cls_score,
263
+ np.full_like(y_true, centers[0], dtype=np.float32),
264
+ np.full_like(y_true, centers[1], dtype=np.float32),
265
+ np.full_like(y_true, centers[2], dtype=np.float32),
266
+ ])
267
+
268
+ with out.open("w") as f:
269
+ f.write(",".join(header) + "\n")
270
+ for i in range(rows.shape[0]):
271
+ f.write(
272
+ "val," + mode + "," +
273
+ ",".join(f"{rows[i, j]:.8f}" for j in range(rows.shape[1])) +
274
+ "\n"
275
+ )
276
+
277
+ print(f"[Data] Val N={len(y_true)} | mode={mode}")
278
+ print(f"[Saved] {out}")
279
+
280
+
281
+ def main():
282
+ ap = argparse.ArgumentParser()
283
+ ap.add_argument("--dataset_path", required=True, help="Paired DatasetDict path (pair_*)")
284
+ ap.add_argument("--ckpt", required=True, help="Path to best_model.pt")
285
+ ap.add_argument("--mode", choices=["pooled", "unpooled"], required=True)
286
+ ap.add_argument("--out_csv", required=True)
287
+ ap.add_argument("--batch_size", type=int, default=128)
288
+ ap.add_argument("--num_workers", type=int, default=4)
289
+
290
+ # Optional: choose class-centers for expected-score conversion
291
+ ap.add_argument("--center_high", type=float, default=9.5)
292
+ ap.add_argument("--center_moderate", type=float, default=8.0)
293
+ ap.add_argument("--center_low", type=float, default=6.0)
294
+
295
+ args = ap.parse_args()
296
+
297
+ export_val_preds_csv(
298
+ dataset_path=args.dataset_path,
299
+ ckpt_path=args.ckpt,
300
+ mode=args.mode,
301
+ out_csv=args.out_csv,
302
+ batch_size=args.batch_size,
303
+ num_workers=args.num_workers,
304
+ class_centers=(args.center_high, args.center_moderate, args.center_low),
305
+ )
306
+
307
+
308
+ if __name__ == "__main__":
309
+ main()
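
For clarity, a worked example of the expected-score conversion used in export_val_preds_csv with the default class centers; the probabilities are illustrative:

import numpy as np

probs = np.array([[0.6, 0.3, 0.1]])   # p_high, p_moderate, p_low
centers = np.array([9.5, 8.0, 6.0])   # defaults for --center_high / --center_moderate / --center_low
expected = (probs * centers).sum(axis=1)
print(expected)                        # 0.6*9.5 + 0.3*8.0 + 0.1*6.0 = 8.7
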
training_classifiers/.ipynb_checkpoints/peptiverse_filelist-checkpoint.txt ADDED
@@ -0,0 +1,234 @@
1
+ ./hemolysis/cnn_smiles/optimization_summary.txt
2
+ ./hemolysis/cnn_smiles/pr_curve.png
3
+ ./hemolysis/cnn_smiles/roc_curve.png
4
+ ./hemolysis/cnn_smiles/study_trials.csv
5
+ ./hemolysis/cnn_smiles/train_predictions.csv
6
+ ./hemolysis/cnn_smiles/val_predictions.csv
7
+ ./hemolysis/cnn_wt/optimization_summary.txt
8
+ ./hemolysis/cnn_wt/pr_curve.png
9
+ ./hemolysis/cnn_wt/roc_curve.png
10
+ ./hemolysis/cnn_wt/study_trials.csv
11
+ ./hemolysis/cnn_wt/train_predictions.csv
12
+ ./hemolysis/cnn_wt/val_predictions.csv
13
+ ./hemolysis/enet_gpu/optimization_summary.txt
14
+ ./hemolysis/enet_gpu/pr_curve.png
15
+ ./hemolysis/enet_gpu/roc_curve.png
16
+ ./hemolysis/enet_gpu/study_trials.csv
17
+ ./hemolysis/enet_gpu/train_predictions.csv
18
+ ./hemolysis/enet_gpu/val_predictions.csv
19
+ ./hemolysis/enet_gpu_smiles/optimization_summary.txt
20
+ ./hemolysis/enet_gpu_smiles/pr_curve.png
21
+ ./hemolysis/enet_gpu_smiles/roc_curve.png
22
+ ./hemolysis/enet_gpu_smiles/study_trials.csv
23
+ ./hemolysis/enet_gpu_smiles/train_predictions.csv
24
+ ./hemolysis/enet_gpu_smiles/val_predictions.csv
25
+ ./hemolysis/enet_gpu_wt/optimization_summary.txt
26
+ ./hemolysis/enet_gpu_wt/pr_curve.png
27
+ ./hemolysis/enet_gpu_wt/roc_curve.png
28
+ ./hemolysis/enet_gpu_wt/study_trials.csv
29
+ ./hemolysis/enet_gpu_wt/train_predictions.csv
30
+ ./hemolysis/enet_gpu_wt/val_predictions.csv
31
+ ./hemolysis/mlp_smiles/optimization_summary.txt
32
+ ./hemolysis/mlp_smiles/pr_curve.png
33
+ ./hemolysis/mlp_smiles/roc_curve.png
34
+ ./hemolysis/mlp_smiles/study_trials.csv
35
+ ./hemolysis/mlp_smiles/train_predictions.csv
36
+ ./hemolysis/mlp_smiles/val_predictions.csv
37
+ ./hemolysis/mlp_wt/optimization_summary.txt
38
+ ./hemolysis/mlp_wt/pr_curve.png
39
+ ./hemolysis/mlp_wt/roc_curve.png
40
+ ./hemolysis/mlp_wt/study_trials.csv
41
+ ./hemolysis/mlp_wt/train_predictions.csv
42
+ ./hemolysis/mlp_wt/val_predictions.csv
43
+ ./hemolysis/svm_gpu_wt/optimization_summary.txt
44
+ ./hemolysis/svm_gpu_wt/pr_curve.png
45
+ ./hemolysis/svm_gpu_wt/roc_curve.png
46
+ ./hemolysis/svm_gpu_wt/study_trials.csv
47
+ ./hemolysis/svm_gpu_wt/train_predictions.csv
48
+ ./hemolysis/svm_gpu_wt/val_predictions.csv
49
+ ./hemolysis/transformer_smiles/optimization_summary.txt
50
+ ./hemolysis/transformer_smiles/pr_curve.png
51
+ ./hemolysis/transformer_smiles/roc_curve.png
52
+ ./hemolysis/transformer_smiles/study_trials.csv
53
+ ./hemolysis/transformer_smiles/train_predictions.csv
54
+ ./hemolysis/transformer_smiles/val_predictions.csv
55
+ ./hemolysis/transformer_wt/optimization_summary.txt
56
+ ./hemolysis/transformer_wt/pr_curve.png
57
+ ./hemolysis/transformer_wt/roc_curve.png
58
+ ./hemolysis/transformer_wt/study_trials.csv
59
+ ./hemolysis/transformer_wt/train_predictions.csv
60
+ ./hemolysis/transformer_wt/val_predictions.csv
61
+ ./hemolysis/xgb/optimization_summary.txt
62
+ ./hemolysis/xgb/pr_curve.png
63
+ ./hemolysis/xgb/roc_curve.png
64
+ ./hemolysis/xgb/study_trials.csv
65
+ ./hemolysis/xgb/train_predictions.csv
66
+ ./hemolysis/xgb/val_predictions.csv
67
+ ./hemolysis/xgb_smiles/optimization_summary.txt
68
+ ./hemolysis/xgb_smiles/pr_curve.png
69
+ ./hemolysis/xgb_smiles/roc_curve.png
70
+ ./hemolysis/xgb_smiles/study_trials.csv
71
+ ./hemolysis/xgb_smiles/train_predictions.csv
72
+ ./hemolysis/xgb_smiles/val_predictions.csv
73
+ ./hemolysis/xgb_wt/optimization_summary.txt
74
+ ./hemolysis/xgb_wt/pr_curve.png
75
+ ./hemolysis/xgb_wt/roc_curve.png
76
+ ./hemolysis/xgb_wt/study_trials.csv
77
+ ./hemolysis/xgb_wt/train_predictions.csv
78
+ ./hemolysis/xgb_wt/val_predictions.csv
79
+ ./nf/cnn/optimization_summary.txt
80
+ ./nf/cnn/pr_curve.png
81
+ ./nf/cnn/roc_curve.png
82
+ ./nf/cnn/study_trials.csv
83
+ ./nf/cnn/train_predictions.csv
84
+ ./nf/cnn/val_predictions.csv
85
+ ./nf/cnn_wt/optimization_summary.txt
86
+ ./nf/cnn_wt/pr_curve.png
87
+ ./nf/cnn_wt/roc_curve.png
88
+ ./nf/cnn_wt/study_trials.csv
89
+ ./nf/cnn_wt/train_predictions.csv
90
+ ./nf/cnn_wt/val_predictions.csv
91
+ ./nf/enet_gpu/optimization_summary.txt
92
+ ./nf/enet_gpu/pr_curve.png
93
+ ./nf/enet_gpu/roc_curve.png
94
+ ./nf/enet_gpu/study_trials.csv
95
+ ./nf/enet_gpu/train_predictions.csv
96
+ ./nf/enet_gpu/val_predictions.csv
97
+ ./nf/enet_gpu_smiles/optimization_summary.txt
98
+ ./nf/enet_gpu_smiles/pr_curve.png
99
+ ./nf/enet_gpu_smiles/roc_curve.png
100
+ ./nf/enet_gpu_smiles/study_trials.csv
101
+ ./nf/enet_gpu_smiles/train_predictions.csv
102
+ ./nf/enet_gpu_smiles/val_predictions.csv
103
+ ./nf/enet_gpu_wt/optimization_summary.txt
104
+ ./nf/enet_gpu_wt/pr_curve.png
105
+ ./nf/enet_gpu_wt/roc_curve.png
106
+ ./nf/enet_gpu_wt/study_trials.csv
107
+ ./nf/enet_gpu_wt/train_predictions.csv
108
+ ./nf/enet_gpu_wt/val_predictions.csv
109
+ ./nf/mlp/optimization_summary.txt
110
+ ./nf/mlp/pr_curve.png
111
+ ./nf/mlp/roc_curve.png
112
+ ./nf/mlp/study_trials.csv
113
+ ./nf/mlp/train_predictions.csv
114
+ ./nf/mlp/val_predictions.csv
115
+ ./nf/mlp_wt/optimization_summary.txt
116
+ ./nf/mlp_wt/pr_curve.png
117
+ ./nf/mlp_wt/roc_curve.png
118
+ ./nf/mlp_wt/study_trials.csv
119
+ ./nf/mlp_wt/train_predictions.csv
120
+ ./nf/mlp_wt/val_predictions.csv
121
+ ./nf/svm_gpu/optimization_summary.txt
122
+ ./nf/svm_gpu/pr_curve.png
123
+ ./nf/svm_gpu/roc_curve.png
124
+ ./nf/svm_gpu/study_trials.csv
125
+ ./nf/svm_gpu/train_predictions.csv
126
+ ./nf/svm_gpu/val_predictions.csv
127
+ ./nf/svm_gpu_wt/optimization_summary.txt
128
+ ./nf/svm_gpu_wt/pr_curve.png
129
+ ./nf/svm_gpu_wt/roc_curve.png
130
+ ./nf/svm_gpu_wt/study_trials.csv
131
+ ./nf/svm_gpu_wt/train_predictions.csv
132
+ ./nf/svm_gpu_wt/val_predictions.csv
133
+ ./nf/transformer/optimization_summary.txt
134
+ ./nf/transformer/pr_curve.png
135
+ ./nf/transformer/roc_curve.png
136
+ ./nf/transformer/study_trials.csv
137
+ ./nf/transformer/train_predictions.csv
138
+ ./nf/transformer/val_predictions.csv
139
+ ./nf/transformer_wt/optimization_summary.txt
140
+ ./nf/transformer_wt/pr_curve.png
141
+ ./nf/transformer_wt/roc_curve.png
142
+ ./nf/transformer_wt/study_trials.csv
143
+ ./nf/transformer_wt/train_predictions.csv
144
+ ./nf/transformer_wt/val_predictions.csv
145
+ ./nf/xgb_wt/optimization_summary.txt
146
+ ./nf/xgb_wt/pr_curve.png
147
+ ./nf/xgb_wt/roc_curve.png
148
+ ./nf/xgb_wt/study_trials.csv
149
+ ./nf/xgb_wt/train_predictions.csv
150
+ ./nf/xgb_wt/val_predictions.csv
151
+ ./permeability_caco2/cnn_smiles/optimization_summary.txt
152
+ ./permeability_caco2/cnn_smiles/study_trials.csv
153
+ ./permeability_caco2/cnn_smiles/train_predictions.csv
154
+ ./permeability_caco2/cnn_smiles/val_predictions.csv
155
+ ./permeability_caco2/enet_gpu_smiles/optimization_summary.txt
156
+ ./permeability_caco2/enet_gpu_smiles/study_trials.csv
157
+ ./permeability_caco2/enet_gpu_smiles/train_predictions.csv
158
+ ./permeability_caco2/enet_gpu_smiles/val_predictions.csv
159
+ ./permeability_caco2/mlp_smiles/optimization_summary.txt
160
+ ./permeability_caco2/mlp_smiles/study_trials.csv
161
+ ./permeability_caco2/mlp_smiles/train_predictions.csv
162
+ ./permeability_caco2/mlp_smiles/val_predictions.csv
163
+ ./permeability_caco2/svr_smiles/optimization_summary.txt
164
+ ./permeability_caco2/svr_smiles/study_trials.csv
165
+ ./permeability_caco2/svr_smiles/train_predictions.csv
166
+ ./permeability_caco2/svr_smiles/val_predictions.csv
167
+ ./permeability_caco2/transformer_smiles/optimization_summary.txt
168
+ ./permeability_caco2/transformer_smiles/study_trials.csv
169
+ ./permeability_caco2/transformer_smiles/train_predictions.csv
170
+ ./permeability_caco2/transformer_smiles/val_predictions.csv
171
+ ./permeability_caco2/xgb_reg_smiles/optimization_summary.txt
172
+ ./permeability_caco2/xgb_reg_smiles/study_trials.csv
173
+ ./permeability_caco2/xgb_reg_smiles/train_predictions.csv
174
+ ./permeability_caco2/xgb_reg_smiles/val_predictions.csv
175
+ ./permeability_pampa/cnn_smiles/optimization_summary.txt
176
+ ./permeability_pampa/cnn_smiles/study_trials.csv
177
+ ./permeability_pampa/cnn_smiles/train_predictions.csv
178
+ ./permeability_pampa/cnn_smiles/val_predictions.csv
179
+ ./permeability_pampa/enet_gpu_smiles/optimization_summary.txt
180
+ ./permeability_pampa/enet_gpu_smiles/study_trials.csv
181
+ ./permeability_pampa/enet_gpu_smiles/train_predictions.csv
182
+ ./permeability_pampa/enet_gpu_smiles/val_predictions.csv
183
+ ./permeability_pampa/mlp_smiles/optimization_summary.txt
184
+ ./permeability_pampa/mlp_smiles/study_trials.csv
185
+ ./permeability_pampa/mlp_smiles/train_predictions.csv
186
+ ./permeability_pampa/mlp_smiles/val_predictions.csv
187
+ ./permeability_pampa/transformer_smiles/optimization_summary.txt
188
+ ./permeability_pampa/transformer_smiles/study_trials.csv
189
+ ./permeability_pampa/transformer_smiles/train_predictions.csv
190
+ ./permeability_pampa/transformer_smiles/val_predictions.csv
191
+ ./permeability_pampa/xgb_reg_smiles/optimization_summary.txt
192
+ ./permeability_pampa/xgb_reg_smiles/study_trials.csv
193
+ ./permeability_pampa/xgb_reg_smiles/train_predictions.csv
194
+ ./permeability_pampa/xgb_reg_smiles/val_predictions.csv
195
+ ./solubility/cnn_wt/optimization_summary.txt
196
+ ./solubility/cnn_wt/pr_curve.png
197
+ ./solubility/cnn_wt/roc_curve.png
198
+ ./solubility/cnn_wt/study_trials.csv
199
+ ./solubility/cnn_wt/train_predictions.csv
200
+ ./solubility/cnn_wt/val_predictions.csv
201
+ ./solubility/enet_gpu/optimization_summary.txt
202
+ ./solubility/enet_gpu/pr_curve.png
203
+ ./solubility/enet_gpu/roc_curve.png
204
+ ./solubility/enet_gpu/study_trials.csv
205
+ ./solubility/enet_gpu/train_predictions.csv
206
+ ./solubility/enet_gpu/val_predictions.csv
207
+ ./solubility/mlp_wt/optimization_summary.txt
208
+ ./solubility/mlp_wt/pr_curve.png
209
+ ./solubility/mlp_wt/roc_curve.png
210
+ ./solubility/mlp_wt/study_trials.csv
211
+ ./solubility/mlp_wt/train_predictions.csv
212
+ ./solubility/mlp_wt/val_predictions.csv
213
+ ./solubility/svm_gpu/optimization_summary.txt
214
+ ./solubility/svm_gpu/pr_curve.png
215
+ ./solubility/svm_gpu/roc_curve.png
216
+ ./solubility/svm_gpu/study_trials.csv
217
+ ./solubility/svm_gpu/train_predictions.csv
218
+ ./solubility/svm_gpu/val_predictions.csv
219
+ ./solubility/transformer_wt/optimization_summary.txt
220
+ ./solubility/transformer_wt/pr_curve.png
221
+ ./solubility/transformer_wt/roc_curve.png
222
+ ./solubility/transformer_wt/study_trials.csv
223
+ ./solubility/transformer_wt/train_predictions.csv
224
+ ./solubility/transformer_wt/val_predictions.csv
225
+ ./solubility/xgb/optimization_summary.txt
226
+ ./solubility/xgb/pr_curve.png
227
+ ./solubility/xgb/roc_curve.png
228
+ ./solubility/xgb/study_trials.csv
229
+ ./solubility/xgb/train_predictions.csv
230
+ ./solubility/xgb/val_predictions.csv
231
+ ./binding_affinity/wt_wt_pooled/optuna_trials.csv
232
+ ./binding_affinity/wt_smiles_pooled/optuna_trials.csv
233
+ ./binding_affinity/wt_smiles_unpooled/optuna_trials.csv
234
+ ./binding_affinity/wt_wt_unpooled/optuna_trials.csv
training_classifiers/.ipynb_checkpoints/train_boost-checkpoint.py ADDED
@@ -0,0 +1,417 @@
1
+ import os
2
+ import json
3
+ import joblib
4
+ import optuna
5
+ import numpy as np
6
+ import pandas as pd
7
+ import matplotlib.pyplot as plt
8
+
9
+ from dataclasses import dataclass
10
+ from typing import Dict, Any, Tuple, Optional
11
+
12
+ from datasets import load_from_disk, DatasetDict
13
+ from sklearn.metrics import (
14
+ f1_score, roc_auc_score, average_precision_score,
15
+ precision_recall_curve, roc_curve
16
+ )
17
+ from sklearn.linear_model import LogisticRegression
18
+ from sklearn.ensemble import AdaBoostClassifier
19
+ from sklearn.tree import DecisionTreeClassifier
20
+ from linearboost import LinearBoostClassifier
21
+
22
+ import xgboost as xgb
23
+ from lightning.pytorch import seed_everything
24
+
25
+ seed_everything(1986)
26
+
27
+ # -----------------------------
28
+ # Data loading
29
+ # -----------------------------
30
+ @dataclass
31
+ class SplitData:
32
+ X_train: np.ndarray
33
+ y_train: np.ndarray
34
+ seq_train: Optional[np.ndarray]
35
+ X_val: np.ndarray
36
+ y_val: np.ndarray
37
+ seq_val: Optional[np.ndarray]
38
+
39
+
40
+ def _stack_embeddings(col) -> np.ndarray:
41
+ # HF datasets often store embeddings as list-of-floats per row
42
+ arr = np.asarray(col, dtype=np.float32)
43
+ if arr.ndim != 2:
44
+ arr = np.stack(col).astype(np.float32)
45
+ return arr
46
+
47
+
48
+ def load_split_data(dataset_path: str) -> SplitData:
49
+ ds = load_from_disk(dataset_path)
50
+
51
+ # Case A: DatasetDict with train/val
52
+ if isinstance(ds, DatasetDict) and "train" in ds and "val" in ds:
53
+ train_ds, val_ds = ds["train"], ds["val"]
54
+ else:
55
+ # Case B: Single dataset with "split" column
56
+ if "split" not in ds.column_names:
57
+ raise ValueError(
58
+ "Dataset must be a DatasetDict(train/val) or have a 'split' column."
59
+ )
60
+ train_ds = ds.filter(lambda x: x["split"] == "train")
61
+ val_ds = ds.filter(lambda x: x["split"] == "val")
62
+
63
+ for required in ["embedding", "label"]:
64
+ if required not in train_ds.column_names:
65
+ raise ValueError(f"Missing column '{required}' in train split.")
66
+ if required not in val_ds.column_names:
67
+ raise ValueError(f"Missing column '{required}' in val split.")
68
+
69
+ X_train = _stack_embeddings(train_ds["embedding"])
70
+ y_train = np.asarray(train_ds["label"], dtype=np.int64)
71
+
72
+ X_val = _stack_embeddings(val_ds["embedding"])
73
+ y_val = np.asarray(val_ds["label"], dtype=np.int64)
74
+
75
+ seq_train = None
76
+ seq_val = None
77
+ if "sequence" in train_ds.column_names:
78
+ seq_train = np.asarray(train_ds["sequence"])
79
+ if "sequence" in val_ds.column_names:
80
+ seq_val = np.asarray(val_ds["sequence"])
81
+
82
+ return SplitData(X_train, y_train, seq_train, X_val, y_val, seq_val)
83
+
84
+
85
+ # -----------------------------
86
+ # Metrics + thresholding
87
+ # -----------------------------
88
+ def best_f1_threshold(y_true: np.ndarray, y_prob: np.ndarray) -> Tuple[float, float]:
89
+ """
90
+ Find threshold maximizing F1 on the given set.
91
+ Returns (best_threshold, best_f1).
92
+ """
93
+ precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
94
+ # precision_recall_curve returns thresholds of length n-1
95
+ # compute F1 for those thresholds
96
+ f1s = (2 * precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-12)
97
+ best_idx = int(np.nanargmax(f1s))
98
+ return float(thresholds[best_idx]), float(f1s[best_idx])
99
+
100
+
101
+ def eval_binary(y_true: np.ndarray, y_prob: np.ndarray, threshold: float) -> Dict[str, float]:
102
+ y_pred = (y_prob >= threshold).astype(int)
103
+ return {
104
+ "f1": float(f1_score(y_true, y_pred)),
105
+ "auc": float(roc_auc_score(y_true, y_prob)),
106
+ "ap": float(average_precision_score(y_true, y_prob)),
107
+ "threshold": float(threshold),
108
+ }
109
+
110
+
111
+ # -----------------------------
112
+ # Model factories
113
+ # -----------------------------
114
+ def train_xgb(
115
+ X_train, y_train, X_val, y_val, params: Dict[str, Any]
116
+ ) -> Tuple[xgb.Booster, np.ndarray, np.ndarray]:
117
+ dtrain = xgb.DMatrix(X_train, label=y_train)
118
+ dval = xgb.DMatrix(X_val, label=y_val)
119
+
120
+ num_boost_round = int(params.pop("num_boost_round"))
121
+ early_stopping_rounds = int(params.pop("early_stopping_rounds"))
122
+
123
+ booster = xgb.train(
124
+ params=params,
125
+ dtrain=dtrain,
126
+ num_boost_round=num_boost_round,
127
+ evals=[(dval, "val")],
128
+ early_stopping_rounds=early_stopping_rounds,
129
+ verbose_eval=False,
130
+ )
131
+
132
+ p_train = booster.predict(dtrain)
133
+ p_val = booster.predict(dval)
134
+ return booster, p_train, p_val
135
+
136
+
137
+ def train_adaboost(
138
+ X_train, y_train, X_val, y_val, params: Dict[str, Any]
139
+ ) -> Tuple[AdaBoostClassifier, np.ndarray, np.ndarray]:
140
+ base_depth = int(params.pop("base_depth"))
141
+ clf = AdaBoostClassifier(
142
+ estimator=DecisionTreeClassifier(max_depth=base_depth),
143
+ n_estimators=int(params["n_estimators"]),
144
+ learning_rate=float(params["learning_rate"]),
145
+ algorithm="SAMME",
146
+ )
147
+ clf.fit(X_train, y_train)
148
+ p_train = clf.predict_proba(X_train)[:, 1]
149
+ p_val = clf.predict_proba(X_val)[:, 1]
150
+ return clf, p_train, p_val
151
+
152
+
153
+ def train_linearboost(X_train, y_train, X_val, y_val, params):
154
+ clf = LinearBoostClassifier(**params)
155
+ clf.fit(X_train, y_train)
156
+ p_train = clf.predict_proba(X_train)[:, 1]
157
+ p_val = clf.predict_proba(X_val)[:, 1]
158
+ return clf, p_train, p_val
159
+
160
+
161
+ def suggest_linearboost_params(trial):
162
+ # Core boosting params
163
+ params = {
164
+ "n_estimators": trial.suggest_int("n_estimators", 50, 800),
165
+ "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
166
+ "algorithm": trial.suggest_categorical("algorithm", ["SAMME.R", "SAMME"]),
167
+ # Scaling choices from docs (you can expand this list if you want)
168
+ "scaler": trial.suggest_categorical(
169
+ "scaler",
170
+ ["minmax", "standard", "robust", "quantile-uniform", "quantile-normal", "power"]
171
+ ),
172
+ # useful for imbalanced splits
173
+ "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
174
+ # kernel trick
175
+ "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly", "sigmoid"]),
176
+ }
177
+
178
+ # Kernel-specific params (only when relevant)
179
+ if params["kernel"] in ["rbf", "poly"]:
180
+ params["gamma"] = trial.suggest_float("gamma", 1e-6, 10.0, log=True)
181
+ else:
182
+ params["gamma"] = None # docs: default treated as 1/n_features for rbf/poly :contentReference[oaicite:5]{index=5}
183
+
184
+ if params["kernel"] == "poly":
185
+ params["degree"] = trial.suggest_int("degree", 2, 6) # docs default=3 :contentReference[oaicite:6]{index=6}
186
+ params["coef0"] = trial.suggest_float("coef0", 0.0, 5.0) # docs default=1 :contentReference[oaicite:7]{index=7}
187
+ else:
188
+ # safe defaults
189
+ params["degree"] = 3
190
+ params["coef0"] = 1.0
191
+
192
+ return params
193
+ # -----------------------------
194
+ # Saving artifacts
195
+ # -----------------------------
196
+ def save_predictions_csv(
197
+ out_dir: str,
198
+ split_name: str,
199
+ y_true: np.ndarray,
200
+ y_prob: np.ndarray,
201
+ threshold: float,
202
+ sequences: Optional[np.ndarray] = None,
203
+ ):
204
+ os.makedirs(out_dir, exist_ok=True)
205
+ df = pd.DataFrame({
206
+ "y_true": y_true.astype(int),
207
+ "y_prob": y_prob.astype(float),
208
+ "y_pred": (y_prob >= threshold).astype(int),
209
+ })
210
+ if sequences is not None:
211
+ df.insert(0, "sequence", sequences)
212
+ df.to_csv(os.path.join(out_dir, f"{split_name}_predictions.csv"), index=False)
213
+
214
+
215
+ def plot_curves(out_dir: str, y_true: np.ndarray, y_prob: np.ndarray):
216
+ os.makedirs(out_dir, exist_ok=True)
217
+
218
+ # PR
219
+ precision, recall, _ = precision_recall_curve(y_true, y_prob)
220
+ plt.figure()
221
+ plt.plot(recall, precision)
222
+ plt.xlabel("Recall")
223
+ plt.ylabel("Precision")
224
+ plt.title("Precision-Recall Curve")
225
+ plt.tight_layout()
226
+ plt.savefig(os.path.join(out_dir, "pr_curve.png"))
227
+ plt.close()
228
+
229
+ # ROC
230
+ fpr, tpr, _ = roc_curve(y_true, y_prob)
231
+ plt.figure()
232
+ plt.plot(fpr, tpr)
233
+ plt.xlabel("False Positive Rate")
234
+ plt.ylabel("True Positive Rate")
235
+ plt.title("ROC Curve")
236
+ plt.tight_layout()
237
+ plt.savefig(os.path.join(out_dir, "roc_curve.png"))
238
+ plt.close()
239
+
240
+
241
+ # -----------------------------
242
+ # Optuna objectives
243
+ # -----------------------------
244
+ def make_objective(model_name: str, data: SplitData, out_dir: str):
245
+ Xtr, ytr, Xva, yva = data.X_train, data.y_train, data.X_val, data.y_val
246
+
247
+ def objective(trial: optuna.Trial) -> float:
248
+ if model_name == "xgb":
249
+ params = {
250
+ "objective": "binary:logistic",
251
+ "eval_metric": "logloss",
252
+ "lambda": trial.suggest_float("lambda", 1e-8, 50.0, log=True),
253
+ "alpha": trial.suggest_float("alpha", 1e-8, 50.0, log=True),
254
+ "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
255
+ "subsample": trial.suggest_float("subsample", 0.5, 1.0),
256
+ "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
257
+ "max_depth": trial.suggest_int("max_depth", 2, 15),
258
+ "min_child_weight": trial.suggest_int("min_child_weight", 1, 500),
259
+ "gamma": trial.suggest_float("gamma", 0.0, 10.0),
260
+ "tree_method": "hist",
261
+ "device": "cuda",
262
+ }
263
+
264
+ # Optional GPU: set env CUDA_VISIBLE_DEVICES externally if you want.
265
+ # If you *know* you want GPU and your xgboost supports it:
266
+ # params["device"] = "cuda"
267
+
268
+ params["num_boost_round"] = trial.suggest_int("num_boost_round", 50, 1500)
269
+ params["early_stopping_rounds"] = trial.suggest_int("early_stopping_rounds", 20, 200)
270
+
271
+ model, p_tr, p_va = train_xgb(Xtr, ytr, Xva, yva, params.copy())
272
+
273
+ elif model_name == "adaboost":
274
+ params = {
275
+ "n_estimators": trial.suggest_int("n_estimators", 50, 800),
276
+ "learning_rate": trial.suggest_float("learning_rate", 1e-3, 2.0, log=True),
277
+ "base_depth": trial.suggest_int("base_depth", 1, 4),
278
+ }
279
+ model, p_tr, p_va = train_adaboost(Xtr, ytr, Xva, yva, params)
280
+
281
+ elif model_name == "linearboost":
282
+ params = suggest_linearboost_params(trial)
283
+ model, p_tr, p_va = train_linearboost(Xtr, ytr, Xva, yva, params)
284
+ else:
285
+ raise ValueError(f"Unknown model_name={model_name}")
286
+
287
+ # Threshold picked on val for fair comparison across models
288
+ thr, f1_at_thr = best_f1_threshold(yva, p_va)
289
+ metrics = eval_binary(yva, p_va, thr)
290
+
291
+ # Track best trial artifacts inside the study directory
292
+ trial.set_user_attr("threshold", thr)
293
+ trial.set_user_attr("auc", metrics["auc"])
294
+ trial.set_user_attr("ap", metrics["ap"])
295
+
296
+ return f1_at_thr
297
+
298
+ return objective
299
+
300
+ # -----------------------------
301
+ # Main runner
302
+ # -----------------------------
303
+ def run_optuna_and_refit(
304
+ dataset_path: str,
305
+ out_dir: str,
306
+ model_name: str,
307
+ n_trials: int = 200,
308
+ ):
309
+ os.makedirs(out_dir, exist_ok=True)
310
+
311
+ data = load_split_data(dataset_path)
312
+ print(f"[Data] Train: {data.X_train.shape}, Val: {data.X_val.shape}")
313
+
314
+ study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
315
+ study.optimize(make_objective(model_name, data, out_dir), n_trials=n_trials)
316
+
317
+ # Save trials table
318
+ trials_df = study.trials_dataframe()
319
+ trials_df.to_csv(os.path.join(out_dir, "study_trials.csv"), index=False)
320
+
321
+ best = study.best_trial
322
+ best_params = dict(best.params)
323
+ best_thr = float(best.user_attrs["threshold"])
324
+ best_auc = float(best.user_attrs["auc"])
325
+ best_ap = float(best.user_attrs["ap"])
326
+ best_f1 = float(best.value)
327
+
328
+ # Refit best model on train (same protocol as objective)
329
+ if model_name == "xgb":
330
+ # Reconstruct full param dict
331
+ params = {
332
+ "objective": "binary:logistic",
333
+ "eval_metric": "logloss",
334
+ "lambda": best_params["lambda"],
335
+ "alpha": best_params["alpha"],
336
+ "colsample_bytree": best_params["colsample_bytree"],
337
+ "subsample": best_params["subsample"],
338
+ "learning_rate": best_params["learning_rate"],
339
+ "max_depth": best_params["max_depth"],
340
+ "min_child_weight": best_params["min_child_weight"],
341
+ "gamma": best_params["gamma"],
342
+ "tree_method": "hist",
343
+ "num_boost_round": best_params["num_boost_round"],
344
+ "early_stopping_rounds": best_params["early_stopping_rounds"],
345
+ }
346
+ model, p_tr, p_va = train_xgb(
347
+ data.X_train, data.y_train, data.X_val, data.y_val, params
348
+ )
349
+ model_path = os.path.join(out_dir, "best_model.json")
350
+ model.save_model(model_path)
351
+
352
+ elif model_name == "adaboost":
353
+ params = best_params
354
+ model, p_tr, p_va = train_adaboost(
355
+ data.X_train, data.y_train, data.X_val, data.y_val, params
356
+ )
357
+ model_path = os.path.join(out_dir, "best_model.joblib")
358
+ joblib.dump(model, model_path)
359
+
360
+ elif model_name == "linearboost":
361
+ params = best_params
362
+
363
+ model, p_tr, p_va = train_linearboost(
364
+ data.X_train, data.y_train, data.X_val, data.y_val, params
365
+ )
366
+
367
+ model_path = os.path.join(out_dir, "best_model.joblib")
368
+ joblib.dump(model, model_path)
369
+ else:
370
+ raise ValueError(model_name)
371
+
372
+ # Save predictions CSVs
373
+ save_predictions_csv(out_dir, "train", data.y_train, p_tr, best_thr, data.seq_train)
374
+ save_predictions_csv(out_dir, "val", data.y_val, p_va, best_thr, data.seq_val)
375
+
376
+ # Plots on val
377
+ plot_curves(out_dir, data.y_val, p_va)
378
+
379
+ # Summary
380
+ summary = [
381
+ "=" * 72,
382
+ f"MODEL: {model_name}",
383
+ f"Best trial: {best.number}",
384
+ f"Best F1 (val @ best-threshold): {best_f1:.4f}",
385
+ f"Val AUC: {best_auc:.4f}",
386
+ f"Val AP: {best_ap:.4f}",
387
+ f"Best threshold (picked on val): {best_thr:.4f}",
388
+ f"Model saved to: {model_path}",
389
+ "Best params:",
390
+ json.dumps(best_params, indent=2),
391
+ "=" * 72,
392
+ ]
393
+ with open(os.path.join(out_dir, "optimization_summary.txt"), "w") as f:
394
+ f.write("\n".join(summary))
395
+ print("\n".join(summary))
396
+
397
+
398
+ if __name__ == "__main__":
399
+ # Example usage:
400
+ # dataset_path = "/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/training_classifiers/data/solubility"
401
+ # out_dir = "/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/training_classifiers/src/solubility/xgb"
402
+ # run_optuna_and_refit(dataset_path, out_dir, model_name="xgb", n_trials=200)
403
+
404
+ import argparse
405
+ parser = argparse.ArgumentParser()
406
+ parser.add_argument("--dataset_path", type=str, required=True)
407
+ parser.add_argument("--out_dir", type=str, required=True)
408
+ parser.add_argument("--model", type=str, choices=["xgb", "adaboost", "linearboost"], required=True)
409
+ parser.add_argument("--n_trials", type=int, default=200)
410
+ args = parser.parse_args()
411
+
412
+ run_optuna_and_refit(
413
+ dataset_path=args.dataset_path,
414
+ out_dir=args.out_dir,
415
+ model_name=args.model,
416
+ n_trials=args.n_trials,
417
+ )
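For reference, a minimal self-contained sketch of the best-F1 threshold selection that best_f1_threshold implements in the script above; the toy labels and scores are made up for illustration, and only NumPy and scikit-learn are assumed.

import numpy as np
from sklearn.metrics import precision_recall_curve, f1_score

# Toy labels/scores (hypothetical values, for illustration only)
y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y_prob = np.array([0.10, 0.40, 0.35, 0.80, 0.70, 0.20, 0.90, 0.55])

# precision/recall have one more entry than thresholds, so drop the last point
precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
f1s = (2 * precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-12)
best_idx = int(np.nanargmax(f1s))
best_thr = float(thresholds[best_idx])

y_pred = (y_prob >= best_thr).astype(int)
print(f"threshold={best_thr:.3f}  F1={f1_score(y_true, y_pred):.3f}")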
training_classifiers/.ipynb_checkpoints/train_ml-checkpoint.py ADDED
@@ -0,0 +1,468 @@
1
+ import os
2
+ import json
3
+ import joblib
4
+ import optuna
5
+ import numpy as np
6
+ import pandas as pd
7
+ import matplotlib.pyplot as plt
8
+ from dataclasses import dataclass
9
+ from typing import Dict, Any, Tuple, Optional
10
+ from datasets import load_from_disk, DatasetDict
11
+ from sklearn.metrics import (
12
+ f1_score, roc_auc_score, average_precision_score,
13
+ precision_recall_curve, roc_curve
14
+ )
15
+ from sklearn.linear_model import LogisticRegression
16
+ from sklearn.svm import SVC, LinearSVC
17
+ from sklearn.calibration import CalibratedClassifierCV
18
+ import torch
19
+ import time
20
+ import xgboost as xgb
21
+ from lightning.pytorch import seed_everything
22
+ import cupy as cp
23
+ from cuml.svm import SVC as cuSVC
24
+ from cuml.linear_model import LogisticRegression as cuLogReg
25
+ seed_everything(1986)
26
+
27
+
28
+ def to_gpu(X: np.ndarray):
29
+ if isinstance(X, cp.ndarray):
30
+ return X
31
+ return cp.asarray(X, dtype=cp.float32)
32
+
33
+ def to_cpu(x):
34
+ if isinstance(x, cp.ndarray):
35
+ return cp.asnumpy(x)
36
+ return np.asarray(x)
37
+
38
+ @dataclass
39
+ class SplitData:
40
+ X_train: np.ndarray
41
+ y_train: np.ndarray
42
+ seq_train: Optional[np.ndarray]
43
+ X_val: np.ndarray
44
+ y_val: np.ndarray
45
+ seq_val: Optional[np.ndarray]
46
+
47
+
48
+ def _stack_embeddings(col) -> np.ndarray:
49
+ arr = np.asarray(col, dtype=np.float32)
50
+ if arr.ndim != 2:
51
+ arr = np.stack(col).astype(np.float32)
52
+ return arr
53
+
54
+
55
+ def load_split_data(dataset_path: str) -> SplitData:
56
+ ds = load_from_disk(dataset_path)
57
+
58
+ # Case A: DatasetDict with train/val
59
+ if isinstance(ds, DatasetDict) and "train" in ds and "val" in ds:
60
+ train_ds, val_ds = ds["train"], ds["val"]
61
+ else:
62
+ # Case B: Single dataset with "split" column
63
+ if "split" not in ds.column_names:
64
+ raise ValueError(
65
+ "Dataset must be a DatasetDict(train/val) or have a 'split' column."
66
+ )
67
+ train_ds = ds.filter(lambda x: x["split"] == "train")
68
+ val_ds = ds.filter(lambda x: x["split"] == "val")
69
+
70
+ for required in ["embedding", "label"]:
71
+ if required not in train_ds.column_names:
72
+ raise ValueError(f"Missing column '{required}' in train split.")
73
+ if required not in val_ds.column_names:
74
+ raise ValueError(f"Missing column '{required}' in val split.")
75
+
76
+ X_train = _stack_embeddings(train_ds["embedding"])
77
+ y_train = np.asarray(train_ds["label"], dtype=np.int64)
78
+
79
+ X_val = _stack_embeddings(val_ds["embedding"])
80
+ y_val = np.asarray(val_ds["label"], dtype=np.int64)
81
+
82
+ seq_train = None
83
+ seq_val = None
84
+ if "sequence" in train_ds.column_names:
85
+ seq_train = np.asarray(train_ds["sequence"])
86
+ if "sequence" in val_ds.column_names:
87
+ seq_val = np.asarray(val_ds["sequence"])
88
+
89
+ return SplitData(X_train, y_train, seq_train, X_val, y_val, seq_val)
90
+
91
+
92
+ def best_f1_threshold(y_true: np.ndarray, y_prob: np.ndarray) -> Tuple[float, float]:
93
+ """
94
+ Find threshold maximizing F1 on the given set.
95
+ Returns (best_threshold, best_f1).
96
+ """
97
+ precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
98
+ f1s = (2 * precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-12)
99
+ best_idx = int(np.nanargmax(f1s))
100
+ return float(thresholds[best_idx]), float(f1s[best_idx])
101
+
102
+
103
+ def eval_binary(y_true: np.ndarray, y_prob: np.ndarray, threshold: float) -> Dict[str, float]:
104
+ y_pred = (y_prob >= threshold).astype(int)
105
+ return {
106
+ "f1": float(f1_score(y_true, y_pred)),
107
+ "auc": float(roc_auc_score(y_true, y_prob)),
108
+ "ap": float(average_precision_score(y_true, y_prob)),
109
+ "threshold": float(threshold),
110
+ }
111
+
112
+
113
+ # -----------------------------
114
+ # Model
115
+ # -----------------------------
116
+ def train_xgb(
117
+ X_train, y_train, X_val, y_val, params: Dict[str, Any]
118
+ ) -> Tuple[xgb.Booster, np.ndarray, np.ndarray]:
119
+ dtrain = xgb.DMatrix(X_train, label=y_train)
120
+ dval = xgb.DMatrix(X_val, label=y_val)
121
+
122
+ num_boost_round = int(params.pop("num_boost_round"))
123
+ early_stopping_rounds = int(params.pop("early_stopping_rounds"))
124
+
125
+ booster = xgb.train(
126
+ params=params,
127
+ dtrain=dtrain,
128
+ num_boost_round=num_boost_round,
129
+ evals=[(dval, "val")],
130
+ early_stopping_rounds=early_stopping_rounds,
131
+ verbose_eval=False,
132
+ )
133
+
134
+ p_train = booster.predict(dtrain)
135
+ p_val = booster.predict(dval)
136
+ return booster, p_train, p_val
137
+
138
+ def train_cuml_svc(X_train, y_train, X_val, y_val, params):
139
+ Xtr = to_gpu(X_train)
140
+ Xva = to_gpu(X_val)
141
+ ytr = to_gpu(y_train).astype(cp.int32)
142
+
143
+ clf = cuSVC(
144
+ C=float(params["C"]),
145
+ kernel=params["kernel"],
146
+ gamma=params.get("gamma", "scale"),
147
+ class_weight=params.get("class_weight", None),
148
+ probability=bool(params.get("probability", True)),
149
+ random_state=1986,
150
+ max_iter=int(params.get("max_iter", 1000)),
151
+ tol=float(params.get("tol", 1e-4)),
152
+ )
153
+
154
+ clf.fit(Xtr, ytr)
155
+
156
+ p_train = to_cpu(clf.predict_proba(Xtr)[:, 1])
157
+ p_val = to_cpu(clf.predict_proba(Xva)[:, 1])
158
+ return clf, p_train, p_val
159
+
160
+ def train_cuml_elastic_net(X_train, y_train, X_val, y_val, params):
161
+ Xtr = to_gpu(X_train)
162
+ Xva = to_gpu(X_val)
163
+ ytr = to_gpu(y_train).astype(cp.int32)
164
+
165
+ clf = cuLogReg(
166
+ penalty="elasticnet",
167
+ C=float(params["C"]),
168
+ l1_ratio=float(params["l1_ratio"]),
169
+ class_weight=params.get("class_weight", None),
170
+ max_iter=int(params.get("max_iter", 1000)),
171
+ tol=float(params.get("tol", 1e-4)),
172
+ solver="qn",
173
+ fit_intercept=True,
174
+ )
175
+ clf.fit(Xtr, ytr)
176
+
177
+ p_train = to_cpu(clf.predict_proba(Xtr)[:, 1])
178
+ p_val = to_cpu(clf.predict_proba(Xva)[:, 1])
179
+ return clf, p_train, p_val
180
+
181
+
182
+ def train_svm(X_train, y_train, X_val, y_val, params):
183
+ """
184
+ Kernel SVM via SVC. CPU only in sklearn.
185
+ probability=True enables predict_proba but is slower.
186
+ """
187
+ clf = SVC(
188
+ C=float(params["C"]),
189
+ kernel=params["kernel"],
190
+ gamma=params.get("gamma", "scale"),
191
+ class_weight=params.get("class_weight", None),
192
+ probability=True,
193
+ random_state=1986,
194
+ )
195
+ clf.fit(X_train, y_train)
196
+ p_train = clf.predict_proba(X_train)[:, 1]
197
+ p_val = clf.predict_proba(X_val)[:, 1]
198
+ return clf, p_train, p_val
199
+
200
+
201
+ def train_linearsvm_calibrated(X_train, y_train, X_val, y_val, params):
202
+ """
203
+ Fast linear SVM (LinearSVC) + probability calibration.
204
+ Usually much faster than SVC on large datasets.
205
+ """
206
+ base = LinearSVC(
207
+ C=float(params["C"]),
208
+ class_weight=params.get("class_weight", None),
209
+ max_iter=int(params.get("max_iter", 5000)),
210
+ random_state=1986,
211
+ )
212
+ # calibration to get probabilities for PR/ROC + thresholding
213
+ clf = CalibratedClassifierCV(base, method="sigmoid", cv=3)
214
+ clf.fit(X_train, y_train)
215
+ p_train = clf.predict_proba(X_train)[:, 1]
216
+ p_val = clf.predict_proba(X_val)[:, 1]
217
+ return clf, p_train, p_val
218
+
219
+ # -----------------------------
220
+ # Saving artifacts
221
+ # -----------------------------
222
+ def save_predictions_csv(
223
+ out_dir: str,
224
+ split_name: str,
225
+ y_true: np.ndarray,
226
+ y_prob: np.ndarray,
227
+ threshold: float,
228
+ sequences: Optional[np.ndarray] = None,
229
+ ):
230
+ os.makedirs(out_dir, exist_ok=True)
231
+ df = pd.DataFrame({
232
+ "y_true": y_true.astype(int),
233
+ "y_prob": y_prob.astype(float),
234
+ "y_pred": (y_prob >= threshold).astype(int),
235
+ })
236
+ if sequences is not None:
237
+ df.insert(0, "sequence", sequences)
238
+ df.to_csv(os.path.join(out_dir, f"{split_name}_predictions.csv"), index=False)
239
+
240
+
241
+ def plot_curves(out_dir: str, y_true: np.ndarray, y_prob: np.ndarray):
242
+ os.makedirs(out_dir, exist_ok=True)
243
+
244
+ # PR
245
+ precision, recall, _ = precision_recall_curve(y_true, y_prob)
246
+ plt.figure()
247
+ plt.plot(recall, precision)
248
+ plt.xlabel("Recall")
249
+ plt.ylabel("Precision")
250
+ plt.title("Precision-Recall Curve")
251
+ plt.tight_layout()
252
+ plt.savefig(os.path.join(out_dir, "pr_curve.png"))
253
+ plt.close()
254
+
255
+ # ROC
256
+ fpr, tpr, _ = roc_curve(y_true, y_prob)
257
+ plt.figure()
258
+ plt.plot(fpr, tpr)
259
+ plt.xlabel("False Positive Rate")
260
+ plt.ylabel("True Positive Rate")
261
+ plt.title("ROC Curve")
262
+ plt.tight_layout()
263
+ plt.savefig(os.path.join(out_dir, "roc_curve.png"))
264
+ plt.close()
265
+
266
+
267
+ # -----------------------------
268
+ # Optuna objectives
269
+ # -----------------------------
270
+ def make_objective(model_name: str, data: SplitData, out_dir: str):
271
+ Xtr, ytr, Xva, yva = data.X_train, data.y_train, data.X_val, data.y_val
272
+
273
+ def objective(trial: optuna.Trial) -> float:
274
+ if model_name == "xgb":
275
+ params = {
276
+ "objective": "binary:logistic",
277
+ "eval_metric": "logloss",
278
+ "lambda": trial.suggest_float("lambda", 1e-8, 50.0, log=True),
279
+ "alpha": trial.suggest_float("alpha", 1e-8, 50.0, log=True),
280
+ "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
281
+ "subsample": trial.suggest_float("subsample", 0.5, 1.0),
282
+ "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
283
+ "max_depth": trial.suggest_int("max_depth", 2, 15),
284
+ "min_child_weight": trial.suggest_int("min_child_weight", 1, 500),
285
+ "gamma": trial.suggest_float("gamma", 0.0, 10.0),
286
+ "tree_method": "hist",
287
+ "device": "cuda",
288
+ }
289
+ params["num_boost_round"] = trial.suggest_int("num_boost_round", 50, 1500)
290
+ params["early_stopping_rounds"] = trial.suggest_int("early_stopping_rounds", 20, 200)
291
+
292
+ model, p_tr, p_va = train_xgb(Xtr, ytr, Xva, yva, params.copy())
293
+
294
+ elif model_name == "svm":
295
+ svm_kind = trial.suggest_categorical("svm_kind", ["svc", "linear_calibrated"])
296
+
297
+ if svm_kind == "svc":
298
+ params = {
299
+ "C": trial.suggest_float("C", 1e-3, 1e3, log=True),
300
+ "kernel": trial.suggest_categorical("kernel", ["rbf", "linear", "poly", "sigmoid"]),
301
+ "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
302
+ }
303
+ if params["kernel"] in ["rbf", "poly", "sigmoid"]:
304
+ params["gamma"] = trial.suggest_float("gamma", 1e-6, 10.0, log=True)
305
+ else:
306
+ params["gamma"] = "scale"
307
+
308
+ model, p_tr, p_va = train_svm(Xtr, ytr, Xva, yva, params)
309
+
310
+ else:
311
+ params = {
312
+ "C": trial.suggest_float("C", 1e-3, 1e3, log=True),
313
+ "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
314
+ "max_iter": trial.suggest_int("max_iter", 2000, 20000),
315
+ }
316
+ model, p_tr, p_va = train_linearsvm_calibrated(Xtr, ytr, Xva, yva, params)
317
+ elif model_name == "svm_gpu":
318
+ params = {
319
+ "C": trial.suggest_float("C", 1e-3, 1e3, log=True),
320
+ "kernel": trial.suggest_categorical("kernel", ["rbf", "linear", "poly", "sigmoid"]),
321
+ "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
322
+ "probability": True,
323
+ "max_iter": trial.suggest_int("max_iter", 200, 5000),
324
+ "tol": trial.suggest_float("tol", 1e-6, 1e-2, log=True),
325
+ }
326
+ if params["kernel"] in ["rbf", "poly", "sigmoid"]:
327
+ params["gamma"] = trial.suggest_float("gamma", 1e-6, 10.0, log=True)
328
+ else:
329
+ params["gamma"] = "scale"
330
+
331
+ model, p_tr, p_va = train_cuml_svc(Xtr, ytr, Xva, yva, params)
332
+
333
+ elif model_name == "enet_gpu":
334
+ params = {
335
+ "C": trial.suggest_float("C", 1e-4, 1e3, log=True),
336
+ "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
337
+ "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
338
+ "max_iter": trial.suggest_int("max_iter", 200, 5000),
339
+ "tol": trial.suggest_float("tol", 1e-6, 1e-2, log=True),
340
+ }
341
+ model, p_tr, p_va = train_cuml_elastic_net(Xtr, ytr, Xva, yva, params)
342
+ else:
343
+ raise ValueError(f"Unknown model_name={model_name}")
344
+
345
+ thr, f1_at_thr = best_f1_threshold(yva, p_va)
346
+ metrics = eval_binary(yva, p_va, thr)
347
+ trial.set_user_attr("threshold", thr)
348
+ trial.set_user_attr("auc", metrics["auc"])
349
+ trial.set_user_attr("ap", metrics["ap"])
350
+ return f1_at_thr
351
+
352
+ return objective
353
+
354
+ # -----------------------------
355
+ # Main
356
+ # -----------------------------
357
+ def run_optuna_and_refit(
358
+ dataset_path: str,
359
+ out_dir: str,
360
+ model_name: str,
361
+ n_trials: int = 200,
362
+ ):
363
+ os.makedirs(out_dir, exist_ok=True)
364
+
365
+ data = load_split_data(dataset_path)
366
+ print(f"[Data] Train: {data.X_train.shape}, Val: {data.X_val.shape}")
367
+
368
+ study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
369
+ study.optimize(make_objective(model_name, data, out_dir), n_trials=n_trials)
370
+
371
+ trials_df = study.trials_dataframe()
372
+ trials_df.to_csv(os.path.join(out_dir, "study_trials.csv"), index=False)
373
+
374
+ best = study.best_trial
375
+ best_params = dict(best.params)
376
+ best_thr = float(best.user_attrs["threshold"])
377
+ best_auc = float(best.user_attrs["auc"])
378
+ best_ap = float(best.user_attrs["ap"])
379
+ best_f1 = float(best.value)
380
+
381
+ # Refit best model on train
382
+ if model_name == "xgb":
383
+ params = {
384
+ "objective": "binary:logistic",
385
+ "eval_metric": "logloss",
386
+ "lambda": best_params["lambda"],
387
+ "alpha": best_params["alpha"],
388
+ "colsample_bytree": best_params["colsample_bytree"],
389
+ "subsample": best_params["subsample"],
390
+ "learning_rate": best_params["learning_rate"],
391
+ "max_depth": best_params["max_depth"],
392
+ "min_child_weight": best_params["min_child_weight"],
393
+ "gamma": best_params["gamma"],
394
+ "tree_method": "hist",
395
+ "num_boost_round": best_params["num_boost_round"],
396
+ "early_stopping_rounds": best_params["early_stopping_rounds"],
397
+ }
398
+ model, p_tr, p_va = train_xgb(
399
+ data.X_train, data.y_train, data.X_val, data.y_val, params
400
+ )
401
+ model_path = os.path.join(out_dir, "best_model.json")
402
+ model.save_model(model_path)
403
+
404
+ elif model_name == "svm":
405
+ svm_kind = best_params["svm_kind"]
406
+ if svm_kind == "svc":
407
+ model, p_tr, p_va = train_svm(data.X_train, data.y_train, data.X_val, data.y_val, best_params)
408
+ else:
409
+ model, p_tr, p_va = train_linearsvm_calibrated(data.X_train, data.y_train, data.X_val, data.y_val, best_params)
410
+
411
+ model_path = os.path.join(out_dir, "best_model.joblib")
412
+ joblib.dump(model, model_path)
413
+ elif model_name == "svm_gpu":
414
+ model, p_tr, p_va = train_cuml_svc(
415
+ data.X_train, data.y_train, data.X_val, data.y_val, best_params
416
+ )
417
+ model_path = os.path.join(out_dir, "best_model_cuml_svc.joblib")
418
+ joblib.dump(model, model_path)
419
+
420
+ elif model_name == "enet_gpu":
421
+ model, p_tr, p_va = train_cuml_elastic_net(
422
+ data.X_train, data.y_train, data.X_val, data.y_val, best_params
423
+ )
424
+ model_path = os.path.join(out_dir, "best_model_cuml_enet.joblib")
425
+ joblib.dump(model, model_path)
426
+ else:
427
+ raise ValueError(model_name)
428
+
429
+ # Save predictions CSVs
430
+ save_predictions_csv(out_dir, "train", data.y_train, p_tr, best_thr, data.seq_train)
431
+ save_predictions_csv(out_dir, "val", data.y_val, p_va, best_thr, data.seq_val)
432
+
433
+ # Plots on val
434
+ plot_curves(out_dir, data.y_val, p_va)
435
+
436
+ summary = [
437
+ "=" * 72,
438
+ f"MODEL: {model_name}",
439
+ f"Best trial: {best.number}",
440
+ f"Best F1 (val @ best-threshold): {best_f1:.4f}",
441
+ f"Val AUC: {best_auc:.4f}",
442
+ f"Val AP: {best_ap:.4f}",
443
+ f"Best threshold (picked on val): {best_thr:.4f}",
444
+ f"Model saved to: {model_path}",
445
+ "Best params:",
446
+ json.dumps(best_params, indent=2),
447
+ "=" * 72,
448
+ ]
449
+ with open(os.path.join(out_dir, "optimization_summary.txt"), "w") as f:
450
+ f.write("\n".join(summary))
451
+ print("\n".join(summary))
452
+
453
+
454
+ if __name__ == "__main__":
455
+ import argparse
456
+ parser = argparse.ArgumentParser()
457
+ parser.add_argument("--dataset_path", type=str, required=True)
458
+ parser.add_argument("--out_dir", type=str, required=True)
459
+ parser.add_argument("--model", type=str, choices=["xgb", "svm_gpu", "enet_gpu"], required=True)
460
+ parser.add_argument("--n_trials", type=int, default=200)
461
+ args = parser.parse_args()
462
+
463
+ run_optuna_and_refit(
464
+ dataset_path=args.dataset_path,
465
+ out_dir=args.out_dir,
466
+ model_name=args.model,
467
+ n_trials=args.n_trials,
468
+ )
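As a hypothetical sketch (paths and values are placeholders, not the real training data), this is the on-disk layout that load_split_data in the scripts above expects: a DatasetDict with train/val splits and embedding/label (optionally sequence) columns.

import numpy as np
from datasets import Dataset, DatasetDict

def toy_split(n: int, dim: int = 8) -> Dataset:
    # Random embeddings and binary labels, purely for illustration
    return Dataset.from_dict({
        "embedding": np.random.rand(n, dim).astype(np.float32).tolist(),
        "label": np.random.randint(0, 2, size=n).tolist(),
        "sequence": [f"PEPTIDE_{i}" for i in range(n)],
    })

dd = DatasetDict({"train": toy_split(32), "val": toy_split(8)})
dd.save_to_disk("toy_embedding_dataset")  # pass this directory as --dataset_path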
training_classifiers/.ipynb_checkpoints/train_ml_regression-checkpoint.py ADDED
@@ -0,0 +1,410 @@
1
+ import os
2
+ import json
3
+ import joblib
4
+ import optuna
5
+ import numpy as np
6
+ import pandas as pd
7
+ import matplotlib.pyplot as plt
8
+ from dataclasses import dataclass
9
+ from typing import Dict, Any, Tuple, Optional
10
+ from datasets import load_from_disk, DatasetDict
11
+ from sklearn.preprocessing import StandardScaler
12
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
13
+ from sklearn.svm import SVR
14
+ import xgboost as xgb
15
+ from lightning.pytorch import seed_everything
16
+ import cupy as cp
17
+ from cuml.linear_model import ElasticNet as cuElasticNet
18
+ from scipy.stats import spearmanr
19
+ seed_everything(1986)
20
+
21
+
22
+ # -----------------------------
23
+ # GPU/CPU helpers
24
+ # -----------------------------
25
+ def to_gpu(X: np.ndarray):
26
+ if isinstance(X, cp.ndarray):
27
+ return X
28
+ return cp.asarray(X, dtype=cp.float32)
29
+
30
+ def to_cpu(x):
31
+ if isinstance(x, cp.ndarray):
32
+ return cp.asnumpy(x)
33
+ return np.asarray(x)
34
+
35
+
36
+ # -----------------------------
37
+ # Data loading
38
+ # -----------------------------
39
+ @dataclass
40
+ class SplitData:
41
+ X_train: np.ndarray
42
+ y_train: np.ndarray
43
+ seq_train: Optional[np.ndarray]
44
+ X_val: np.ndarray
45
+ y_val: np.ndarray
46
+ seq_val: Optional[np.ndarray]
47
+
48
+ def _stack_embeddings(col) -> np.ndarray:
49
+ arr = np.asarray(col, dtype=np.float32)
50
+ if arr.ndim != 2:
51
+ arr = np.stack(col).astype(np.float32)
52
+ return arr
53
+
54
+ def load_split_data(dataset_path: str) -> SplitData:
55
+ ds = load_from_disk(dataset_path)
56
+
57
+ if isinstance(ds, DatasetDict) and "train" in ds and "val" in ds:
58
+ train_ds, val_ds = ds["train"], ds["val"]
59
+ else:
60
+ if "split" not in ds.column_names:
61
+ raise ValueError("Dataset must be a DatasetDict(train/val) or have a 'split' column.")
62
+ train_ds = ds.filter(lambda x: x["split"] == "train")
63
+ val_ds = ds.filter(lambda x: x["split"] == "val")
64
+
65
+ for required in ["embedding", "label"]:
66
+ if required not in train_ds.column_names:
67
+ raise ValueError(f"Missing column '{required}' in train split.")
68
+ if required not in val_ds.column_names:
69
+ raise ValueError(f"Missing column '{required}' in val split.")
70
+
71
+ X_train = _stack_embeddings(train_ds["embedding"]).astype(np.float32)
72
+ X_val = _stack_embeddings(val_ds["embedding"]).astype(np.float32)
73
+
74
+ y_train = np.asarray(train_ds["label"], dtype=np.float32)
75
+ y_val = np.asarray(val_ds["label"], dtype=np.float32)
76
+
77
+ seq_train = None
78
+ seq_val = None
79
+ if "sequence" in train_ds.column_names:
80
+ seq_train = np.asarray(train_ds["sequence"])
81
+ if "sequence" in val_ds.column_names:
82
+ seq_val = np.asarray(val_ds["sequence"])
83
+
84
+ return SplitData(X_train, y_train, seq_train, X_val, y_val, seq_val)
85
+
86
+
87
+ # -----------------------------
88
+ # Metrics
89
+ # -----------------------------
90
+ def safe_spearmanr(y_true: np.ndarray, y_pred: np.ndarray) -> float:
91
+ rho = spearmanr(y_true, y_pred).correlation
92
+ if rho is None or np.isnan(rho):
93
+ return 0.0
94
+ return float(rho)
95
+
96
+ def eval_regression(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
97
+ # RMSE
98
+ try:
99
+ from sklearn.metrics import root_mean_squared_error
100
+ rmse = root_mean_squared_error(y_true, y_pred)
101
+ except Exception:
102
+ rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
103
+
104
+ mae = float(mean_absolute_error(y_true, y_pred))
105
+ r2 = float(r2_score(y_true, y_pred))
106
+ rho = float(safe_spearmanr(y_true, y_pred))
107
+ return {"rmse": rmse, "mae": mae, "r2": r2, "spearman_rho": rho}
108
+
109
+
110
+ # -----------------------------
111
+ # Model
112
+ # -----------------------------
113
+ def train_xgb_reg(
114
+ X_train, y_train, X_val, y_val, params: Dict[str, Any]
115
+ ) -> Tuple[xgb.Booster, np.ndarray, np.ndarray]:
116
+ dtrain = xgb.DMatrix(X_train, label=y_train)
117
+ dval = xgb.DMatrix(X_val, label=y_val)
118
+
119
+ num_boost_round = int(params.pop("num_boost_round"))
120
+ early_stopping_rounds = int(params.pop("early_stopping_rounds"))
121
+
122
+ booster = xgb.train(
123
+ params=params,
124
+ dtrain=dtrain,
125
+ num_boost_round=num_boost_round,
126
+ evals=[(dval, "val")],
127
+ early_stopping_rounds=early_stopping_rounds,
128
+ verbose_eval=False,
129
+ )
130
+
131
+ p_train = booster.predict(dtrain)
132
+ p_val = booster.predict(dval)
133
+ return booster, p_train, p_val
134
+
135
+
136
+ def train_cuml_elasticnet_reg(
137
+ X_train, y_train, X_val, y_val, params: Dict[str, Any]
138
+ ):
139
+ Xtr = to_gpu(X_train)
140
+ Xva = to_gpu(X_val)
141
+ ytr = to_gpu(y_train).astype(cp.float32)
142
+
143
+ model = cuElasticNet(
144
+ alpha=float(params["alpha"]),
145
+ l1_ratio=float(params["l1_ratio"]),
146
+ fit_intercept=True,
147
+ max_iter=int(params.get("max_iter", 5000)),
148
+ tol=float(params.get("tol", 1e-4)),
149
+ selection=params.get("selection", "cyclic"),
150
+ )
151
+ model.fit(Xtr, ytr)
152
+
153
+ p_train = to_cpu(model.predict(Xtr))
154
+ p_val = to_cpu(model.predict(Xva))
155
+ return model, p_train, p_val
156
+
157
+
158
+ def train_svr_reg(
159
+ X_train, y_train, X_val, y_val, params: Dict[str, Any]
160
+ ):
161
+ model = SVR(
162
+ C=float(params["C"]),
163
+ epsilon=float(params["epsilon"]),
164
+ kernel=params["kernel"],
165
+ gamma=params.get("gamma", "scale"),
166
+ )
167
+ model.fit(X_train, y_train)
168
+ p_train = model.predict(X_train)
169
+ p_val = model.predict(X_val)
170
+ return model, p_train, p_val
171
+
172
+
173
+ # -----------------------------
174
+ # Saving + plots
175
+ # -----------------------------
176
+ def save_predictions_csv(
177
+ out_dir: str,
178
+ split_name: str,
179
+ y_true: np.ndarray,
180
+ y_pred: np.ndarray,
181
+ sequences: Optional[np.ndarray] = None,
182
+ ):
183
+ os.makedirs(out_dir, exist_ok=True)
184
+ df = pd.DataFrame({
185
+ "y_true": y_true.astype(float),
186
+ "y_pred": y_pred.astype(float),
187
+ "residual": (y_true - y_pred).astype(float),
188
+ })
189
+ if sequences is not None:
190
+ df.insert(0, "sequence", sequences)
191
+ df.to_csv(os.path.join(out_dir, f"{split_name}_predictions.csv"), index=False)
192
+
193
+ def plot_regression_diagnostics(out_dir: str, y_true: np.ndarray, y_pred: np.ndarray):
194
+ os.makedirs(out_dir, exist_ok=True)
195
+
196
+ plt.figure()
197
+ plt.scatter(y_true, y_pred, s=8, alpha=0.5)
198
+ plt.xlabel("y_true")
199
+ plt.ylabel("y_pred")
200
+ plt.title("Predicted vs True")
201
+ plt.tight_layout()
202
+ plt.savefig(os.path.join(out_dir, "pred_vs_true.png"))
203
+ plt.close()
204
+
205
+ resid = y_true - y_pred
206
+ plt.figure()
207
+ plt.hist(resid, bins=50)
208
+ plt.xlabel("residual (y_true - y_pred)")
209
+ plt.ylabel("count")
210
+ plt.title("Residual Histogram")
211
+ plt.tight_layout()
212
+ plt.savefig(os.path.join(out_dir, "residual_hist.png"))
213
+ plt.close()
214
+
215
+ plt.figure()
216
+ plt.scatter(y_pred, resid, s=8, alpha=0.5)
217
+ plt.xlabel("y_pred")
218
+ plt.ylabel("residual")
219
+ plt.title("Residuals vs Prediction")
220
+ plt.tight_layout()
221
+ plt.savefig(os.path.join(out_dir, "residual_vs_pred.png"))
222
+ plt.close()
223
+
224
+
225
+ # -----------------------------
226
+ # Optuna objective (OPTIMIZE SPEARMAN RHO)
227
+ # -----------------------------
228
+ def make_objective(model_name: str, data: SplitData):
229
+ Xtr, ytr, Xva, yva = data.X_train, data.y_train, data.X_val, data.y_val
230
+
231
+ def objective(trial: optuna.Trial) -> float:
232
+ if model_name == "xgb_reg":
233
+ params = {
234
+ "objective": "reg:squarederror",
235
+ "eval_metric": "rmse",
236
+ "lambda": trial.suggest_float("lambda", 1e-10, 100.0, log=True),
237
+ "alpha": trial.suggest_float("alpha", 1e-10, 100.0, log=True),
238
+ "gamma": trial.suggest_float("gamma", 0.0, 10.0),
239
+ "max_depth": trial.suggest_int("max_depth", 2, 16),
240
+ "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 500.0, log=True),
241
+ "subsample": trial.suggest_float("subsample", 0.5, 1.0),
242
+ "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
243
+ "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
244
+ "tree_method": "hist",
245
+ "device": "cuda",
246
+ }
247
+ params["num_boost_round"] = trial.suggest_int("num_boost_round", 50, 2000)
248
+ params["early_stopping_rounds"] = trial.suggest_int("early_stopping_rounds", 20, 200)
249
+
250
+ model, p_tr, p_va = train_xgb_reg(Xtr, ytr, Xva, yva, params.copy())
251
+
252
+ elif model_name == "enet_gpu":
253
+ params = {
254
+ "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
255
+ "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
256
+ "max_iter": trial.suggest_int("max_iter", 1000, 20000),
257
+ "tol": trial.suggest_float("tol", 1e-6, 1e-2, log=True),
258
+ "selection": trial.suggest_categorical("selection", ["cyclic", "random"]),
259
+ }
260
+ model, p_tr, p_va = train_cuml_elasticnet_reg(Xtr, ytr, Xva, yva, params)
261
+
262
+ elif model_name == "svr":
263
+ params = {
264
+ "kernel": trial.suggest_categorical("kernel", ["rbf", "linear", "poly", "sigmoid"]),
265
+ "C": trial.suggest_float("C", 1e-3, 1e3, log=True),
266
+ "epsilon": trial.suggest_float("epsilon", 1e-4, 1.0, log=True),
267
+ }
268
+ if params["kernel"] in ["rbf", "poly", "sigmoid"]:
269
+ params["gamma"] = trial.suggest_float("gamma", 1e-6, 10.0, log=True)
270
+ else:
271
+ params["gamma"] = "scale"
272
+
273
+ model, p_tr, p_va = train_svr_reg(Xtr, ytr, Xva, yva, params)
274
+
275
+ else:
276
+ raise ValueError(f"Unknown model_name={model_name}")
277
+
278
+ metrics = eval_regression(yva, p_va)
279
+ trial.set_user_attr("spearman_rho", metrics["spearman_rho"])
280
+ trial.set_user_attr("rmse", metrics["rmse"])
281
+ trial.set_user_attr("mae", metrics["mae"])
282
+ trial.set_user_attr("r2", metrics["r2"])
283
+
284
+ # OPTUNA OBJECTIVE = maximize Spearman rho
285
+ return metrics["spearman_rho"]
286
+
287
+ return objective
288
+
289
+
290
+ # -----------------------------
291
+ # Main
292
+ # -----------------------------
293
+ def run_optuna_and_refit(
294
+ dataset_path: str,
295
+ out_dir: str,
296
+ model_name: str,
297
+ n_trials: int = 200,
298
+ standardize_X: bool = True,
299
+ ):
300
+ os.makedirs(out_dir, exist_ok=True)
301
+
302
+ data = load_split_data(dataset_path)
303
+ print(f"[Data] Train: {data.X_train.shape}, Val: {data.X_val.shape}")
304
+
305
+ # Standardize features (SVR + ElasticNet)
306
+ if standardize_X:
307
+ scaler = StandardScaler()
308
+ data.X_train = scaler.fit_transform(data.X_train).astype(np.float32)
309
+ data.X_val = scaler.transform(data.X_val).astype(np.float32)
310
+ joblib.dump(scaler, os.path.join(out_dir, "scaler.joblib"))
311
+ print("[Preprocess] Saved StandardScaler -> scaler.joblib")
312
+
313
+ study = optuna.create_study(
314
+ direction="maximize",
315
+ pruner=optuna.pruners.MedianPruner()
316
+ )
317
+ study.optimize(make_objective(model_name, data), n_trials=n_trials)
318
+
319
+ trials_df = study.trials_dataframe()
320
+ trials_df.to_csv(os.path.join(out_dir, "study_trials.csv"), index=False)
321
+
322
+ best = study.best_trial
323
+ best_params = dict(best.params)
324
+
325
+ best_rho = float(best.user_attrs.get("spearman_rho", best.value))
326
+ best_rmse = float(best.user_attrs.get("rmse", np.nan))
327
+ best_mae = float(best.user_attrs.get("mae", np.nan))
328
+ best_r2 = float(best.user_attrs.get("r2", np.nan))
329
+
330
+ # Refit best model on train
331
+ if model_name == "xgb_reg":
332
+ params = {
333
+ "objective": "reg:squarederror",
334
+ "eval_metric": "rmse",
335
+ "lambda": best_params["lambda"],
336
+ "alpha": best_params["alpha"],
337
+ "gamma": best_params["gamma"],
338
+ "max_depth": best_params["max_depth"],
339
+ "min_child_weight": best_params["min_child_weight"],
340
+ "subsample": best_params["subsample"],
341
+ "colsample_bytree": best_params["colsample_bytree"],
342
+ "learning_rate": best_params["learning_rate"],
343
+ "tree_method": "hist",
344
+ "device": "cuda",
345
+ "num_boost_round": best_params["num_boost_round"],
346
+ "early_stopping_rounds": best_params["early_stopping_rounds"],
347
+ }
348
+ model, p_tr, p_va = train_xgb_reg(
349
+ data.X_train, data.y_train, data.X_val, data.y_val, params
350
+ )
351
+ model_path = os.path.join(out_dir, "best_model.json")
352
+ model.save_model(model_path)
353
+
354
+ elif model_name == "enet_gpu":
355
+ model, p_tr, p_va = train_cuml_elasticnet_reg(
356
+ data.X_train, data.y_train, data.X_val, data.y_val, best_params
357
+ )
358
+ model_path = os.path.join(out_dir, "best_model_cuml_enet.joblib")
359
+ joblib.dump(model, model_path)
360
+
361
+ elif model_name == "svr":
362
+ model, p_tr, p_va = train_svr_reg(
363
+ data.X_train, data.y_train, data.X_val, data.y_val, best_params
364
+ )
365
+ model_path = os.path.join(out_dir, "best_model_svr.joblib")
366
+ joblib.dump(model, model_path)
367
+
368
+ else:
369
+ raise ValueError(model_name)
370
+
371
+ save_predictions_csv(out_dir, "train", data.y_train, p_tr, data.seq_train)
372
+ save_predictions_csv(out_dir, "val", data.y_val, p_va, data.seq_val)
373
+
374
+ plot_regression_diagnostics(out_dir, data.y_val, p_va)
375
+
376
+ summary = [
377
+ "=" * 72,
378
+ f"MODEL: {model_name}",
379
+ f"Best trial: {best.number}",
380
+ f"Val Spearman rho (objective): {best_rho:.6f}",
381
+ f"Val RMSE: {best_rmse:.6f}",
382
+ f"Val MAE: {best_mae:.6f}",
383
+ f"Val R2: {best_r2:.6f}",
384
+ f"Model saved to: {model_path}",
385
+ "Best params:",
386
+ json.dumps(best_params, indent=2),
387
+ "=" * 72,
388
+ ]
389
+ with open(os.path.join(out_dir, "optimization_summary.txt"), "w") as f:
390
+ f.write("\n".join(summary))
391
+ print("\n".join(summary))
392
+
393
+
394
+ if __name__ == "__main__":
395
+ import argparse
396
+ parser = argparse.ArgumentParser()
397
+ parser.add_argument("--dataset_path", type=str, required=True)
398
+ parser.add_argument("--out_dir", type=str, required=True)
399
+ parser.add_argument("--model", type=str, choices=["xgb_reg", "enet_gpu", "svr"], required=True)
400
+ parser.add_argument("--n_trials", type=int, default=200)
401
+ parser.add_argument("--no_standardize", action="store_true", help="Disable StandardScaler on X")
402
+ args = parser.parse_args()
403
+
404
+ run_optuna_and_refit(
405
+ dataset_path=args.dataset_path,
406
+ out_dir=args.out_dir,
407
+ model_name=args.model,
408
+ n_trials=args.n_trials,
409
+ standardize_X=(not args.no_standardize),
410
+ )
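For reference, a minimal inference sketch (not part of the commit) for the `xgb_reg` branch above. It assumes an output directory produced by `run_optuna_and_refit` with standardization enabled and a hypothetical 1280-dimensional pooled feature matrix; only the `scaler.joblib` and `best_model.json` filenames come from the script itself.

    import joblib
    import numpy as np
    import xgboost as xgb

    out_dir = "path/to/xgb_reg_run"                       # hypothetical output directory
    X_new = np.random.rand(4, 1280).astype(np.float32)    # placeholder pooled embeddings

    scaler = joblib.load(f"{out_dir}/scaler.joblib")      # written only when standardize_X=True
    booster = xgb.Booster()
    booster.load_model(f"{out_dir}/best_model.json")
    preds = booster.predict(xgb.DMatrix(scaler.transform(X_new)))
    print(preds)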
training_classifiers/.ipynb_checkpoints/train_nn-checkpoint.py ADDED
@@ -0,0 +1,426 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch.utils.data import DataLoader
4
+ from datasets import load_from_disk, DatasetDict
5
+ from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score
6
+ import torch.nn as nn
7
+ import optuna
8
+ import os
9
+ from typing import Dict, Any, Tuple, Optional
10
+ import matplotlib.pyplot as plt
11
+ from sklearn.metrics import (
12
+ f1_score, roc_auc_score, average_precision_score,
13
+ precision_recall_curve, roc_curve
14
+ )
15
+ import json
16
+ import joblib
17
+ import pandas as pd
18
+ import time
19
+
20
+ def infer_in_dim_from_unpooled_ds(ds) -> int:
21
+ ex = ds[0]
22
+ # ex["embedding"] is (L, H) list/array
23
+ return int(len(ex["embedding"][0]))
24
+
25
+ def load_split(dataset_path):
26
+ ds = load_from_disk(dataset_path)
27
+
28
+ if isinstance(ds, DatasetDict):
29
+ return ds["train"], ds["val"]
30
+
31
+ raise ValueError("Expected DatasetDict with 'train' and 'val' splits")
32
+
33
+ def collate_unpooled(batch):
34
+ # batch: list of dicts
35
+ lengths = [int(x["length"]) for x in batch]
36
+ Lmax = max(lengths)
37
+ H = len(batch[0]["embedding"][0]) # embedding width (e.g. 1280 for ESM2-650M)
38
+
39
+ X = torch.zeros(len(batch), Lmax, H, dtype=torch.float32)
40
+ M = torch.zeros(len(batch), Lmax, dtype=torch.bool)
41
+ y = torch.tensor([x["label"] for x in batch], dtype=torch.float32)
42
+
43
+ for i, x in enumerate(batch):
44
+ emb = torch.tensor(x["embedding"], dtype=torch.float32) # (L, H)
45
+ L = emb.shape[0]
46
+ X[i, :L] = emb
47
+ if "attention_mask" in x:
48
+ m = torch.tensor(x["attention_mask"], dtype=torch.bool)
49
+ M[i, :L] = m[:L]
50
+ else:
51
+ M[i, :L] = True
52
+
53
+ return X, M, y
54
+
55
+ # ======================== Helper functions =========================================
56
+ def save_predictions_csv(
57
+ out_dir: str,
58
+ split_name: str,
59
+ y_true: np.ndarray,
60
+ y_prob: np.ndarray,
61
+ threshold: float,
62
+ sequences: Optional[np.ndarray] = None,
63
+ ):
64
+ os.makedirs(out_dir, exist_ok=True)
65
+ df = pd.DataFrame({
66
+ "y_true": y_true.astype(int),
67
+ "y_prob": y_prob.astype(float),
68
+ "y_pred": (y_prob >= threshold).astype(int),
69
+ })
70
+ if sequences is not None:
71
+ df.insert(0, "sequence", sequences)
72
+ df.to_csv(os.path.join(out_dir, f"{split_name}_predictions.csv"), index=False)
73
+
74
+
75
+ def plot_curves(out_dir: str, y_true: np.ndarray, y_prob: np.ndarray):
76
+ os.makedirs(out_dir, exist_ok=True)
77
+
78
+ # PR
79
+ precision, recall, _ = precision_recall_curve(y_true, y_prob)
80
+ plt.figure()
81
+ plt.plot(recall, precision)
82
+ plt.xlabel("Recall")
83
+ plt.ylabel("Precision")
84
+ plt.title("Precision-Recall Curve")
85
+ plt.tight_layout()
86
+ plt.savefig(os.path.join(out_dir, "pr_curve.png"))
87
+ plt.close()
88
+
89
+ # ROC
90
+ fpr, tpr, _ = roc_curve(y_true, y_prob)
91
+ plt.figure()
92
+ plt.plot(fpr, tpr)
93
+ plt.xlabel("False Positive Rate")
94
+ plt.ylabel("True Positive Rate")
95
+ plt.title("ROC Curve")
96
+ plt.tight_layout()
97
+ plt.savefig(os.path.join(out_dir, "roc_curve.png"))
98
+ plt.close()
99
+
100
+ # ======================== Shared OPTUNA training scheme =========================================
101
+ def best_f1_threshold(y_true, y_prob):
102
+ p, r, thr = precision_recall_curve(y_true, y_prob)
103
+ f1s = (2*p[:-1]*r[:-1])/(p[:-1]+r[:-1]+1e-12)
104
+ i = int(np.nanargmax(f1s))
105
+ return float(thr[i]), float(f1s[i])
106
+
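# Illustration (not part of the commit): with cleanly separated scores, best_f1_threshold
# picks a cutoff between the two classes and reports F1 = 1.0, e.g.
#   thr, f1 = best_f1_threshold(np.array([0, 0, 1, 1]), np.array([0.1, 0.2, 0.8, 0.9]))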
107
+ @torch.no_grad()
108
+ def eval_probs(model, loader, device):
109
+ model.eval()
110
+ ys, ps = [], []
111
+ for X, M, y in loader:
112
+ X, M = X.to(device), M.to(device)
113
+ logits = model(X, M)
114
+ prob = torch.sigmoid(logits).detach().cpu().numpy()
115
+ ys.append(y.numpy())
116
+ ps.append(prob)
117
+ return np.concatenate(ys), np.concatenate(ps)
118
+
119
+ def train_one_epoch(model, loader, optim, criterion, device):
120
+ model.train()
121
+ for X, M, y in loader:
122
+ X, M, y = X.to(device), M.to(device), y.to(device)
123
+ optim.zero_grad(set_to_none=True)
124
+ logits = model(X, M)
125
+ loss = criterion(logits, y)
126
+ loss.backward()
127
+ optim.step()
128
+
129
+ # ======================== MLP =========================================
130
+ # The MLP takes one vector per sequence, so masked mean-pool over the length dimension first
131
+ class MaskedMeanPool(nn.Module):
132
+ def forward(self, X, M): # X: (B,L,H), M: (B,L)
133
+ Mf = M.unsqueeze(-1).float()
134
+ denom = Mf.sum(dim=1).clamp(min=1.0)
135
+ return (X * Mf).sum(dim=1) / denom # (B,H)
136
+
137
+ class MLPClassifier(nn.Module):
138
+ def __init__(self, in_dim, hidden=512, dropout=0.1):
139
+ super().__init__()
140
+ self.pool = MaskedMeanPool()
141
+ self.net = nn.Sequential(
142
+ nn.Linear(in_dim, hidden),
143
+ nn.GELU(),
144
+ nn.Dropout(dropout),
145
+ nn.Linear(hidden, 1),
146
+ )
147
+ def forward(self, X, M):
148
+ z = self.pool(X, M)
149
+ return self.net(z).squeeze(-1) # logits
150
+
151
+ # ======================== CNN =========================================
152
+ # Treat the embedding dimensions (e.g. 1280) as Conv1d input channels
153
+ class CNNClassifier(nn.Module):
154
+ def __init__(self, in_ch, c=256, k=5, layers=2, dropout=0.1):
155
+ super().__init__()
156
+ blocks = []
157
+ ch = in_ch
158
+ for _ in range(layers):
159
+ blocks += [
160
+ nn.Conv1d(ch, c, kernel_size=k, padding=k//2),
161
+ nn.GELU(),
162
+ nn.Dropout(dropout),
163
+ ]
164
+ ch = c
165
+ self.conv = nn.Sequential(*blocks)
166
+ self.head = nn.Linear(c, 1)
167
+
168
+ def forward(self, X, M):
169
+ # X: (B,L,H) -> (B,H,L)
170
+ Xc = X.transpose(1, 2)
171
+ Y = self.conv(Xc).transpose(1, 2) # (B,L,C)
172
+
173
+ # masked mean pool over L
174
+ Mf = M.unsqueeze(-1).float()
175
+ denom = Mf.sum(dim=1).clamp(min=1.0)
176
+ pooled = (Y * Mf).sum(dim=1) / denom # (B,C)
177
+ return self.head(pooled).squeeze(-1)
178
+
179
+ # ========================== Transformer ====================================
180
+ class TransformerClassifier(nn.Module):
181
+ def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
182
+ super().__init__()
183
+ self.proj = nn.Linear(in_dim, d_model)
184
+ enc_layer = nn.TransformerEncoderLayer(
185
+ d_model=d_model, nhead=nhead, dim_feedforward=ff,
186
+ dropout=dropout, batch_first=True, activation="gelu"
187
+ )
188
+ self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
189
+ self.head = nn.Linear(d_model, 1)
190
+
191
+ def forward(self, X, M):
192
+ # src_key_padding_mask: True = pad positions
193
+ pad_mask = ~M
194
+ Z = self.proj(X) # (B,L,d)
195
+ Z = self.enc(Z, src_key_padding_mask=pad_mask) # (B,L,d)
196
+
197
+ Mf = M.unsqueeze(-1).float()
198
+ denom = Mf.sum(dim=1).clamp(min=1.0)
199
+ pooled = (Z * Mf).sum(dim=1) / denom
200
+ return self.head(pooled).squeeze(-1)
201
+
202
+ # ========================== OPTUNA ====================================
203
+
204
+ def objective_nn(trial, model_name, train_ds, val_ds, device="cuda:0"):
205
+ # hyperparams shared
206
+ lr = trial.suggest_float("lr", 1e-5, 3e-3, log=True)
207
+ wd = trial.suggest_float("weight_decay", 1e-8, 1e-2, log=True)
208
+ dropout = trial.suggest_float("dropout", 0.0, 0.5)
209
+ batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
210
+
211
+ train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
212
+ collate_fn=collate_unpooled, num_workers=4, pin_memory=True)
213
+ val_loader = DataLoader(val_ds, batch_size=64, shuffle=False,
214
+ collate_fn=collate_unpooled, num_workers=4, pin_memory=True)
215
+
216
+ in_dim = infer_in_dim_from_unpooled_ds(train_ds)
217
+
218
+ if model_name == "mlp":
219
+ hidden = trial.suggest_categorical("hidden", [256, 512, 1024, 2048])
220
+ model = MLPClassifier(in_dim=in_dim, hidden=hidden, dropout=dropout)
221
+ elif model_name == "cnn":
222
+ c = trial.suggest_categorical("channels", [128, 256, 512])
223
+ k = trial.suggest_categorical("kernel", [3, 5, 7])
224
+ layers = trial.suggest_int("layers", 1, 4)
225
+ model = CNNClassifier(in_ch=in_dim, c=c, k=k, layers=layers, dropout=dropout)
226
+ elif model_name == "transformer":
227
+ d = trial.suggest_categorical("d_model", [128, 256, 384])
228
+ nhead = trial.suggest_categorical("nhead", [4, 8])
229
+ layers = trial.suggest_int("layers", 1, 4)
230
+ ff = trial.suggest_categorical("ff", [256, 512, 1024, 1536])
231
+ model = TransformerClassifier(in_dim=in_dim, d_model=d, nhead=nhead, layers=layers, ff=ff, dropout=dropout)
232
+ else:
233
+ raise ValueError(model_name)
234
+
235
+ model = model.to(device)
236
+
237
+ # class imbalance handling
238
+ ytr = np.asarray(train_ds["label"], dtype=np.int64)
239
+ pos = ytr.sum()
240
+ neg = len(ytr) - pos
241
+ pos_weight = torch.tensor([neg / max(pos, 1)], device=device, dtype=torch.float32)
242
+ criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
243
+
244
+ optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
245
+
246
+ best_f1 = -1.0
247
+ patience = 8
248
+ bad = 0
249
+
250
+ for epoch in range(1, 51):
251
+ train_one_epoch(model, train_loader, optim, criterion, device)
252
+
253
+ y_true, y_prob = eval_probs(model, val_loader, device)
254
+ auc = roc_auc_score(y_true, y_prob)
255
+
256
+ thr, f1 = best_f1_threshold(y_true, y_prob)
257
+
258
+ trial.set_user_attr("val_auc", float(auc))
259
+ trial.set_user_attr("val_f1", float(f1))
260
+ trial.set_user_attr("val_thr", float(thr))
261
+
262
+ # prune
263
+ trial.report(f1, epoch)
264
+ if trial.should_prune():
265
+ raise optuna.TrialPruned()
266
+
267
+ if f1 > best_f1 + 1e-4:
268
+ best_f1 = f1
269
+ bad = 0
270
+ else:
271
+ bad += 1
272
+ if bad >= patience:
273
+ break
274
+
275
+ return best_f1
276
+
277
+ def run_optuna_and_refit_nn(dataset_path: str, out_dir: str, model_name: str, n_trials: int = 50, device="cuda:0"):
278
+ os.makedirs(out_dir, exist_ok=True)
279
+
280
+ train_ds, val_ds = load_split(dataset_path)
281
+ print(f"[Data] Train: {len(train_ds)}, Val: {len(val_ds)}")
282
+
283
+ study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
284
+ study.optimize(lambda trial: objective_nn(trial, model_name, train_ds, val_ds, device=device), n_trials=n_trials)
285
+
286
+ trials_df = study.trials_dataframe()
287
+ trials_df.to_csv(os.path.join(out_dir, "study_trials.csv"), index=False)
288
+
289
+ best = study.best_trial
290
+ best_params = dict(best.params)
291
+ best_f1_optuna = float(best.value)
292
+ best_auc_optuna = float(best.user_attrs.get("val_auc", np.nan))
293
+ best_thr = float(best.user_attrs.get("val_thr", 0.5))
294
+
295
+ in_dim = infer_in_dim_from_unpooled_ds(train_ds)
296
+
297
+ # --- Refit best model ---
298
+ batch_size = int(best_params.get("batch_size", 32))
299
+ train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
300
+ collate_fn=collate_unpooled, num_workers=4, pin_memory=True)
301
+ val_loader = DataLoader(val_ds, batch_size=64, shuffle=False,
302
+ collate_fn=collate_unpooled, num_workers=4, pin_memory=True)
303
+
304
+ # Rebuild
305
+ dropout = float(best_params.get("dropout", 0.1))
306
+ if model_name == "mlp":
307
+ model = MLPClassifier(
308
+ in_dim=in_dim,
309
+ hidden=int(best_params["hidden"]),
310
+ dropout=dropout,
311
+ )
312
+
313
+ elif model_name == "cnn":
314
+ model = CNNClassifier(
315
+ in_ch=in_dim,
316
+ c=int(best_params["channels"]),
317
+ k=int(best_params["kernel"]),
318
+ layers=int(best_params["layers"]),
319
+ dropout=dropout,
320
+ )
321
+
322
+ elif model_name == "transformer":
323
+ model = TransformerClassifier(
324
+ in_dim=in_dim,
325
+ d_model=int(best_params["d_model"]),
326
+ nhead=int(best_params["nhead"]),
327
+ layers=int(best_params["layers"]),
328
+ ff=int(best_params["ff"]),
329
+ dropout=dropout,
330
+ )
331
+ else:
332
+ raise ValueError(model_name)
333
+
334
+ model = model.to(device)
335
+
336
+ # loss + optimizer
337
+ ytr = np.asarray(train_ds["label"], dtype=np.int64)
338
+ pos = ytr.sum()
339
+ neg = len(ytr) - pos
340
+ pos_weight = torch.tensor([neg / max(pos, 1)], device=device, dtype=torch.float32)
341
+ criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
342
+
343
+ lr = float(best_params["lr"])
344
+ wd = float(best_params["weight_decay"])
345
+ optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
346
+
347
+ # train longer with early stopping on AUC
348
+ best_f1_seen, bad, patience = -1.0, 0, 12
349
+ best_state = None
350
+ best_thr_seen = 0.5
351
+ best_auc_seen = -1.0
352
+
353
+ for epoch in range(1, 151):
354
+ train_one_epoch(model, train_loader, optim, criterion, device)
355
+
356
+ y_true, y_prob = eval_probs(model, val_loader, device)
357
+ auc = roc_auc_score(y_true, y_prob)
358
+ thr, f1 = best_f1_threshold(y_true, y_prob)
359
+
360
+ if f1 > best_f1_seen + 1e-4:
361
+ best_f1_seen = f1
362
+ best_thr_seen = thr
363
+ best_auc_seen = auc
364
+ bad = 0
365
+ best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
366
+ else:
367
+ bad += 1
368
+ if bad >= patience:
369
+ break
370
+
371
+ if best_state is not None:
372
+ model.load_state_dict(best_state)
373
+
374
+ # final preds + threshold picked on val
375
+ y_true_val, y_prob_val = eval_probs(model, val_loader, device)
376
+ best_thr_final, best_f1_final = best_f1_threshold(y_true_val, y_prob_val)
377
+
378
+ # save model
379
+ model_path = os.path.join(out_dir, "best_model.pt")
380
+ torch.save({"state_dict": model.state_dict(), "best_params": best_params}, model_path)
381
+
382
+ # train preds
383
+ y_true_tr, y_prob_tr = eval_probs(model, DataLoader(train_ds, batch_size=64, shuffle=False,
384
+ collate_fn=collate_unpooled, num_workers=4, pin_memory=True), device)
385
+
386
+ save_predictions_csv(out_dir, "train", y_true_tr, y_prob_tr, best_thr_final,
387
+ sequences=np.asarray(train_ds["sequence"]) if "sequence" in train_ds.column_names else None)
388
+ save_predictions_csv(out_dir, "val", y_true_val, y_prob_val, best_thr_final,
389
+ sequences=np.asarray(val_ds["sequence"]) if "sequence" in val_ds.column_names else None)
390
+
391
+ plot_curves(out_dir, y_true_val, y_prob_val)
392
+
393
+ summary = [
394
+ "=" * 72,
395
+ f"MODEL: {model_name}",
396
+
397
+ # Optuna results (objective = F1)
398
+ f"Best Optuna F1 (objective): {best_f1_optuna:.4f}",
399
+ f"Best Optuna AUC (val, recorded): {best_auc_optuna:.4f}",
400
+ f"Best Optuna threshold (val): {best_thr:.4f}",
401
+
402
+ # Refit results
403
+ f"Refit best AUC (val): {best_auc_seen:.4f}",
404
+ f"Refit best F1@thr (val): {best_f1_final:.4f} at thr={best_thr_final:.4f}",
405
+
406
+ "Best params:",
407
+ json.dumps(best_params, indent=2),
408
+ f"Saved model: {model_path}",
409
+ "=" * 72,
410
+ ]
411
+
412
+ with open(os.path.join(out_dir, "optimization_summary.txt"), "w") as f:
413
+ f.write("\n".join(summary))
414
+ print("\n".join(summary))
415
+
416
+ if __name__ == "__main__":
417
+ import argparse
418
+ parser = argparse.ArgumentParser()
419
+ parser.add_argument("--dataset_path", type=str, required=True)
420
+ parser.add_argument("--out_dir", type=str, required=True)
421
+ parser.add_argument("--model", type=str, choices=["mlp", "cnn", "transformer"], required=True)
422
+ parser.add_argument("--n_trials", type=int, default=50)
423
+ args = parser.parse_args()
424
+
425
+ if args.model in ["mlp", "cnn", "transformer"]:
426
+ run_optuna_and_refit_nn(args.dataset_path, args.out_dir, args.model, args.n_trials, device="cuda:0")
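A minimal restore-and-predict sketch (not part of the commit), assuming an `mlp` run and that `MLPClassifier` from this script is in scope; `in_dim=1280` and the checkpoint path are assumptions, since the classifier checkpoint stores only the state dict and the tuned hyperparameters.

    import torch

    ckpt = torch.load("out_dir/best_model.pt", map_location="cpu")   # hypothetical path
    params = ckpt["best_params"]
    model = MLPClassifier(in_dim=1280,                               # assumed embedding width
                          hidden=int(params["hidden"]),
                          dropout=float(params["dropout"]))
    model.load_state_dict(ckpt["state_dict"])
    model.eval()

    X = torch.randn(2, 40, 1280)              # (batch, max_len, hidden) padded embeddings
    M = torch.ones(2, 40, dtype=torch.bool)   # attention mask, True = real token
    with torch.no_grad():
        probs = torch.sigmoid(model(X, M))    # the model itself returns logits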
training_classifiers/.ipynb_checkpoints/train_nn_regression-checkpoint.py ADDED
@@ -0,0 +1,420 @@
1
+ import os, json, time
2
+ import numpy as np
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.utils.data import DataLoader
9
+ from datasets import load_from_disk, DatasetDict
10
+ import optuna
11
+ from dataclasses import dataclass
12
+ from typing import Dict, Any, Tuple, Optional
13
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
14
+ from scipy.stats import spearmanr
15
+ from torch.cuda.amp import autocast, GradScaler
17
+ scaler = GradScaler(enabled=torch.cuda.is_available())
18
+ from lightning.pytorch import seed_everything
19
+ seed_everything(1986)
20
+
21
+
22
+ def load_split(dataset_path):
23
+ ds = load_from_disk(dataset_path)
24
+ if isinstance(ds, DatasetDict):
25
+ return ds["train"], ds["val"]
26
+ raise ValueError("Expected DatasetDict with 'train' and 'val' splits")
27
+
28
+ def collate_unpooled_reg(batch):
29
+ lengths = [int(x["length"]) for x in batch]
30
+ Lmax = max(lengths)
31
+ H = len(batch[0]["embedding"][0])
32
+
33
+ X = torch.zeros(len(batch), Lmax, H, dtype=torch.float32)
34
+ M = torch.zeros(len(batch), Lmax, dtype=torch.bool)
35
+ y = torch.tensor([float(x["label"]) for x in batch], dtype=torch.float32)
36
+
37
+ for i, x in enumerate(batch):
38
+ emb = torch.tensor(x["embedding"], dtype=torch.float32) # (L,H)
39
+ L = emb.shape[0]
40
+ X[i, :L] = emb
41
+ if "attention_mask" in x:
42
+ m = torch.tensor(x["attention_mask"], dtype=torch.bool)
43
+ M[i, :L] = m[:L]
44
+ else:
45
+ M[i, :L] = True
46
+ return X, M, y
47
+
48
+ def infer_in_dim(ds) -> int:
49
+ ex = ds[0]
50
+ return int(len(ex["embedding"][0]))
51
+
52
+ # ============================
53
+ # Metrics
54
+ # ============================
55
+ def safe_spearmanr(y_true: np.ndarray, y_pred: np.ndarray) -> float:
56
+ rho = spearmanr(y_true, y_pred).correlation
57
+ if rho is None or np.isnan(rho):
58
+ return 0.0
59
+ return float(rho)
60
+
61
+ def eval_regression(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
62
+ # ---- RMSE ----
63
+ try:
64
+ from sklearn.metrics import root_mean_squared_error
65
+ rmse = root_mean_squared_error(y_true, y_pred)
66
+ except Exception:
67
+ mse = mean_squared_error(y_true, y_pred)
68
+ rmse = float(np.sqrt(mse))
69
+
70
+ mae = float(mean_absolute_error(y_true, y_pred))
71
+ r2 = float(r2_score(y_true, y_pred))
72
+ rho = float(safe_spearmanr(y_true, y_pred))
73
+ return {"rmse": float(rmse), "mae": mae, "r2": r2, "spearman_rho": rho}
74
+
75
+
76
+ # ============================
77
+ # Models
78
+ # ============================
79
+ class MaskedMeanPool(nn.Module):
80
+ def forward(self, X, M):
81
+ Mf = M.unsqueeze(-1).float()
82
+ denom = Mf.sum(dim=1).clamp(min=1.0)
83
+ return (X * Mf).sum(dim=1) / denom
84
+
85
+ class MLPRegressor(nn.Module):
86
+ def __init__(self, in_dim, hidden=512, dropout=0.1):
87
+ super().__init__()
88
+ self.pool = MaskedMeanPool()
89
+ self.net = nn.Sequential(
90
+ nn.Linear(in_dim, hidden),
91
+ nn.GELU(),
92
+ nn.Dropout(dropout),
93
+ nn.Linear(hidden, 1),
94
+ )
95
+ def forward(self, X, M):
96
+ z = self.pool(X, M)
97
+ return self.net(z).squeeze(-1) # y_pred
98
+
99
+ class CNNRegressor(nn.Module):
100
+ def __init__(self, in_ch, c=256, k=5, layers=2, dropout=0.1):
101
+ super().__init__()
102
+ blocks = []
103
+ ch = in_ch
104
+ for _ in range(layers):
105
+ blocks += [
106
+ nn.Conv1d(ch, c, kernel_size=k, padding=k//2),
107
+ nn.GELU(),
108
+ nn.Dropout(dropout),
109
+ ]
110
+ ch = c
111
+ self.conv = nn.Sequential(*blocks)
112
+ self.head = nn.Linear(c, 1)
113
+
114
+ def forward(self, X, M):
115
+ Xc = X.transpose(1, 2) # (B,H,L)
116
+ Y = self.conv(Xc).transpose(1, 2) # (B,L,C)
117
+ Mf = M.unsqueeze(-1).float()
118
+ denom = Mf.sum(dim=1).clamp(min=1.0)
119
+ pooled = (Y * Mf).sum(dim=1) / denom # (B,C)
120
+ return self.head(pooled).squeeze(-1)
121
+
122
+ class TransformerRegressor(nn.Module):
123
+ def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
124
+ super().__init__()
125
+ self.proj = nn.Linear(in_dim, d_model)
126
+ enc_layer = nn.TransformerEncoderLayer(
127
+ d_model=d_model, nhead=nhead, dim_feedforward=ff,
128
+ dropout=dropout, batch_first=True, activation="gelu"
129
+ )
130
+ self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
131
+ self.head = nn.Linear(d_model, 1)
132
+
133
+ def forward(self, X, M):
134
+ pad_mask = ~M
135
+ Z = self.proj(X)
136
+ Z = self.enc(Z, src_key_padding_mask=pad_mask)
137
+ Mf = M.unsqueeze(-1).float()
138
+ denom = Mf.sum(dim=1).clamp(min=1.0)
139
+ pooled = (Z * Mf).sum(dim=1) / denom
140
+ return self.head(pooled).squeeze(-1)
141
+
142
+ # ============================
143
+ # Train / eval
144
+ # ============================
145
+ @torch.no_grad()
146
+ def eval_preds(model, loader, device):
147
+ model.eval()
148
+ ys, ps = [], []
149
+ for X, M, y in loader:
150
+ X, M = X.to(device), M.to(device)
151
+ pred = model(X, M).detach().cpu().numpy()
152
+ ys.append(y.numpy())
153
+ ps.append(pred)
154
+ return np.concatenate(ys), np.concatenate(ps)
155
+
156
+ def train_one_epoch_reg(model, loader, optim, criterion, device):
157
+ model.train()
158
+ for X, M, y in loader:
159
+ X, M, y = X.to(device), M.to(device), y.to(device)
160
+ optim.zero_grad(set_to_none=True)
161
+ with autocast(enabled=torch.cuda.is_available()):
162
+ pred = model(X, M)
163
+ loss = criterion(pred, y)
164
+ scaler.scale(loss).backward()
165
+ scaler.step(optim)
166
+ scaler.update()
167
+
168
+ # ============================
169
+ # Saving + plots
170
+ # ============================
171
+ def save_predictions_csv(out_dir, split_name, y_true, y_pred, sequences=None):
172
+ os.makedirs(out_dir, exist_ok=True)
173
+ df = pd.DataFrame({
174
+ "y_true": y_true.astype(float),
175
+ "y_pred": y_pred.astype(float),
176
+ "residual": (y_true - y_pred).astype(float),
177
+ })
178
+ if sequences is not None:
179
+ df.insert(0, "sequence", sequences)
180
+ df.to_csv(os.path.join(out_dir, f"{split_name}_predictions.csv"), index=False)
181
+
182
+ def plot_regression_diagnostics(out_dir, y_true, y_pred):
183
+ os.makedirs(out_dir, exist_ok=True)
184
+
185
+ plt.figure()
186
+ plt.scatter(y_true, y_pred, s=8, alpha=0.5)
187
+ plt.xlabel("y_true"); plt.ylabel("y_pred")
188
+ plt.title("Predicted vs True")
189
+ plt.tight_layout()
190
+ plt.savefig(os.path.join(out_dir, "pred_vs_true.png"))
191
+ plt.close()
192
+
193
+ resid = y_true - y_pred
194
+ plt.figure()
195
+ plt.hist(resid, bins=50)
196
+ plt.xlabel("residual (y_true - y_pred)"); plt.ylabel("count")
197
+ plt.title("Residual Histogram")
198
+ plt.tight_layout()
199
+ plt.savefig(os.path.join(out_dir, "residual_hist.png"))
200
+ plt.close()
201
+
202
+ plt.figure()
203
+ plt.scatter(y_pred, resid, s=8, alpha=0.5)
204
+ plt.xlabel("y_pred"); plt.ylabel("residual")
205
+ plt.title("Residuals vs Prediction")
206
+ plt.tight_layout()
207
+ plt.savefig(os.path.join(out_dir, "residual_vs_pred.png"))
208
+ plt.close()
209
+
210
+ # ============================
211
+ # Optuna objective
212
+ # ============================
213
+ def score_from_metrics(metrics: Dict[str, float], objective: str) -> float:
214
+ if objective == "spearman":
215
+ return metrics["spearman_rho"]
216
+ if objective == "r2":
217
+ return metrics["r2"]
218
+ if objective == "neg_rmse":
219
+ return -metrics["rmse"]
220
+ raise ValueError(f"Unknown objective={objective}")
221
+
222
+ def objective_nn_reg(trial, model_name, train_ds, val_ds, device="cuda:0", objective="spearman"):
223
+ lr = trial.suggest_float("lr", 1e-5, 3e-3, log=True)
224
+ wd = trial.suggest_float("weight_decay", 1e-10, 1e-2, log=True)
225
+ dropout = trial.suggest_float("dropout", 0.0, 0.5)
226
+ batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
227
+
228
+ in_dim = infer_in_dim(train_ds)
229
+
230
+ train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
231
+ collate_fn=collate_unpooled_reg, num_workers=4, pin_memory=True)
232
+ val_loader = DataLoader(val_ds, batch_size=64, shuffle=False,
233
+ collate_fn=collate_unpooled_reg, num_workers=4, pin_memory=True)
234
+
235
+ if model_name == "mlp":
236
+ hidden = trial.suggest_categorical("hidden", [256, 512, 1024, 2048])
237
+ model = MLPRegressor(in_dim=in_dim, hidden=hidden, dropout=dropout)
238
+ elif model_name == "cnn":
239
+ c = trial.suggest_categorical("channels", [128, 256, 512])
240
+ k = trial.suggest_categorical("kernel", [3, 5, 7])
241
+ layers = trial.suggest_int("layers", 1, 4)
242
+ model = CNNRegressor(in_ch=in_dim, c=c, k=k, layers=layers, dropout=dropout)
243
+ elif model_name == "transformer":
244
+ d = trial.suggest_categorical("d_model", [128, 256, 384])
245
+ nhead = trial.suggest_categorical("nhead", [4, 8])
246
+ layers = trial.suggest_int("layers", 1, 4)
247
+ ff = trial.suggest_categorical("ff", [256, 512, 1024, 1536])
248
+ model = TransformerRegressor(in_dim=in_dim, d_model=d, nhead=nhead, layers=layers, ff=ff, dropout=dropout)
249
+ else:
250
+ raise ValueError(model_name)
251
+
252
+ model = model.to(device)
253
+
254
+ loss_name = trial.suggest_categorical("loss", ["mse", "huber"])
255
+ if loss_name == "mse":
256
+ criterion = nn.MSELoss()
257
+ else:
258
+ delta = trial.suggest_float("huber_delta", 0.5, 5.0, log=True)
259
+ criterion = nn.HuberLoss(delta=delta)
260
+
261
+ optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
262
+
263
+ best_score = -1e18
264
+ patience = 10
265
+ bad = 0
266
+
267
+ for epoch in range(1, 61):
268
+ train_one_epoch_reg(model, train_loader, optim, criterion, device)
269
+
270
+ y_true, y_pred = eval_preds(model, val_loader, device)
271
+ metrics = eval_regression(y_true, y_pred)
272
+ score = score_from_metrics(metrics, objective)
273
+
274
+ # log attrs
275
+ for k, v in metrics.items():
276
+ trial.set_user_attr(f"val_{k}", float(v))
277
+
278
+ trial.report(score, epoch)
279
+ if trial.should_prune():
280
+ raise optuna.TrialPruned()
281
+
282
+ if score > best_score + 1e-6:
283
+ best_score = score
284
+ bad = 0
285
+ else:
286
+ bad += 1
287
+ if bad >= patience:
288
+ break
289
+
290
+ return float(best_score)
291
+
292
+ # ============================
293
+ # Main runner
294
+ # ============================
295
+ def run_optuna_and_refit_nn_reg(dataset_path, out_dir, model_name, n_trials=80, device="cuda:0",
296
+ objective="spearman"):
297
+ os.makedirs(out_dir, exist_ok=True)
298
+
299
+ train_ds, val_ds = load_split(dataset_path)
300
+ print(f"[Data] Train: {len(train_ds)}, Val: {len(val_ds)}")
301
+
302
+ study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
303
+ study.optimize(lambda t: objective_nn_reg(t, model_name, train_ds, val_ds, device=device, objective=objective),
304
+ n_trials=n_trials)
305
+
306
+ trials_df = study.trials_dataframe()
307
+ trials_df.to_csv(os.path.join(out_dir, "study_trials.csv"), index=False)
308
+
309
+ best = study.best_trial
310
+ best_params = dict(best.params)
311
+
312
+ # rebuild model from best params
313
+ in_dim = infer_in_dim(train_ds)
314
+ dropout = float(best_params.get("dropout", 0.1))
315
+ if model_name == "mlp":
316
+ model = MLPRegressor(in_dim=in_dim, hidden=int(best_params["hidden"]), dropout=dropout)
317
+ elif model_name == "cnn":
318
+ model = CNNRegressor(in_ch=in_dim, c=int(best_params["channels"]),
319
+ k=int(best_params["kernel"]), layers=int(best_params["layers"]),
320
+ dropout=dropout)
321
+ elif model_name == "transformer":
322
+ model = TransformerRegressor(in_dim=in_dim, d_model=int(best_params["d_model"]),
323
+ nhead=int(best_params["nhead"]), layers=int(best_params["layers"]),
324
+ ff=int(best_params["ff"]), dropout=dropout)
325
+ else:
326
+ raise ValueError(model_name)
327
+
328
+ model = model.to(device)
329
+
330
+ batch_size = int(best_params.get("batch_size", 32))
331
+ train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
332
+ collate_fn=collate_unpooled_reg, num_workers=4, pin_memory=True)
333
+ val_loader = DataLoader(val_ds, batch_size=64, shuffle=False,
334
+ collate_fn=collate_unpooled_reg, num_workers=4, pin_memory=True)
335
+
336
+ # loss
337
+ if best_params.get("loss", "mse") == "mse":
338
+ criterion = nn.MSELoss()
339
+ else:
340
+ criterion = nn.HuberLoss(delta=float(best_params["huber_delta"]))
341
+
342
+ optim = torch.optim.AdamW(model.parameters(), lr=float(best_params["lr"]),
343
+ weight_decay=float(best_params["weight_decay"]))
344
+
345
+ # refit longer with early stopping on the SAME objective
346
+ best_score, bad, patience = -1e18, 0, 15
347
+ best_state = None
348
+
349
+ for epoch in range(1, 201):
350
+ train_one_epoch_reg(model, train_loader, optim, criterion, device)
351
+
352
+ y_true, y_pred = eval_preds(model, val_loader, device)
353
+ metrics = eval_regression(y_true, y_pred)
354
+ score = score_from_metrics(metrics, objective)
355
+
356
+ if score > best_score + 1e-6:
357
+ best_score = score
358
+ bad = 0
359
+ best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
360
+ best_metrics = metrics
361
+ else:
362
+ bad += 1
363
+ if bad >= patience:
364
+ break
365
+
366
+ if best_state is not None:
367
+ model.load_state_dict(best_state)
368
+
369
+ # preds
370
+ y_true_tr, y_pred_tr = eval_preds(model, DataLoader(train_ds, batch_size=64, shuffle=False,
371
+ collate_fn=collate_unpooled_reg, num_workers=4, pin_memory=True), device)
372
+ y_true_va, y_pred_va = eval_preds(model, val_loader, device)
373
+
374
+ seq_train = np.asarray(train_ds["sequence"]) if "sequence" in train_ds.column_names else None
375
+ seq_val = np.asarray(val_ds["sequence"]) if "sequence" in val_ds.column_names else None
376
+ save_predictions_csv(out_dir, "train", y_true_tr, y_pred_tr, seq_train)
377
+ save_predictions_csv(out_dir, "val", y_true_va, y_pred_va, seq_val)
378
+ plot_regression_diagnostics(out_dir, y_true_va, y_pred_va)
379
+
380
+ # save model
381
+ model_path = os.path.join(out_dir, "best_model.pt")
382
+ torch.save({"state_dict": model.state_dict(), "best_params": best_params, "in_dim": in_dim}, model_path)
383
+
384
+ summary = [
385
+ "=" * 72,
386
+ f"MODEL: {model_name}",
387
+ f"OPTUNA objective: {objective} (direction=maximize)",
388
+ f"Best trial: {best.number}",
389
+ "Best val metrics:",
390
+ json.dumps({k: float(v) for k, v in best_metrics.items()}, indent=2),
391
+ f"Saved model: {model_path}",
392
+ "Best params:",
393
+ json.dumps(best_params, indent=2),
394
+ "=" * 72,
395
+ ]
396
+ with open(os.path.join(out_dir, "optimization_summary.txt"), "w") as f:
397
+ f.write("\n".join(summary))
398
+ print("\n".join(summary))
399
+
400
+
401
+ if __name__ == "__main__":
402
+ import argparse
403
+ parser = argparse.ArgumentParser()
404
+ parser.add_argument("--dataset_path", type=str, required=True)
405
+ parser.add_argument("--out_dir", type=str, required=True)
406
+ parser.add_argument("--model", type=str, choices=["mlp","cnn","transformer"], required=True)
407
+ parser.add_argument("--n_trials", type=int, default=80)
408
+ parser.add_argument("--objective", type=str, default="spearman",
409
+ choices=["spearman","neg_rmse","r2"])
410
+ parser.add_argument("--device", type=str, default="cuda:0")
411
+ args = parser.parse_args()
412
+
413
+ run_optuna_and_refit_nn_reg(
414
+ dataset_path=args.dataset_path,
415
+ out_dir=args.out_dir,
416
+ model_name=args.model,
417
+ n_trials=args.n_trials,
418
+ device=args.device,
419
+ objective=args.objective,
420
+ )
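A toy check of the metric plumbing above (illustrative numbers only; assumes `eval_regression` and `score_from_metrics` from this script are in scope):

    import numpy as np

    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    y_pred = np.array([1.1, 1.9, 3.2, 3.8])
    m = eval_regression(y_true, y_pred)           # rmse / mae / r2 / spearman_rho
    print(m, score_from_metrics(m, "neg_rmse"))   # neg_rmse is negated so "maximize" still applies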
training_data_cleaned/data_split.ipynb → training_classifiers/binding_affinity/val_smiles_pooled.csv RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:981339bf1a6594e42a722a42993c238512c3ac572344f68b810f561d4b7b7757
3
- size 228787
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5410a45a7b65def6cfb94c167b07537abd33b5aac4ecdffe162b7ce4e9bc3d19
3
+ size 36525
training_data_cleaned/nf_smiles_train.csv → training_classifiers/binding_affinity/val_smiles_unpooled.csv RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f08b8d9b77fef6da407a6e22765201d8eaf1cff6ae7f0da5d8da261baf64f86
3
- size 2069832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdf71fbb3e7b3b8e8dbfe4ed45b32a2da0049df851f09ee32564825f626cb86c
3
+ size 37187
training_data_cleaned/smiles_data_split.ipynb → training_classifiers/binding_affinity/val_wt_pooled.csv RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83d55a03c6934dc9ee64f7dbe76d2cf8e042be84b00f8e8bb1c92e2bc6da0c3f
3
- size 2300353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b194e7b2b97258320323021b3ffe6143133070212a0215ade22fa91b87a3a861
3
+ size 33224
training_data_cleaned/nf_smiles_val.csv → training_classifiers/binding_affinity/val_wt_unpooled.csv RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a2cf82b6cc31686eff6a7931de34ee0975defc460e470e183b208c0513e5f3b
3
- size 55387144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:051325790047e749fbf1daf7bf25a08178297b0c37acaf9439816d09f2b6c1e3
3
+ size 33826
training_classifiers/binding_affinity/wt_smiles_pooled/best_model.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12f956a7bf04ed602c11fd275377afa73f3f0af1982dbe06c607d8ada304b01c
3
+ size 21617397
training_classifiers/binding_affinity/wt_smiles_pooled/best_params.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "lr": 0.00011987622631192274,
3
+ "weight_decay": 5.279397670067118e-05,
4
+ "dropout": 0.06773313718640918,
5
+ "hidden_dim": 256,
6
+ "n_heads": 8,
7
+ "n_layers": 3,
8
+ "cls_weight": 0.29331613012593555,
9
+ "batch_size": 16
10
+ }
training_classifiers/binding_affinity/wt_smiles_pooled/optuna_trials.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23161560edf5ad2069302afa1d387819dcfb7010c6ff0437c61ec09a8aa0e0f0
3
+ size 40599
training_classifiers/binding_affinity/wt_smiles_unpooled/.ipynb_checkpoints/best_params-checkpoint.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "lr": 6.714904102732621e-05,
3
+ "weight_decay": 6.94348785472601e-08,
4
+ "dropout": 0.20599610484012826,
5
+ "hidden_dim": 768,
6
+ "n_heads": 4,
7
+ "n_layers": 3,
8
+ "cls_weight": 0.26109289573917854,
9
+ "batch_size": 16
10
+ }
training_classifiers/binding_affinity/wt_smiles_unpooled/best_model.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d7ae3d2190b034352a65bda1bce86aa5a96ce3daf74cf10a166f8d9e9af51f0
3
+ size 181183221
training_classifiers/binding_affinity/wt_smiles_unpooled/best_params.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "lr": 6.714904102732621e-05,
3
+ "weight_decay": 6.94348785472601e-08,
4
+ "dropout": 0.20599610484012826,
5
+ "hidden_dim": 768,
6
+ "n_heads": 4,
7
+ "n_layers": 3,
8
+ "cls_weight": 0.26109289573917854,
9
+ "batch_size": 16
10
+ }
training_classifiers/binding_affinity/wt_smiles_unpooled/optuna_trials.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11838c42881182dd76b055f06e89b4423659eacd729d695b8a8f4c0a10165da0
3
+ size 40533
training_classifiers/binding_affinity/wt_wt_pooled/.ipynb_checkpoints/optuna_trials-checkpoint.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b685b92714882d618b42b582000574d83c3be2fbecbec5e0de6b5476948b96c5
3
+ size 40700
training_classifiers/binding_affinity/wt_wt_pooled/best_model.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:636de30f4388efd55e8e625c4f2c71a7629982197ef1e53fecf2a4f640df1ae0
3
+ size 182756085
training_classifiers/binding_affinity/wt_wt_pooled/best_params.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "lr": 0.0001730381005812531,
3
+ "weight_decay": 8.736709570411299e-05,
4
+ "dropout": 0.17533811687195416,
5
+ "hidden_dim": 768,
6
+ "n_heads": 4,
7
+ "n_layers": 3,
8
+ "cls_weight": 0.1278591739909013,
9
+ "batch_size": 16
10
+ }
training_classifiers/binding_affinity/wt_wt_pooled/optuna_trials.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b685b92714882d618b42b582000574d83c3be2fbecbec5e0de6b5476948b96c5
3
+ size 40700
training_classifiers/binding_affinity/wt_wt_unpooled/best_model.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce8da2406036535794909ffde1f1941843096b7d3c71ba772f9170ab123877f2
3
+ size 69333557
training_classifiers/binding_affinity/wt_wt_unpooled/best_params.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "lr": 0.000657577559506255,
3
+ "weight_decay": 3.3209159985473103e-07,
4
+ "dropout": 0.16430662769055482,
5
+ "hidden_dim": 768,
6
+ "n_heads": 8,
7
+ "n_layers": 1,
8
+ "cls_weight": 0.7037398702018655,
9
+ "batch_size": 16
10
+ }
training_classifiers/binding_affinity/wt_wt_unpooled/optuna_trials.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b330d37733e684ff780b143f17fd26c498f615dfbab8c5b3df08eae7eb019139
3
+ size 40587
training_classifiers/binding_affinity_iptm.py ADDED
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ binding_affinity_iptm.py
4
+
5
+ Writes:
6
+ - out_dir/wt_iptm_affinity_all.csv
7
+ - out_dir/smiles_iptm_affinity_all.csv
8
+
9
+ Also prints:
10
+ - N
11
+ - Spearman rho (affinity vs iptm)
12
+ - Pearson r (affinity vs iptm)
13
+ """
14
+
15
+ from pathlib import Path
16
+ import numpy as np
17
+ import pandas as pd
18
+
19
+
20
+ def corr_stats(df: pd.DataFrame, x: str, y: str):
21
+ # NaNs should already be dropped upstream; coerce and mask again here to be safe
22
+ xx = pd.to_numeric(df[x], errors="coerce")
23
+ yy = pd.to_numeric(df[y], errors="coerce")
24
+ m = xx.notna() & yy.notna()
25
+ xx = xx[m]
26
+ yy = yy[m]
27
+ n = int(m.sum())
28
+
29
+ # Pearson r
30
+ pearson_r = float(xx.corr(yy, method="pearson")) if n > 1 else float("nan")
31
+ # Spearman rho
32
+ spearman_rho = float(xx.corr(yy, method="spearman")) if n > 1 else float("nan")
33
+
34
+ return {"n": n, "pearson_r": pearson_r, "spearman_rho": spearman_rho}
35
+
36
+
37
+ def clean_one(
38
+ in_csv: Path,
39
+ out_csv: Path,
40
+ iptm_col: str,
41
+ affinity_col: str = "affinity",
42
+ keep_cols=(),
43
+ ):
44
+ df = pd.read_csv(in_csv)
45
+
46
+ # affinity + iptm must exist
47
+ need = [affinity_col, iptm_col]
48
+ missing = [c for c in need if c not in df.columns]
49
+ if missing:
50
+ raise ValueError(f"{in_csv} missing columns: {missing}. Found: {list(df.columns)}")
51
+
52
+ # coerce numeric
53
+ df[affinity_col] = pd.to_numeric(df[affinity_col], errors="coerce")
54
+ df[iptm_col] = pd.to_numeric(df[iptm_col], errors="coerce")
55
+
56
+ # drop NaNs in either
57
+ df = df.dropna(subset=[affinity_col, iptm_col]).reset_index(drop=True)
58
+
59
+ # output cols (standardize names)
60
+ out = pd.DataFrame({
61
+ "affinity": df[affinity_col].astype(float),
62
+ "iptm": df[iptm_col].astype(float),
63
+ })
64
+
65
+ # keep split if present (handy for coloring later, but not used for corr)
66
+ if "split" in df.columns:
67
+ out.insert(0, "split", df["split"].astype(str))
68
+
69
+ # optional extras for labeling/debug
70
+ for c in keep_cols:
71
+ if c in df.columns:
72
+ out[c] = df[c]
73
+
74
+ out_csv.parent.mkdir(parents=True, exist_ok=True)
75
+ out.to_csv(out_csv, index=False)
76
+
77
+ stats = corr_stats(out, "iptm", "affinity")
78
+ print(f"[write] {out_csv}")
79
+ print(f" N={stats['n']} | Pearson r={stats['pearson_r']:.4f} | Spearman rho={stats['spearman_rho']:.4f}")
80
+
81
+ # also save stats json next to csv
82
+ stats_path = out_csv.with_suffix(".stats.json")
83
+ with open(stats_path, "w") as f:
84
+ import json
85
+ json.dump(
86
+ {
87
+ "input_csv": str(in_csv),
88
+ "output_csv": str(out_csv),
89
+ "iptm_col": iptm_col,
90
+ "affinity_col": affinity_col,
91
+ **stats,
92
+ },
93
+ f,
94
+ indent=2,
95
+ )
96
+
97
+
98
+ def main():
99
+ import argparse
100
+ ap = argparse.ArgumentParser()
101
+ ap.add_argument("--wt_meta_csv", type=str, required=True)
102
+ ap.add_argument("--smiles_meta_csv", type=str, required=True)
103
+ ap.add_argument("--out_dir", type=str, required=True)
104
+
105
+ ap.add_argument("--wt_iptm_col", type=str, default="wt_iptm_score")
106
+ ap.add_argument("--smiles_iptm_col", type=str, default="smiles_iptm_score")
107
+ ap.add_argument("--affinity_col", type=str, default="affinity")
108
+ args = ap.parse_args()
109
+
110
+ out_dir = Path(args.out_dir)
111
+
112
+ clean_one(
113
+ Path(args.wt_meta_csv),
114
+ out_dir / "wt_iptm_affinity_all.csv",
115
+ iptm_col=args.wt_iptm_col,
116
+ affinity_col=args.affinity_col,
117
+ keep_cols=("seq1", "seq2", "Fasta2SMILES", "REACT_SMILES"),
118
+ )
119
+
120
+ clean_one(
121
+ Path(args.smiles_meta_csv),
122
+ out_dir / "smiles_iptm_affinity_all.csv",
123
+ iptm_col=args.smiles_iptm_col,
124
+ affinity_col=args.affinity_col,
125
+ keep_cols=("seq1", "seq2", "Fasta2SMILES", "REACT_SMILES", "smiles_sequence"),
126
+ )
127
+
128
+ print(f"\n[DONE] CSVs + stats JSONs in: {out_dir}")
129
+
130
+
131
+ if __name__ == "__main__":
132
+ main()
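Illustrative use of `corr_stats` above on a synthetic frame (numbers are made up, not project data):

    import pandas as pd

    toy = pd.DataFrame({"iptm": [0.42, 0.61, 0.55, 0.80],
                        "affinity": [6.1, 7.9, 7.2, 9.3]})
    print(corr_stats(toy, "iptm", "affinity"))   # -> {'n': 4, 'pearson_r': ..., 'spearman_rho': ...}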
training_classifiers/binding_affinity_split.py ADDED
@@ -0,0 +1,847 @@
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import math
4
+ from pathlib import Path
5
+ import sys
6
+ from contextlib import contextmanager
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import torch
11
+
12
+ # tqdm is optional; we’ll disable it by default in notebooks
13
+ from tqdm import tqdm
14
+
15
+ sys.path.append("/vast/projects/pranam/lab/yz927/projects/Classifier_Weight")
16
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
17
+
18
+ from datasets import Dataset, DatasetDict, Features, Value, Sequence as HFSequence
19
+ from transformers import AutoTokenizer, EsmModel, AutoModelForMaskedLM
20
+
21
+ # -------------------------
22
+ # Config
23
+ # -------------------------
24
+ CSV_PATH = Path("/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/c-binding_with_openfold_scores.csv")
25
+
26
+ OUT_ROOT = Path(
27
+ "/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/training_data_cleaned/binding_affinity"
28
+ )
29
+
30
+ # WT (seq) embedding model
31
+ WT_MODEL_NAME = "facebook/esm2_t33_650M_UR50D"
32
+ WT_MAX_LEN = 1022
33
+ WT_BATCH = 32
34
+
35
+ # SMILES embedding model + tokenizer
36
+ SMI_MODEL_NAME = "aaronfeller/PeptideCLM-23M-all"
37
+ TOKENIZER_VOCAB = "/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/tokenizer/new_vocab.txt"
38
+ TOKENIZER_SPLITS = "/vast/projects/pranam/lab/yz927/projects/Classifier_Weight/tokenizer/new_splits.txt"
39
+ SMI_MAX_LEN = 768
40
+ SMI_BATCH = 128
41
+
42
+ # Split config
43
+ TRAIN_FRAC = 0.80
44
+ RANDOM_SEED = 1986
45
+ AFFINITY_Q_BINS = 30
46
+
47
+ # Columns expected in CSV
48
+ COL_SEQ1 = "seq1"
49
+ COL_SEQ2 = "seq2"
50
+ COL_AFF = "affinity"
51
+ COL_F2S = "Fasta2SMILES"
52
+ COL_REACT = "REACT_SMILES"
53
+ COL_WT_IPTM = "wt_iptm_score"
54
+ COL_SMI_IPTM = "smiles_iptm_score"
55
+
56
+ # Device
57
+ DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
58
+
59
+ # -------------------------
60
+ # Quiet / notebook-safe output controls
61
+ # -------------------------
62
+ QUIET = True # suppress most prints
63
+ USE_TQDM = False # disable tqdm bars (recommended in Jupyter to avoid crashing)
64
+ LOG_FILE = None # optionally: OUT_ROOT / "build.log"
65
+
66
+ def log(msg: str):
67
+ if LOG_FILE is not None:
68
+ Path(LOG_FILE).parent.mkdir(parents=True, exist_ok=True)
69
+ with open(LOG_FILE, "a") as f:
70
+ f.write(msg.rstrip() + "\n")
71
+ if not QUIET:
72
+ print(msg)
73
+
74
+ def pbar(it, **kwargs):
75
+ return tqdm(it, **kwargs) if USE_TQDM else it
76
+
77
+ @contextmanager
78
+ def section(title: str):
79
+ log(f"\n=== {title} ===")
80
+ yield
81
+ log(f"=== done: {title} ===")
82
+
83
+
84
+ # -------------------------
85
+ # Helpers
86
+ # -------------------------
87
+ def has_uaa(seq: str) -> bool:
88
+ return "X" in str(seq).upper()
89
+
90
+ def affinity_to_class(a: float) -> str:
91
+ # High: >= 9 ; Moderate: [7, 9) ; Low: < 7
92
+ if a >= 9.0:
93
+ return "High"
94
+ elif a >= 7.0:
95
+ return "Moderate"
96
+ else:
97
+ return "Low"
98
+
99
+ def make_distribution_matched_split(df: pd.DataFrame) -> pd.DataFrame:
100
+ df = df.copy()
101
+
102
+ df[COL_AFF] = pd.to_numeric(df[COL_AFF], errors="coerce")
103
+ df = df.dropna(subset=[COL_AFF]).reset_index(drop=True)
104
+
105
+ df["affinity_class"] = df[COL_AFF].apply(affinity_to_class)
106
+
107
+ try:
108
+ df["aff_bin"] = pd.qcut(df[COL_AFF], q=AFFINITY_Q_BINS, duplicates="drop")
109
+ strat_col = "aff_bin"
110
+ except Exception:
111
+ df["aff_bin"] = df["affinity_class"]
112
+ strat_col = "aff_bin"
113
+
114
+ rng = np.random.RandomState(RANDOM_SEED)
115
+
116
+ df["split"] = None
117
+ for _, g in df.groupby(strat_col, observed=True):
118
+ idx = g.index.to_numpy()
119
+ rng.shuffle(idx)
120
+ n_train = int(math.floor(len(idx) * TRAIN_FRAC))
121
+ df.loc[idx[:n_train], "split"] = "train"
122
+ df.loc[idx[n_train:], "split"] = "val"
123
+
124
+ df["split"] = df["split"].fillna("train")
125
+ return df
126
+
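# Illustration (not part of the commit): a self-contained toy run of the quantile-stratified
# split idea used by make_distribution_matched_split above; relies only on names already
# defined in this script (np, pd, math, COL_AFF).
def _demo_distribution_matched_split(n_rows: int = 200, n_bins: int = 10, train_frac: float = 0.8):
    rng = np.random.RandomState(0)
    toy = pd.DataFrame({COL_AFF: rng.uniform(4.0, 11.0, size=n_rows)})   # synthetic affinities
    toy["aff_bin"] = pd.qcut(toy[COL_AFF], q=n_bins, duplicates="drop")
    toy["split"] = None
    for _, g in toy.groupby("aff_bin", observed=True):
        idx = g.index.to_numpy()
        rng.shuffle(idx)
        n_train = int(math.floor(len(idx) * train_frac))
        toy.loc[idx[:n_train], "split"] = "train"
        toy.loc[idx[n_train:], "split"] = "val"
    return toy["split"].value_counts(normalize=True)  # overall ~80/20, matched bin by bin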
127
+ def _summ(x):
128
+ x = np.asarray(x, dtype=float)
129
+ x = x[~np.isnan(x)]
130
+ if len(x) == 0:
131
+ return {"n": 0, "mean": np.nan, "std": np.nan, "p50": np.nan, "p95": np.nan}
132
+ return {
133
+ "n": int(len(x)),
134
+ "mean": float(np.mean(x)),
135
+ "std": float(np.std(x)),
136
+ "p50": float(np.quantile(x, 0.50)),
137
+ "p95": float(np.quantile(x, 0.95)),
138
+ }
139
+
140
+ def _len_stats(seqs):
141
+ lens = np.asarray([len(str(s)) for s in seqs], dtype=float)
142
+ if len(lens) == 0:
143
+ return {"n": 0, "mean": np.nan, "std": np.nan, "p50": np.nan, "p95": np.nan}
144
+ return {
145
+ "n": int(len(lens)),
146
+ "mean": float(lens.mean()),
147
+ "std": float(lens.std()),
148
+ "p50": float(np.quantile(lens, 0.50)),
149
+ "p95": float(np.quantile(lens, 0.95)),
150
+ }
151
+
152
+ def verify_split_before_embedding(
153
+ df2: pd.DataFrame,
154
+ affinity_col: str,
155
+ split_col: str,
156
+ seq_col: str,
157
+ iptm_col: str,
158
+ aff_class_col: str = "affinity_class",
159
+ aff_bins: int = 30,
160
+ save_report_prefix: str | None = None,
161
+ verbose: bool = False,
162
+ ):
163
+ """
164
+ Notebook-safe: by default prints only ONE line via `log()`.
165
+ Optionally writes CSV reports (stats + class proportions).
166
+ """
167
+ df2 = df2.copy()
168
+ df2[affinity_col] = pd.to_numeric(df2[affinity_col], errors="coerce")
169
+ df2[iptm_col] = pd.to_numeric(df2[iptm_col], errors="coerce")
170
+
171
+ assert split_col in df2.columns, f"Missing split col: {split_col}"
172
+ assert set(df2[split_col].dropna().unique()).issubset({"train", "val"}), f"Unexpected split values: {df2[split_col].unique()}"
173
+ assert df2[affinity_col].notna().any(), "No valid affinity values after coercion."
174
+
175
+ try:
176
+ df2["_aff_bin_dbg"] = pd.qcut(df2[affinity_col], q=aff_bins, duplicates="drop")
177
+ except Exception:
178
+ df2["_aff_bin_dbg"] = df2[aff_class_col].astype(str)
179
+
180
+ tr = df2[df2[split_col] == "train"].reset_index(drop=True)
181
+ va = df2[df2[split_col] == "val"].reset_index(drop=True)
182
+
183
+ tr_aff = _summ(tr[affinity_col].to_numpy())
184
+ va_aff = _summ(va[affinity_col].to_numpy())
185
+ tr_len = _len_stats(tr[seq_col].tolist())
186
+ va_len = _len_stats(va[seq_col].tolist())
187
+
188
+ # bin drift
189
+ bin_ct = (
190
+ df2.groupby([split_col, "_aff_bin_dbg"])
191
+ .size()
192
+ .groupby(level=0)
193
+ .apply(lambda s: s / s.sum())
194
+ )
195
+ tr_bins = bin_ct.loc["train"]
196
+ va_bins = bin_ct.loc["val"]
197
+ all_bins = tr_bins.index.union(va_bins.index)
198
+ tr_bins = tr_bins.reindex(all_bins, fill_value=0.0)
199
+ va_bins = va_bins.reindex(all_bins, fill_value=0.0)
200
+ max_bin_diff = float(np.max(np.abs(tr_bins.values - va_bins.values)))
201
+
202
+ msg = (
203
+ f"[split-check] rows={len(df2)} train={len(tr)} val={len(va)} | "
204
+ f"aff(mean±std) train={tr_aff['mean']:.3f}±{tr_aff['std']:.3f} val={va_aff['mean']:.3f}±{va_aff['std']:.3f} | "
205
+ f"len(p50/p95) train={tr_len['p50']:.1f}/{tr_len['p95']:.1f} val={va_len['p50']:.1f}/{va_len['p95']:.1f} | "
206
+ f"max_bin_diff={max_bin_diff:.4f}"
207
+ )
208
+ log(msg)
209
+
210
+ if verbose and (not QUIET):
211
+ class_ct = df2.groupby([split_col, aff_class_col]).size().unstack(fill_value=0)
212
+ class_prop = class_ct.div(class_ct.sum(axis=1), axis=0)
213
+ print("\n[verbose] affinity_class counts:\n", class_ct)
214
+ print("\n[verbose] affinity_class proportions:\n", class_prop.round(4))
215
+
216
+ if save_report_prefix is not None:
217
+ out = Path(save_report_prefix)
218
+ out.parent.mkdir(parents=True, exist_ok=True)
219
+
220
+ stats_df = pd.DataFrame([
221
+ {"split": "train", **{f"aff_{k}": v for k, v in tr_aff.items()}, **{f"len_{k}": v for k, v in tr_len.items()}},
222
+ {"split": "val", **{f"aff_{k}": v for k, v in va_aff.items()}, **{f"len_{k}": v for k, v in va_len.items()}},
223
+ ])
224
+ class_ct = df2.groupby([split_col, aff_class_col]).size().unstack(fill_value=0)
225
+ class_prop = class_ct.div(class_ct.sum(axis=1), axis=0).reset_index()
226
+
227
+ stats_df.to_csv(out.with_suffix(".stats.csv"), index=False)
228
+ class_prop.to_csv(out.with_suffix(".class_prop.csv"), index=False)
229
+
230
+
231
+ # -------------------------
232
+ # WT pooled (ESM2)
233
+ # -------------------------
234
+ @torch.no_grad()
235
+ def wt_pooled_embeddings(seqs, tokenizer, model, batch_size=32, max_length=1022):
236
+ embs = []
237
+ for i in pbar(range(0, len(seqs), batch_size)):
238
+ batch = seqs[i:i + batch_size]
239
+ inputs = tokenizer(
240
+ batch,
241
+ padding=True,
242
+ truncation=True,
243
+ max_length=max_length,
244
+ return_tensors="pt",
245
+ )
246
+ inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
247
+ out = model(**inputs)
248
+ h = out.last_hidden_state # (B, L, H)
249
+
250
+ attn = inputs["attention_mask"].unsqueeze(-1) # (B, L, 1)
251
+ summed = (h * attn).sum(dim=1) # (B, H)
252
+ denom = attn.sum(dim=1).clamp(min=1e-9) # (B, 1)
253
+ pooled = (summed / denom).detach().cpu().numpy()
254
+ embs.append(pooled)
255
+
256
+ return np.vstack(embs)
257
+
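# Shape note (illustration, not part of the commit): for B sequences padded to length L with
# hidden size H, last_hidden_state is (B, L, H); weighting by the (B, L, 1) attention mask and
# dividing the sum by the per-sequence token count yields the (B, H) mean-pooled embeddings.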
258
+
259
+ # -------------------------
260
+ # WT unpooled (ESM2)
261
+ # -------------------------
262
+ @torch.no_grad()
263
+ def wt_unpooled_one(seq, tokenizer, model, cls_id, eos_id, max_length=1022):
264
+ tok = tokenizer(seq, padding=False, truncation=True, max_length=max_length, return_tensors="pt")
265
+ tok = {k: v.to(DEVICE) for k, v in tok.items()}
266
+ out = model(**tok)
267
+ h = out.last_hidden_state[0] # (L, H)
268
+ attn = tok["attention_mask"][0].bool() # (L,)
269
+ ids = tok["input_ids"][0]
270
+
271
+ keep = attn.clone()
272
+ if cls_id is not None:
273
+ keep &= (ids != cls_id)
274
+ if eos_id is not None:
275
+ keep &= (ids != eos_id)
276
+
277
+ return h[keep].detach().cpu().to(torch.float16).numpy()
278
+
279
+ def build_wt_unpooled_dataset(df_split: pd.DataFrame, out_dir: Path, tokenizer, model):
280
+ """
281
+ Expects df_split to have:
282
+ - target_sequence (seq1)
283
+ - sequence (binder seq2; WT binder)
284
+ - label, affinity_class, COL_AFF, COL_WT_IPTM
285
+ Saves a dataset where each row contains BOTH:
286
+ - target_embedding (Lt,H), target_attention_mask, target_length
287
+ - binder_embedding (Lb,H), binder_attention_mask, binder_length
288
+ """
289
+ cls_id = tokenizer.cls_token_id
290
+ eos_id = tokenizer.eos_token_id
291
+ H = model.config.hidden_size
292
+
293
+ features = Features({
294
+ "target_sequence": Value("string"),
295
+ "sequence": Value("string"),
296
+ "label": Value("float32"),
297
+ "affinity": Value("float32"),
298
+ "affinity_class": Value("string"),
299
+
300
+ "target_embedding": HFSequence(HFSequence(Value("float16"), length=H)),
301
+ "target_attention_mask": HFSequence(Value("int8")),
302
+ "target_length": Value("int64"),
303
+
304
+ "binder_embedding": HFSequence(HFSequence(Value("float16"), length=H)),
305
+ "binder_attention_mask": HFSequence(Value("int8")),
306
+ "binder_length": Value("int64"),
307
+
308
+ COL_WT_IPTM: Value("float32"),
309
+ COL_AFF: Value("float32"),
310
+ })
311
+
312
+ def gen_rows(df: pd.DataFrame):
313
+ for r in pbar(df.itertuples(index=False), total=len(df)):
314
+ tgt = str(getattr(r, "target_sequence")).strip()
315
+ bnd = str(getattr(r, "sequence")).strip()
316
+
317
+ y = float(getattr(r, "label"))
318
+ aff = float(getattr(r, COL_AFF))
319
+ acls = str(getattr(r, "affinity_class"))
320
+
321
+ iptm = getattr(r, COL_WT_IPTM)
322
+ iptm = float(iptm) if pd.notna(iptm) else np.nan
323
+
324
+ # token embeddings for target + binder (both ESM)
325
+ t_emb = wt_unpooled_one(tgt, tokenizer, model, cls_id, eos_id, max_length=WT_MAX_LEN) # (Lt,H)
326
+ b_emb = wt_unpooled_one(bnd, tokenizer, model, cls_id, eos_id, max_length=WT_MAX_LEN) # (Lb,H)
327
+
328
+ t_list = t_emb.tolist()
329
+ b_list = b_emb.tolist()
330
+ Lt = len(t_list)
331
+ Lb = len(b_list)
332
+
333
+ yield {
334
+ "target_sequence": tgt,
335
+ "sequence": bnd,
336
+ "label": np.float32(y),
337
+ "affinity": np.float32(aff),
338
+ "affinity_class": acls,
339
+
340
+ "target_embedding": t_list,
341
+ "target_attention_mask": [1] * Lt,
342
+ "target_length": int(Lt),
343
+
344
+ "binder_embedding": b_list,
345
+ "binder_attention_mask": [1] * Lb,
346
+ "binder_length": int(Lb),
347
+
348
+ COL_WT_IPTM: np.float32(iptm) if not np.isnan(iptm) else np.float32(np.nan),
349
+ COL_AFF: np.float32(aff),
350
+ }
351
+
352
+ out_dir.mkdir(parents=True, exist_ok=True)
353
+ ds = Dataset.from_generator(lambda: gen_rows(df_split), features=features)
354
+ ds.save_to_disk(str(out_dir), max_shard_size="1GB")
355
+ return ds
356
+
357
+ def build_smiles_unpooled_paired_dataset(df_split: pd.DataFrame, out_dir: Path, wt_tokenizer, wt_model_unpooled,
358
+ smi_tok, smi_roformer):
359
+ """
360
+ df_split must have:
361
+ - target_sequence (seq1)
362
+ - sequence (binder smiles string)
363
+ - label, affinity_class, COL_AFF, COL_SMI_IPTM
364
+ Saves rows with:
365
+ target_embedding (Lt,Ht) from ESM
366
+ binder_embedding (Lb,Hb) from PeptideCLM
367
+ """
368
+ cls_id = wt_tokenizer.cls_token_id
369
+ eos_id = wt_tokenizer.eos_token_id
370
+ Ht = wt_model_unpooled.config.hidden_size
371
+
372
+ # Hb (binder hidden size) could be inferred from a single forward pass, but the
+ # simplest option is to read it from the model config when available.
374
+ Hb = getattr(smi_roformer.config, "hidden_size", None)
375
+ if Hb is None:
376
+ Hb = getattr(smi_roformer.config, "dim", None)
377
+ if Hb is None:
378
+ raise ValueError("Cannot infer Hb from smi_roformer config; print(smi_roformer.config) and set Hb manually.")
379
+
380
+ features = Features({
381
+ "target_sequence": Value("string"),
382
+ "sequence": Value("string"),
383
+ "label": Value("float32"),
384
+ "affinity": Value("float32"),
385
+ "affinity_class": Value("string"),
386
+
387
+ "target_embedding": HFSequence(HFSequence(Value("float16"), length=Ht)),
388
+ "target_attention_mask": HFSequence(Value("int8")),
389
+ "target_length": Value("int64"),
390
+
391
+ "binder_embedding": HFSequence(HFSequence(Value("float16"), length=Hb)),
392
+ "binder_attention_mask": HFSequence(Value("int8")),
393
+ "binder_length": Value("int64"),
394
+
395
+ COL_SMI_IPTM: Value("float32"),
396
+ COL_AFF: Value("float32"),
397
+ })
398
+
399
+ def gen_rows(df: pd.DataFrame):
400
+ for r in pbar(df.itertuples(index=False), total=len(df)):
401
+ tgt = str(getattr(r, "target_sequence")).strip()
402
+ bnd = str(getattr(r, "sequence")).strip()
403
+
404
+ y = float(getattr(r, "label"))
405
+ aff = float(getattr(r, COL_AFF))
406
+ acls = str(getattr(r, "affinity_class"))
407
+
408
+ iptm = getattr(r, COL_SMI_IPTM)
409
+ iptm = float(iptm) if pd.notna(iptm) else np.nan
410
+
411
+ # target token embeddings (ESM)
412
+ t_emb = wt_unpooled_one(tgt, wt_tokenizer, wt_model_unpooled, cls_id, eos_id, max_length=WT_MAX_LEN)
413
+ t_list = t_emb.tolist()
414
+ Lt = len(t_list)
415
+
416
+ # binder token embeddings (PeptideCLM) — single-item batch
417
+ _, tok_list, mask_list, lengths = smiles_embed_batch_return_both(
418
+ [bnd], smi_tok, smi_roformer, max_length=SMI_MAX_LEN
419
+ )
420
+ b_emb = tok_list[0] # np.float16 (Lb, Hb)
421
+ b_list = b_emb.tolist()
422
+ Lb = int(lengths[0])
423
+ b_mask = mask_list[0].astype(np.int8).tolist()
424
+
425
+ yield {
426
+ "target_sequence": tgt,
427
+ "sequence": bnd,
428
+ "label": np.float32(y),
429
+ "affinity": np.float32(aff),
430
+ "affinity_class": acls,
431
+
432
+ "target_embedding": t_list,
433
+ "target_attention_mask": [1] * Lt,
434
+ "target_length": int(Lt),
435
+
436
+ "binder_embedding": b_list,
437
+ "binder_attention_mask": [int(x) for x in b_mask],
438
+ "binder_length": int(Lb),
439
+
440
+ COL_SMI_IPTM: np.float32(iptm) if not np.isnan(iptm) else np.float32(np.nan),
441
+ COL_AFF: np.float32(aff),
442
+ }
443
+
444
+ out_dir.mkdir(parents=True, exist_ok=True)
445
+ ds = Dataset.from_generator(lambda: gen_rows(df_split), features=features)
446
+ ds.save_to_disk(str(out_dir), max_shard_size="1GB")
447
+ return ds
448
+
449
+
450
+ # -------------------------
451
+ # SMILES pooled + unpooled (PeptideCLM)
452
+ # -------------------------
453
+ def get_special_ids(tokenizer_obj):
454
+ cand = [
455
+ getattr(tokenizer_obj, "pad_token_id", None),
456
+ getattr(tokenizer_obj, "cls_token_id", None),
457
+ getattr(tokenizer_obj, "sep_token_id", None),
458
+ getattr(tokenizer_obj, "bos_token_id", None),
459
+ getattr(tokenizer_obj, "eos_token_id", None),
460
+ getattr(tokenizer_obj, "mask_token_id", None),
461
+ ]
462
+ return sorted({x for x in cand if x is not None})
463
+
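# Toy-tensor sketch of the masking done in smiles_embed_batch_return_both below:
# special-token positions are excluded before mean pooling. The IDs used here
# (pad=0, cls=1, eos=2) are placeholders, not the real PeptideCLM vocabulary.
import torch

toy_hidden = torch.randn(2, 5, 8)                          # (B, L, H)
toy_ids = torch.tensor([[1, 10, 11, 2, 0],
                        [1, 12, 13, 14, 2]])
toy_mask = torch.tensor([[1, 1, 1, 1, 0],
                         [1, 1, 1, 1, 1]])
toy_special = torch.tensor([0, 1, 2])

toy_valid = toy_mask.bool() & ~torch.isin(toy_ids, toy_special)
toy_valid_f = toy_valid.unsqueeze(-1).float()
toy_pooled = (toy_hidden * toy_valid_f).sum(dim=1) / toy_valid_f.sum(dim=1).clamp(min=1e-9)
print(toy_valid.sum(dim=1))                                # tensor([2, 3]) real tokens per row
print(toy_pooled.shape)                                    # torch.Size([2, 8])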
464
+ @torch.no_grad()
465
+ def smiles_embed_batch_return_both(batch_sequences, tokenizer_obj, model_roformer, max_length):
466
+ tok = tokenizer_obj(
467
+ batch_sequences,
468
+ return_tensors="pt",
469
+ padding=True,
470
+ truncation=True,
471
+ max_length=max_length,
472
+ )
473
+ input_ids = tok["input_ids"].to(DEVICE)
474
+ attention_mask = tok["attention_mask"].to(DEVICE)
475
+
476
+ outputs = model_roformer(input_ids=input_ids, attention_mask=attention_mask)
477
+ last_hidden = outputs.last_hidden_state # (B, L, H)
478
+
479
+ special_ids = get_special_ids(tokenizer_obj)
480
+ valid = attention_mask.bool()
481
+ if len(special_ids) > 0:
482
+ sid = torch.tensor(special_ids, device=DEVICE, dtype=torch.long)
483
+ if hasattr(torch, "isin"):
484
+ valid = valid & (~torch.isin(input_ids, sid))
485
+ else:
486
+ m = torch.zeros_like(valid)
487
+ for s in special_ids:
488
+ m |= (input_ids == s)
489
+ valid = valid & (~m)
490
+
491
+ valid_f = valid.unsqueeze(-1).float()
492
+ summed = torch.sum(last_hidden * valid_f, dim=1)
493
+ denom = torch.clamp(valid_f.sum(dim=1), min=1e-9)
494
+ pooled = (summed / denom).detach().cpu().numpy()
495
+
496
+ token_emb_list, mask_list, lengths = [], [], []
497
+ for b in range(last_hidden.shape[0]):
498
+ emb = last_hidden[b, valid[b]] # (Li, H)
499
+ token_emb_list.append(emb.detach().cpu().to(torch.float16).numpy())
500
+ li = emb.shape[0]
501
+ lengths.append(int(li))
502
+ mask_list.append(np.ones((li,), dtype=np.int8))
503
+
504
+ return pooled, token_emb_list, mask_list, lengths
505
+
506
+ def smiles_generate_embeddings_batched_both(seqs, tokenizer_obj, model_roformer, batch_size, max_length):
507
+ pooled_all = []
508
+ token_emb_all = []
509
+ mask_all = []
510
+ lengths_all = []
511
+
512
+ for i in pbar(range(0, len(seqs), batch_size)):
513
+ batch = seqs[i:i + batch_size]
514
+ pooled, tok_list, m_list, lens = smiles_embed_batch_return_both(
515
+ batch, tokenizer_obj, model_roformer, max_length
516
+ )
517
+ pooled_all.append(pooled)
518
+ token_emb_all.extend(tok_list)
519
+ mask_all.extend(m_list)
520
+ lengths_all.extend(lens)
521
+
522
+ return np.vstack(pooled_all), token_emb_all, mask_all, lengths_all
523
+
524
+ # -------------------------
525
+ # Target embedding cache helper (avoids extra ESM runs by computing target pooled
+ # embeddings once from the WT view; note that main() below embeds targets per branch instead)
527
+ # -------------------------
528
+ def build_target_cache_from_wt_view(wt_view_train: pd.DataFrame, wt_view_val: pd.DataFrame):
529
+ wt_tok = AutoTokenizer.from_pretrained(WT_MODEL_NAME)
530
+ wt_model = EsmModel.from_pretrained(WT_MODEL_NAME).to(DEVICE).eval()
531
+
532
+ # compute target pooled embeddings once
533
+ tgt_wt_train = wt_view_train["target_sequence"].astype(str).tolist()
534
+ tgt_wt_val = wt_view_val["target_sequence"].astype(str).tolist()
535
+
536
+ wt_train_tgt_emb = wt_pooled_embeddings(
537
+ tgt_wt_train, wt_tok, wt_model, batch_size=WT_BATCH, max_length=WT_MAX_LEN
538
+ )
539
+ wt_val_tgt_emb = wt_pooled_embeddings(
540
+ tgt_wt_val, wt_tok, wt_model, batch_size=WT_BATCH, max_length=WT_MAX_LEN
541
+ )
542
+
543
+ # build dict: target_sequence -> embedding (float32 array)
544
+ # if duplicates exist, last wins; you can add checks if needed
545
+ train_map = {s: e for s, e in zip(tgt_wt_train, wt_train_tgt_emb)}
546
+ val_map = {s: e for s, e in zip(tgt_wt_val, wt_val_tgt_emb)}
547
+ return wt_tok, wt_model, wt_train_tgt_emb, wt_val_tgt_emb, train_map, val_map
548
+ # -------------------------
549
+ # Main
550
+ # -------------------------
551
+ def main():
552
+ log(f"[INFO] DEVICE: {DEVICE}")
553
+ OUT_ROOT.mkdir(parents=True, exist_ok=True)
554
+
555
+ # 1) Load
556
+ with section("load csv + dedup"):
557
+ df = pd.read_csv(CSV_PATH)
558
+ for c in [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT]:
559
+ if c in df.columns:
560
+ df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
561
+
562
+ # Dedup on the full identity tuple you want
563
+ DEDUP_COLS = [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT]
564
+ df = df.drop_duplicates(subset=DEDUP_COLS).reset_index(drop=True)
565
+
566
+ print("Rows after dedup on", DEDUP_COLS, ":", len(df))
567
+
568
+ need = [COL_SEQ1, COL_SEQ2, COL_AFF, COL_F2S, COL_REACT, COL_WT_IPTM, COL_SMI_IPTM]
569
+ missing = [c for c in need if c not in df.columns]
570
+ if missing:
571
+ raise ValueError(f"Missing required columns: {missing}")
572
+
573
+ # numeric affinity for both branches
574
+ df[COL_AFF] = pd.to_numeric(df[COL_AFF], errors="coerce")
575
+
576
+ # 2) Build WT subset + SMILES subset separately (NO global dropping)
577
+ with section("prepare wt/smiles subsets"):
578
+ # WT: requires a canonical peptide sequence (no X) + affinity
579
+ df_wt = df.copy()
580
+ df_wt["wt_sequence"] = df_wt[COL_SEQ2].astype(str).str.strip()
581
+ df_wt = df_wt.dropna(subset=[COL_AFF]).reset_index(drop=True)
582
+ df_wt = df_wt[df_wt["wt_sequence"].notna() & (df_wt["wt_sequence"] != "")]
583
+ df_wt = df_wt[~df_wt["wt_sequence"].str.contains("X", case=False, na=False)].reset_index(drop=True)
584
+
585
+ # SMILES: requires affinity + a usable picked SMILES (UAA->REACT, else->Fasta2SMILES)
586
+ df_smi = df.copy()
587
+ df_smi = df_smi.dropna(subset=[COL_AFF]).reset_index(drop=True)
588
+ df_smi = df_smi[
589
+ pd.to_numeric(df_smi[COL_SMI_IPTM], errors="coerce").notna()
590
+ ].reset_index(drop=True) # an empty iptm indicates something is wrong with the SMILES sequence
591
+
592
+ is_uaa = df_smi[COL_SEQ2].astype(str).str.contains("X", case=False, na=False)
593
+ df_smi["smiles_sequence"] = np.where(is_uaa, df_smi[COL_REACT], df_smi[COL_F2S])
594
+ df_smi["smiles_sequence"] = df_smi["smiles_sequence"].astype(str).str.strip()
595
+ df_smi = df_smi[df_smi["smiles_sequence"].notna() & (df_smi["smiles_sequence"] != "")]
596
+ df_smi = df_smi[~df_smi["smiles_sequence"].isin(["nan", "None"])].reset_index(drop=True)
597
+
598
+ log(f"[counts] WT rows={len(df_wt)} | SMILES rows={len(df_smi)} (after per-branch filtering)")
599
+
600
+ # 3) Split separately (different sizes and memberships are expected)
601
+ with section("split wt and smiles separately"):
602
+ df_wt2 = make_distribution_matched_split(df_wt)
603
+ df_smi2 = make_distribution_matched_split(df_smi)
604
+
605
+ # save split tables
606
+ wt_split_csv = OUT_ROOT / "binding_affinity_wt_meta_with_split.csv"
607
+ smi_split_csv = OUT_ROOT / "binding_affinity_smiles_meta_with_split.csv"
608
+ df_wt2.to_csv(wt_split_csv, index=False)
609
+ df_smi2.to_csv(smi_split_csv, index=False)
610
+ log(f"Saved WT split meta: {wt_split_csv}")
611
+ log(f"Saved SMILES split meta: {smi_split_csv}")
612
+
613
+ # lightweight double-check (one-line)
614
+ verify_split_before_embedding(
615
+ df2=df_wt2,
616
+ affinity_col=COL_AFF,
617
+ split_col="split",
618
+ seq_col="wt_sequence",
619
+ iptm_col=COL_WT_IPTM,
620
+ aff_class_col="affinity_class",
621
+ aff_bins=AFFINITY_Q_BINS,
622
+ save_report_prefix=str(OUT_ROOT / "wt_split_doublecheck_report"),
623
+ verbose=False,
624
+ )
625
+ verify_split_before_embedding(
626
+ df2=df_smi2,
627
+ affinity_col=COL_AFF,
628
+ split_col="split",
629
+ seq_col="smiles_sequence",
630
+ iptm_col=COL_SMI_IPTM,
631
+ aff_class_col="affinity_class",
632
+ aff_bins=AFFINITY_Q_BINS,
633
+ save_report_prefix=str(OUT_ROOT / "smiles_split_doublecheck_report"),
634
+ verbose=False,
635
+ )
636
+
637
+ # Prepare split views
638
+ def prep_view(df_in: pd.DataFrame, binder_seq_col: str, iptm_col: str) -> pd.DataFrame:
639
+ out = df_in.copy()
640
+ out["target_sequence"] = out[COL_SEQ1].astype(str).str.strip() # target protein sequence (seq1)
641
+ out["sequence"] = out[binder_seq_col].astype(str).str.strip() # binder
642
+ out["label"] = pd.to_numeric(out[COL_AFF], errors="coerce")
643
+ out[iptm_col] = pd.to_numeric(out[iptm_col], errors="coerce")
644
+ out[COL_AFF] = pd.to_numeric(out[COL_AFF], errors="coerce")
645
+ out = out.dropna(subset=["target_sequence", "sequence", "label"]).reset_index(drop=True)
646
+ return out[["target_sequence", "sequence", "label", "split", iptm_col, COL_AFF, "affinity_class"]]
647
+
648
+ wt_view = prep_view(df_wt2, "wt_sequence", COL_WT_IPTM)
649
+ smi_view = prep_view(df_smi2, "smiles_sequence", COL_SMI_IPTM)
650
+
651
+ # -------------------------
652
+ # Split views
653
+ # -------------------------
654
+ wt_train = wt_view[wt_view["split"] == "train"].reset_index(drop=True)
655
+ wt_val = wt_view[wt_view["split"] == "val"].reset_index(drop=True)
656
+ smi_train = smi_view[smi_view["split"] == "train"].reset_index(drop=True)
657
+ smi_val = smi_view[smi_view["split"] == "val"].reset_index(drop=True)
658
+
659
+
660
+ # =========================
661
+ # TARGET pooled embeddings (ESM) — SEPARATE per branch
662
+ # =========================
663
+ with section("TARGET pooled embeddings (ESM) — WT + SMILES separately"):
664
+ wt_tok = AutoTokenizer.from_pretrained(WT_MODEL_NAME)
665
+ wt_esm = EsmModel.from_pretrained(WT_MODEL_NAME).to(DEVICE).eval()
666
+
667
+ # ---- WT targets ----
668
+ wt_train_tgt_emb = wt_pooled_embeddings(
669
+ wt_train["target_sequence"].astype(str).str.strip().tolist(),
670
+ wt_tok, wt_esm,
671
+ batch_size=WT_BATCH,
672
+ max_length=WT_MAX_LEN,
673
+ ).astype(np.float32)
674
+
675
+ wt_val_tgt_emb = wt_pooled_embeddings(
676
+ wt_val["target_sequence"].astype(str).str.strip().tolist(),
677
+ wt_tok, wt_esm,
678
+ batch_size=WT_BATCH,
679
+ max_length=WT_MAX_LEN,
680
+ ).astype(np.float32)
681
+
682
+ # ---- SMILES targets (independent; may include UAA-only targets) ----
683
+ smi_train_tgt_emb = wt_pooled_embeddings(
684
+ smi_train["target_sequence"].astype(str).str.strip().tolist(),
685
+ wt_tok, wt_esm,
686
+ batch_size=WT_BATCH,
687
+ max_length=WT_MAX_LEN,
688
+ ).astype(np.float32)
689
+
690
+ smi_val_tgt_emb = wt_pooled_embeddings(
691
+ smi_val["target_sequence"].astype(str).str.strip().tolist(),
692
+ wt_tok, wt_esm,
693
+ batch_size=WT_BATCH,
694
+ max_length=WT_MAX_LEN,
695
+ ).astype(np.float32)
696
+
697
+
698
+ # =========================
699
+ # WT pooled binder embeddings (binder = WT peptide)
700
+ # =========================
701
+ with section("WT pooled binder embeddings + save"):
702
+ wt_train_emb = wt_pooled_embeddings(
703
+ wt_train["sequence"].astype(str).str.strip().tolist(),
704
+ wt_tok, wt_esm,
705
+ batch_size=WT_BATCH,
706
+ max_length=WT_MAX_LEN,
707
+ ).astype(np.float32)
708
+
709
+ wt_val_emb = wt_pooled_embeddings(
710
+ wt_val["sequence"].astype(str).str.strip().tolist(),
711
+ wt_tok, wt_esm,
712
+ batch_size=WT_BATCH,
713
+ max_length=WT_MAX_LEN,
714
+ ).astype(np.float32)
715
+
716
+ wt_train_ds = Dataset.from_dict({
717
+ "target_sequence": wt_train["target_sequence"].tolist(),
718
+ "sequence": wt_train["sequence"].tolist(),
719
+ "label": wt_train["label"].astype(float).tolist(),
720
+ "target_embedding": wt_train_tgt_emb,
721
+ "embedding": wt_train_emb,
722
+ COL_WT_IPTM: wt_train[COL_WT_IPTM].astype(float).tolist(),
723
+ COL_AFF: wt_train[COL_AFF].astype(float).tolist(),
724
+ "affinity_class": wt_train["affinity_class"].tolist(),
725
+ })
726
+
727
+ wt_val_ds = Dataset.from_dict({
728
+ "target_sequence": wt_val["target_sequence"].tolist(),
729
+ "sequence": wt_val["sequence"].tolist(),
730
+ "label": wt_val["label"].astype(float).tolist(),
731
+ "target_embedding": wt_val_tgt_emb,
732
+ "embedding": wt_val_emb,
733
+ COL_WT_IPTM: wt_val[COL_WT_IPTM].astype(float).tolist(),
734
+ COL_AFF: wt_val[COL_AFF].astype(float).tolist(),
735
+ "affinity_class": wt_val["affinity_class"].tolist(),
736
+ })
737
+
738
+ wt_pooled_dd = DatasetDict({"train": wt_train_ds, "val": wt_val_ds})
739
+ wt_pooled_out = OUT_ROOT / "pair_wt_wt_pooled"
740
+ wt_pooled_dd.save_to_disk(str(wt_pooled_out))
741
+ log(f"Saved WT pooled -> {wt_pooled_out}")
742
+
743
+
744
+ # =========================
745
+ # SMILES pooled binder embeddings (binder = SMILES via PeptideCLM)
746
+ # =========================
747
+ with section("SMILES pooled binder embeddings + save"):
748
+ smi_tok = SMILES_SPE_Tokenizer(TOKENIZER_VOCAB, TOKENIZER_SPLITS)
749
+ smi_roformer = (
750
+ AutoModelForMaskedLM
751
+ .from_pretrained(SMI_MODEL_NAME)
752
+ .roformer
753
+ .to(DEVICE)
754
+ .eval()
755
+ )
756
+
757
+ smi_train_pooled, _, _, _ = smiles_generate_embeddings_batched_both(
758
+ smi_train["sequence"].astype(str).str.strip().tolist(),
759
+ smi_tok, smi_roformer,
760
+ batch_size=SMI_BATCH,
761
+ max_length=SMI_MAX_LEN,
762
+ )
763
+
764
+ smi_val_pooled, _, _, _ = smiles_generate_embeddings_batched_both(
765
+ smi_val["sequence"].astype(str).str.strip().tolist(),
766
+ smi_tok, smi_roformer,
767
+ batch_size=SMI_BATCH,
768
+ max_length=SMI_MAX_LEN,
769
+ )
770
+
771
+ smi_train_ds = Dataset.from_dict({
772
+ "target_sequence": smi_train["target_sequence"].tolist(),
773
+ "sequence": smi_train["sequence"].tolist(),
774
+ "label": smi_train["label"].astype(float).tolist(),
775
+ "target_embedding": smi_train_tgt_emb,
776
+ "embedding": smi_train_pooled.astype(np.float32),
777
+ COL_SMI_IPTM: smi_train[COL_SMI_IPTM].astype(float).tolist(),
778
+ COL_AFF: smi_train[COL_AFF].astype(float).tolist(),
779
+ "affinity_class": smi_train["affinity_class"].tolist(),
780
+ })
781
+
782
+ smi_val_ds = Dataset.from_dict({
783
+ "target_sequence": smi_val["target_sequence"].tolist(),
784
+ "sequence": smi_val["sequence"].tolist(),
785
+ "label": smi_val["label"].astype(float).tolist(),
786
+ "target_embedding": smi_val_tgt_emb,
787
+ "embedding": smi_val_pooled.astype(np.float32),
788
+ COL_SMI_IPTM: smi_val[COL_SMI_IPTM].astype(float).tolist(),
789
+ COL_AFF: smi_val[COL_AFF].astype(float).tolist(),
790
+ "affinity_class": smi_val["affinity_class"].tolist(),
791
+ })
792
+
793
+ smi_pooled_dd = DatasetDict({"train": smi_train_ds, "val": smi_val_ds})
794
+ smi_pooled_out = OUT_ROOT / "pair_wt_smiles_pooled"
795
+ smi_pooled_dd.save_to_disk(str(smi_pooled_out))
796
+ log(f"Saved SMILES pooled -> {smi_pooled_out}")
797
+
798
+
799
+ # =========================
800
+ # WT unpooled paired (ESM target + ESM binder) + save
801
+ # =========================
802
+ with section("WT unpooled paired embeddings + save"):
803
+ wt_tok_unpooled = wt_tok # reuse tokenizer
804
+ wt_esm_unpooled = wt_esm # reuse model
805
+
806
+ wt_unpooled_out = OUT_ROOT / "pair_wt_wt_unpooled"
807
+ wt_unpooled_dd = DatasetDict({
808
+ "train": build_wt_unpooled_dataset(wt_train, wt_unpooled_out / "train",
809
+ wt_tok_unpooled, wt_esm_unpooled),
810
+ "val": build_wt_unpooled_dataset(wt_val, wt_unpooled_out / "val",
811
+ wt_tok_unpooled, wt_esm_unpooled),
812
+ })
813
+ # (Optional) also save as DatasetDict root if you want a single load_from_disk path:
814
+ wt_unpooled_dd.save_to_disk(str(wt_unpooled_out))
815
+ log(f"Saved WT unpooled -> {wt_unpooled_out}")
816
+
817
+
818
+ # =========================
819
+ # SMILES unpooled paired (ESM target + PeptideCLM binder) + save
820
+ # =========================
821
+ with section("SMILES unpooled paired embeddings + save"):
822
+ # reuse already-loaded smi_tok/smi_roformer from pooled section if still in scope;
823
+ # otherwise re-init here:
824
+ # smi_tok = SMILES_SPE_Tokenizer(TOKENIZER_VOCAB, TOKENIZER_SPLITS)
825
+ # smi_roformer = AutoModelForMaskedLM.from_pretrained(SMI_MODEL_NAME).roformer.to(DEVICE).eval()
826
+
827
+ smi_unpooled_out = OUT_ROOT / "pair_wt_smiles_unpooled"
828
+ smi_unpooled_dd = DatasetDict({
829
+ "train": build_smiles_unpooled_paired_dataset(
830
+ smi_train, smi_unpooled_out / "train",
831
+ wt_tok, wt_esm,
832
+ smi_tok, smi_roformer
833
+ ),
834
+ "val": build_smiles_unpooled_paired_dataset(
835
+ smi_val, smi_unpooled_out / "val",
836
+ wt_tok, wt_esm,
837
+ smi_tok, smi_roformer
838
+ ),
839
+ })
840
+ smi_unpooled_dd.save_to_disk(str(smi_unpooled_out))
841
+ log(f"Saved SMILES unpooled -> {smi_unpooled_out}")
842
+
843
+ log(f"\n[DONE] All datasets saved under: {OUT_ROOT}")
844
+
845
+
846
+ if __name__ == "__main__":
847
+ main()
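As a usage note for the script above: the pooled outputs are standard Hugging Face DatasetDicts, so they can be reloaded and inspected with datasets.load_from_disk. The path below is a placeholder; substitute the OUT_ROOT actually configured for the run.

from datasets import load_from_disk

dd = load_from_disk("path/to/OUT_ROOT/pair_wt_wt_pooled")  # placeholder path
print(dd)                                   # DatasetDict with "train" and "val" splits
row = dd["train"][0]
print(len(row["target_embedding"]))         # ESM2 hidden size (pooled target vector)
print(len(row["embedding"]))                # ESM2 hidden size (pooled WT binder vector)
print(row["label"], row["affinity_class"])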
training_classifiers/binding_training.py ADDED
@@ -0,0 +1,414 @@
1
+ import os, json
2
+ from pathlib import Path
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.utils.data import DataLoader
7
+ import optuna
8
+ from datasets import load_from_disk, DatasetDict
9
+ from scipy.stats import spearmanr
10
+ from lightning.pytorch import seed_everything
11
+ seed_everything(1986)
12
+
13
+ DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
14
+
15
+
16
+ def safe_spearmanr(y_true: np.ndarray, y_pred: np.ndarray) -> float:
17
+ rho = spearmanr(y_true, y_pred).correlation
18
+ if rho is None or np.isnan(rho):
19
+ return 0.0
20
+ return float(rho)
21
+
22
+
23
+ # -----------------------------
24
+ # Affinity class thresholds (final spec)
25
+ # High >= 9 ; Moderate 7-9 ; Low < 7
26
+ # 0=High, 1=Moderate, 2=Low
27
+ # -----------------------------
28
+ def affinity_to_class_tensor(y: torch.Tensor) -> torch.Tensor:
29
+ high = y >= 9.0
30
+ low = y < 7.0
31
+ mid = ~(high | low)
32
+ cls = torch.zeros_like(y, dtype=torch.long)
33
+ cls[mid] = 1
34
+ cls[low] = 2
35
+ return cls
36
+
37
+
38
+ # -----------------------------
39
+ # Load paired DatasetDict
40
+ # -----------------------------
41
+ def load_split_paired(path: str):
42
+ dd = load_from_disk(path)
43
+ if not isinstance(dd, DatasetDict):
44
+ raise ValueError(f"Expected DatasetDict at {path}")
45
+ if "train" not in dd or "val" not in dd:
46
+ raise ValueError(f"DatasetDict missing train/val at {path}")
47
+ return dd["train"], dd["val"]
48
+
49
+
50
+ # -----------------------------
51
+ # Collate: pooled paired
52
+ # -----------------------------
53
+ def collate_pair_pooled(batch):
54
+ Pt = torch.tensor([x["target_embedding"] for x in batch], dtype=torch.float32) # (B,Ht)
55
+ Pb = torch.tensor([x["binder_embedding"] for x in batch], dtype=torch.float32) # (B,Hb)
56
+ y = torch.tensor([float(x["label"]) for x in batch], dtype=torch.float32)
57
+ return Pt, Pb, y
58
+
59
+
60
+ # -----------------------------
61
+ # Collate: unpooled paired
62
+ # -----------------------------
63
+ def collate_pair_unpooled(batch):
64
+ B = len(batch)
65
+ Ht = len(batch[0]["target_embedding"][0])
66
+ Hb = len(batch[0]["binder_embedding"][0])
67
+ Lt_max = max(int(x["target_length"]) for x in batch)
68
+ Lb_max = max(int(x["binder_length"]) for x in batch)
69
+
70
+ Pt = torch.zeros(B, Lt_max, Ht, dtype=torch.float32)
71
+ Pb = torch.zeros(B, Lb_max, Hb, dtype=torch.float32)
72
+ Mt = torch.zeros(B, Lt_max, dtype=torch.bool)
73
+ Mb = torch.zeros(B, Lb_max, dtype=torch.bool)
74
+ y = torch.tensor([float(x["label"]) for x in batch], dtype=torch.float32)
75
+
76
+ for i, x in enumerate(batch):
77
+ t = torch.tensor(x["target_embedding"], dtype=torch.float32)
78
+ b = torch.tensor(x["binder_embedding"], dtype=torch.float32)
79
+ lt, lb = t.shape[0], b.shape[0]
80
+ Pt[i, :lt] = t
81
+ Pb[i, :lb] = b
82
+ Mt[i, :lt] = torch.tensor(x["target_attention_mask"][:lt], dtype=torch.bool)
83
+ Mb[i, :lb] = torch.tensor(x["binder_attention_mask"][:lb], dtype=torch.bool)
84
+
85
+ return Pt, Mt, Pb, Mb, y
86
+
87
+
88
+ # -----------------------------
89
+ # Cross-attention models
90
+ # -----------------------------
91
+ class CrossAttnPooled(nn.Module):
92
+ """
93
+ pooled vectors -> treat as single-token sequences for cross attention
94
+ """
95
+ def __init__(self, Ht, Hb, hidden=512, n_heads=8, n_layers=3, dropout=0.1):
96
+ super().__init__()
97
+ self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
98
+ self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
99
+
100
+ self.layers = nn.ModuleList([])
101
+ for _ in range(n_layers):
102
+ self.layers.append(nn.ModuleDict({
103
+ "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
104
+ "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
105
+ "n1t": nn.LayerNorm(hidden),
106
+ "n2t": nn.LayerNorm(hidden),
107
+ "n1b": nn.LayerNorm(hidden),
108
+ "n2b": nn.LayerNorm(hidden),
109
+ "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
110
+ "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
111
+ }))
112
+
113
+ self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
114
+ self.reg = nn.Linear(hidden, 1)
115
+ self.cls = nn.Linear(hidden, 3)
116
+
117
+ def forward(self, t_vec, b_vec):
118
+ # (B,Ht),(B,Hb)
119
+ t = self.t_proj(t_vec).unsqueeze(0) # (1,B,H)
120
+ b = self.b_proj(b_vec).unsqueeze(0) # (1,B,H)
121
+
122
+ for L in self.layers:
123
+ t_attn, _ = L["attn_tb"](t, b, b)
124
+ t = L["n1t"]((t + t_attn).transpose(0,1)).transpose(0,1)
125
+ t = L["n2t"]((t + L["fft"](t)).transpose(0,1)).transpose(0,1)
126
+
127
+ b_attn, _ = L["attn_bt"](b, t, t)
128
+ b = L["n1b"]((b + b_attn).transpose(0,1)).transpose(0,1)
129
+ b = L["n2b"]((b + L["ffb"](b)).transpose(0,1)).transpose(0,1)
130
+
131
+ t0 = t[0]
132
+ b0 = b[0]
133
+ z = torch.cat([t0, b0], dim=-1)
134
+ h = self.shared(z)
135
+ return self.reg(h).squeeze(-1), self.cls(h)
136
+
137
+
138
+ class CrossAttnUnpooled(nn.Module):
139
+ """
140
+ token sequences with masks; alternating cross attention.
141
+ """
142
+ def __init__(self, Ht, Hb, hidden=512, n_heads=8, n_layers=3, dropout=0.1):
143
+ super().__init__()
144
+ self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
145
+ self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
146
+
147
+ self.layers = nn.ModuleList([])
148
+ for _ in range(n_layers):
149
+ self.layers.append(nn.ModuleDict({
150
+ "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
151
+ "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
152
+ "n1t": nn.LayerNorm(hidden),
153
+ "n2t": nn.LayerNorm(hidden),
154
+ "n1b": nn.LayerNorm(hidden),
155
+ "n2b": nn.LayerNorm(hidden),
156
+ "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
157
+ "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
158
+ }))
159
+
160
+ self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
161
+ self.reg = nn.Linear(hidden, 1)
162
+ self.cls = nn.Linear(hidden, 3)
163
+
164
+ def masked_mean(self, X, M):
165
+ Mf = M.unsqueeze(-1).float()
166
+ denom = Mf.sum(dim=1).clamp(min=1.0)
167
+ return (X * Mf).sum(dim=1) / denom
168
+
169
+ def forward(self, T, Mt, B, Mb):
170
+ # T:(B,Lt,Ht), Mt:(B,Lt) ; B:(B,Lb,Hb), Mb:(B,Lb)
171
+ T = self.t_proj(T)
172
+ Bx = self.b_proj(B)
173
+
174
+ kp_t = ~Mt # key_padding_mask True = pad
175
+ kp_b = ~Mb
176
+
177
+ for L in self.layers:
178
+ # T attends to B
179
+ T_attn, _ = L["attn_tb"](T, Bx, Bx, key_padding_mask=kp_b)
180
+ T = L["n1t"](T + T_attn)
181
+ T = L["n2t"](T + L["fft"](T))
182
+
183
+ # B attends to T
184
+ B_attn, _ = L["attn_bt"](Bx, T, T, key_padding_mask=kp_t)
185
+ Bx = L["n1b"](Bx + B_attn)
186
+ Bx = L["n2b"](Bx + L["ffb"](Bx))
187
+
188
+ t_pool = self.masked_mean(T, Mt)
189
+ b_pool = self.masked_mean(Bx, Mb)
190
+ z = torch.cat([t_pool, b_pool], dim=-1)
191
+ h = self.shared(z)
192
+ return self.reg(h).squeeze(-1), self.cls(h)
193
+
194
+
195
+ # -----------------------------
196
+ # Train/eval
197
+ # -----------------------------
198
+ @torch.no_grad()
199
+ def eval_spearman_pooled(model, loader):
200
+ model.eval()
201
+ ys, ps = [], []
202
+ for t, b, y in loader:
203
+ t = t.to(DEVICE, non_blocking=True)
204
+ b = b.to(DEVICE, non_blocking=True)
205
+ pred, _ = model(t, b)
206
+ ys.append(y.numpy())
207
+ ps.append(pred.detach().cpu().numpy())
208
+ return safe_spearmanr(np.concatenate(ys), np.concatenate(ps))
209
+
210
+ @torch.no_grad()
211
+ def eval_spearman_unpooled(model, loader):
212
+ model.eval()
213
+ ys, ps = [], []
214
+ for T, Mt, B, Mb, y in loader:
215
+ T = T.to(DEVICE, non_blocking=True)
216
+ Mt = Mt.to(DEVICE, non_blocking=True)
217
+ B = B.to(DEVICE, non_blocking=True)
218
+ Mb = Mb.to(DEVICE, non_blocking=True)
219
+ pred, _ = model(T, Mt, B, Mb)
220
+ ys.append(y.numpy())
221
+ ps.append(pred.detach().cpu().numpy())
222
+ return safe_spearmanr(np.concatenate(ys), np.concatenate(ps))
223
+
224
+ def train_one_epoch_pooled(model, loader, opt, loss_reg, loss_cls, cls_w=1.0, clip=1.0):
225
+ model.train()
226
+ for t, b, y in loader:
227
+ t = t.to(DEVICE, non_blocking=True)
228
+ b = b.to(DEVICE, non_blocking=True)
229
+ y = y.to(DEVICE, non_blocking=True)
230
+ y_cls = affinity_to_class_tensor(y)
231
+
232
+ opt.zero_grad(set_to_none=True)
233
+ pred, logits = model(t, b)
234
+ L = loss_reg(pred, y) + cls_w * loss_cls(logits, y_cls)
235
+ L.backward()
236
+ if clip is not None:
237
+ torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
238
+ opt.step()
239
+
240
+ def train_one_epoch_unpooled(model, loader, opt, loss_reg, loss_cls, cls_w=1.0, clip=1.0):
241
+ model.train()
242
+ for T, Mt, B, Mb, y in loader:
243
+ T = T.to(DEVICE, non_blocking=True)
244
+ Mt = Mt.to(DEVICE, non_blocking=True)
245
+ B = B.to(DEVICE, non_blocking=True)
246
+ Mb = Mb.to(DEVICE, non_blocking=True)
247
+ y = y.to(DEVICE, non_blocking=True)
248
+ y_cls = affinity_to_class_tensor(y)
249
+
250
+ opt.zero_grad(set_to_none=True)
251
+ pred, logits = model(T, Mt, B, Mb)
252
+ L = loss_reg(pred, y) + cls_w * loss_cls(logits, y_cls)
253
+ L.backward()
254
+ if clip is not None:
255
+ torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
256
+ opt.step()
257
+
258
+
259
+ # -----------------------------
260
+ # Optuna objective
261
+ # -----------------------------
262
+ def objective_crossattn(trial: optuna.Trial, mode: str, train_ds, val_ds) -> float:
263
+ lr = trial.suggest_float("lr", 1e-5, 3e-3, log=True)
264
+ wd = trial.suggest_float("weight_decay", 1e-10, 1e-2, log=True)
265
+ dropout = trial.suggest_float("dropout", 0.0, 0.4)
266
+ hidden = trial.suggest_categorical("hidden_dim", [256, 384, 512, 768])
267
+ n_heads = trial.suggest_categorical("n_heads", [4, 8])
268
+ n_layers = trial.suggest_int("n_layers", 1, 4)
269
+ cls_w = trial.suggest_float("cls_weight", 0.1, 2.0, log=True)
270
+ batch = trial.suggest_categorical("batch_size", [16, 32, 64, 128])
271
+
272
+ # infer dims from first row
273
+ if mode == "pooled":
274
+ Ht = len(train_ds[0]["target_embedding"])
275
+ Hb = len(train_ds[0]["binder_embedding"])
276
+ collate = collate_pair_pooled
277
+ model = CrossAttnPooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
278
+ train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
279
+ val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, num_workers=4, pin_memory=True, collate_fn=collate)
280
+ eval_fn = eval_spearman_pooled
281
+ train_fn = train_one_epoch_pooled
282
+
283
+ else:
284
+ Ht = len(train_ds[0]["target_embedding"][0])
285
+ Hb = len(train_ds[0]["binder_embedding"][0])
286
+ collate = collate_pair_unpooled
287
+ model = CrossAttnUnpooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
288
+ train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
289
+ val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, num_workers=4, pin_memory=True, collate_fn=collate)
290
+ eval_fn = eval_spearman_unpooled
291
+ train_fn = train_one_epoch_unpooled
292
+
293
+ opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
294
+ loss_reg = nn.MSELoss()
295
+ loss_cls = nn.CrossEntropyLoss()
296
+
297
+ best = -1e9
298
+ bad = 0
299
+ patience = 10
300
+
301
+ for ep in range(1, 61):
302
+ train_fn(model, train_loader, opt, loss_reg, loss_cls, cls_w=cls_w)
303
+ rho = eval_fn(model, val_loader)
304
+
305
+ trial.report(rho, ep)
306
+ if trial.should_prune():
307
+ raise optuna.TrialPruned()
308
+
309
+ if rho > best + 1e-6:
310
+ best = rho
311
+ bad = 0
312
+ else:
313
+ bad += 1
314
+ if bad >= patience:
315
+ break
316
+
317
+ return float(best)
318
+
319
+
320
+ # -----------------------------
321
+ # Run: optuna + refit best
322
+ # -----------------------------
323
+ def run(dataset_path: str, out_dir: str, mode: str, n_trials: int = 50):
324
+ out_dir = Path(out_dir)
325
+ out_dir.mkdir(parents=True, exist_ok=True)
326
+
327
+ train_ds, val_ds = load_split_paired(dataset_path)
328
+ print(f"[Data] Train={len(train_ds)} Val={len(val_ds)} | mode={mode}")
329
+
330
+ study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
331
+ study.optimize(lambda t: objective_crossattn(t, mode, train_ds, val_ds), n_trials=n_trials)
332
+
333
+ study.trials_dataframe().to_csv(out_dir / "optuna_trials.csv", index=False)
334
+ best = study.best_trial
335
+ best_params = dict(best.params)
336
+
337
+ # refit longer
338
+ lr = float(best_params["lr"])
339
+ wd = float(best_params["weight_decay"])
340
+ dropout = float(best_params["dropout"])
341
+ hidden = int(best_params["hidden_dim"])
342
+ n_heads = int(best_params["n_heads"])
343
+ n_layers = int(best_params["n_layers"])
344
+ cls_w = float(best_params["cls_weight"])
345
+ batch = int(best_params["batch_size"])
346
+
347
+ loss_reg = nn.MSELoss()
348
+ loss_cls = nn.CrossEntropyLoss()
349
+
350
+ if mode == "pooled":
351
+ Ht = len(train_ds[0]["target_embedding"])
352
+ Hb = len(train_ds[0]["binder_embedding"])
353
+ model = CrossAttnPooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
354
+ collate = collate_pair_pooled
355
+ train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
356
+ val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, num_workers=4, pin_memory=True, collate_fn=collate)
357
+ eval_fn = eval_spearman_pooled
358
+ train_fn = train_one_epoch_pooled
359
+ else:
360
+ Ht = len(train_ds[0]["target_embedding"][0])
361
+ Hb = len(train_ds[0]["binder_embedding"][0])
362
+ model = CrossAttnUnpooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
363
+ collate = collate_pair_unpooled
364
+ train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
365
+ val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False, num_workers=4, pin_memory=True, collate_fn=collate)
366
+ eval_fn = eval_spearman_unpooled
367
+ train_fn = train_one_epoch_unpooled
368
+
369
+ opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
370
+
371
+ best_rho = -1e9
372
+ bad = 0
373
+ patience = 20
374
+ best_state = None
375
+
376
+ for ep in range(1, 201):
377
+ train_fn(model, train_loader, opt, loss_reg, loss_cls, cls_w=cls_w)
378
+ rho = eval_fn(model, val_loader)
379
+
380
+ if rho > best_rho + 1e-6:
381
+ best_rho = rho
382
+ bad = 0
383
+ best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
384
+ else:
385
+ bad += 1
386
+ if bad >= patience:
387
+ break
388
+
389
+ if best_state is not None:
390
+ model.load_state_dict(best_state)
391
+
392
+ # save
393
+ torch.save({"mode": mode, "best_params": best_params, "state_dict": model.state_dict()}, out_dir / "best_model.pt")
394
+ with open(out_dir / "best_params.json", "w") as f:
395
+ json.dump(best_params, f, indent=2)
396
+
397
+ print(f"[DONE] {out_dir} | best_optuna_rho={study.best_value:.4f} | refit_best_rho={best_rho:.4f}")
398
+
399
+
400
+ if __name__ == "__main__":
401
+ import argparse
402
+ ap = argparse.ArgumentParser()
403
+ ap.add_argument("--dataset_path", type=str, required=True, help="Paired DatasetDict path (pair_*)")
404
+ ap.add_argument("--mode", type=str, choices=["pooled", "unpooled"], required=True)
405
+ ap.add_argument("--out_dir", type=str, required=True)
406
+ ap.add_argument("--n_trials", type=int, default=50)
407
+ args = ap.parse_args()
408
+
409
+ run(
410
+ dataset_path=args.dataset_path,
411
+ out_dir=args.out_dir,
412
+ mode=args.mode,
413
+ n_trials=args.n_trials,
414
+ )
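For reference, a minimal way to launch a tuning run from Python, mirroring the CLI defined above; both paths are placeholders and should point at a pair_* DatasetDict produced by binding_affinity_split.py and at a fresh output directory.

from binding_training import run

run(
    dataset_path="path/to/OUT_ROOT/pair_wt_smiles_pooled",  # placeholder
    out_dir="path/to/outputs/wt_smiles_pooled",             # placeholder
    mode="pooled",      # "pooled" or "unpooled", matching the dataset layout
    n_trials=50,
)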
training_classifiers/binding_wt.bash ADDED
@@ -0,0 +1,31 @@
1
+ #!/bin/bash
2
+ #SBATCH --job-name=b-data
3
+ #SBATCH --partition=dgx-b200
4
+ #SBATCH --gpus=1
5
+ #SBATCH --cpus-per-task=10
6
+ #SBATCH --mem=100G
7
+ #SBATCH --time=48:00:00
8
+ #SBATCH --output=%x_%j.out
9
+
10
+ HOME_LOC=/vast/projects/pranam/lab/yz927
11
+ SCRIPT_LOC=$HOME_LOC/projects/Classifier_Weight/training_classifiers
12
+ DATA_LOC=$HOME_LOC/projects/Classifier_Weight/training_data_cleaned
13
+ OBJECTIVE='binding_affinity'
14
+ WT='smiles' #wt/smiles
15
+ STATUS='pooled' #pooled/unpooled
16
+ DATA_FILE="pair_wt_${WT}_${STATUS}"
17
+ LOG_LOC=$SCRIPT_LOC
18
+ DATE=$(date +%m_%d)
19
+ SPECIAL_PREFIX="binding_affinity_data_generation"
20
+
21
+ # Create log directory if it doesn't exist
22
+ mkdir -p $LOG_LOC
23
+
24
+ cd $SCRIPT_LOC
25
+ source /vast/projects/pranam/lab/shared/miniconda3/etc/profile.d/conda.sh
26
+ conda activate /vast/projects/pranam/lab/shared/miniconda3/envs/metal
27
+
28
+ python -u binding_affinity_split.py > "${LOG_LOC}/${DATE}_${SPECIAL_PREFIX}.log" 2>&1
29
+
30
+ echo "Script completed at $(date)"
31
+ conda deactivate
training_classifiers/hemolysis/cnn_smiles/best_model.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd8f379ef2a10dacff4236ca37aa64832a3ce8bc9608ca1297b1b7662780ee6f
3
+ size 14170677
training_classifiers/hemolysis/cnn_smiles/best_model_benchmark.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "n_samples": 800,
3
+ "wall_time_s": 6.477548930000921,
4
+ "throughput_samples_per_s": 123.50350551499211,
5
+ "gpu_total_kernel_ms": 39.01705604791641,
6
+ "gpu_ms_per_sample": 0.04877132005989551,
7
+ "gpu_avg_ms_per_batch": 0.7803411209583282,
8
+ "gpu_peak_mem_MB": 219.23974609375,
9
+ "telemetry_pre": {
10
+ "cpu_freq_current_MHz": 1357.0153303571428,
11
+ "cpu_freq_max_MHz": 4000.0,
12
+ "cpu_util_pct": 5.7,
13
+ "cpu_count_logical": 224,
14
+ "cpu_count_physical": 112,
15
+ "gpu_util_pct": 0,
16
+ "gpu_mem_util_pct": 0,
17
+ "gpu_mem_used_MB": 1829.0625,
18
+ "gpu_mem_total_MB": 183359.0,
19
+ "gpu_sm_clock_MHz": 1965,
20
+ "gpu_mem_clock_MHz": 3996,
21
+ "gpu_power_W": 194.631,
22
+ "gpu_temp_C": 31
23
+ },
24
+ "telemetry_post": {
25
+ "cpu_freq_current_MHz": 1529.574044642856,
26
+ "cpu_freq_max_MHz": 4000.0,
27
+ "cpu_util_pct": 6.0,
28
+ "cpu_count_logical": 224,
29
+ "cpu_count_physical": 112,
30
+ "gpu_util_pct": 0,
31
+ "gpu_mem_util_pct": 0,
32
+ "gpu_mem_used_MB": 1923.0625,
33
+ "gpu_mem_total_MB": 183359.0,
34
+ "gpu_sm_clock_MHz": 1965,
35
+ "gpu_mem_clock_MHz": 3996,
36
+ "gpu_power_W": 197.518,
37
+ "gpu_temp_C": 31
38
+ }
39
+ }
training_classifiers/hemolysis/cnn_smiles/optimization_summary.txt ADDED
@@ -0,0 +1,19 @@
1
+ ========================================================================
2
+ MODEL: cnn
3
+ Best Optuna F1 (objective): 0.5290
4
+ Best Optuna AUC (val, recorded): 0.7477
5
+ Best Optuna threshold (val): 0.4518
6
+ Refit best AUC (val): 0.7851
7
+ Refit best F1@thr (val): 0.5366 at thr=0.5298
8
+ Best params:
9
+ {
10
+ "lr": 0.0002237456677696451,
11
+ "weight_decay": 0.0005722918417016266,
12
+ "dropout": 0.2697397384794115,
13
+ "batch_size": 16,
14
+ "channels": 512,
15
+ "kernel": 3,
16
+ "layers": 4
17
+ }
18
+ Saved model: /vast/projects/pranam/lab/yz927/projects/Classifier_Weight/training_classifiers/hemolysis/cnn_smiles/best_model.pt
19
+ ========================================================================
training_classifiers/hemolysis/cnn_smiles/pr_curve.png ADDED
training_classifiers/hemolysis/cnn_smiles/roc_curve.png ADDED
training_classifiers/hemolysis/cnn_smiles/study_trials.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:341f45fdbd7565793d9ff64a3d1be6ebd6d56d6e6957db19d119a946c658d296
3
+ size 48177
training_classifiers/hemolysis/cnn_smiles/train_predictions.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44ee47243c96b0d372586ee8c40582af2aa086cfb626c5826b05778f1f08b919
3
+ size 1943431
training_classifiers/hemolysis/cnn_smiles/val_predictions.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df88e6577184fbf56018945e0540c5d5679981ff74af08422b03cd6aab43d5e3
3
+ size 472104