BrightBlueCheese commited on
Commit
a780c2f
1 Parent(s): 92b1a49
.ipynb_checkpoints/app-checkpoint.py CHANGED
@@ -1,4 +1,133 @@
 
1
  import streamlit as st
2
 
3
- x = st.slider('Slect a value')
4
- st.write(x, 'squared is :', x * x)
 
1
+
2
  import streamlit as st
3
+ import subprocess
4
+
5
+ subprocess.run(['git', 'clone', 'https://huggingface.co/ttmn/SolLlama-mtr'])
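+ # assumption: the cloned ./SolLlama-mtr folder holds the MTR checkpoint and the finetuned .pt weights referenced further below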
6
+
7
+ import sys
8
+ import os
9
+ import torch
10
+ import numpy as np
11
+ import pandas as pd
12
+ import warnings
13
+ import lightning as L
14
+ torch.set_float32_matmul_precision('high')
15
+ warnings.filterwarnings("ignore", module="pl_bolts")
16
+
17
+ sys.path.append( '../')
18
+
19
+ import tokenizer_sl, datamodule_finetune_sl, model_finetune_sl, chemllama_mtr, utils_sl
20
+ import auto_evaluator_sl
21
+
22
+ torch.manual_seed(1004)
23
+ np.random.seed(1004)
24
+
25
+ smiles_str = st.text_area('Enter a SMILES string')
26
+
27
+ ###
28
+ solute_or_solvent = 'solute'
29
+ solute_or_solvent = st.selectbox('Solute or Solvent', ['Solute', 'Solvent'])
30
+ ver_ft = 0 # version control for FT model & evaluation data # Or it will overwrite the models and results
31
+ batch_size_pair = [64, 64] if solute_or_solvent == 'solute' else [10, 10] # [train, valid(test)]
32
+ # 'solvent' has a very small dataset, so I think 10 for train and 10 for valid(test) should be the maximum values.
33
+ lr = 0.0001
34
+ epochs = 7
35
+ use_freeze = False # Freeze the model or not # False means not freezing
36
+ overwrite_level_2 = True
37
+ ###
38
+ max_seq_length = 512
39
+ tokenizer = tokenizer_sl.fn_load_tokenizer_llama(
40
+ max_seq_length=max_seq_length,
41
+ )
42
+ max_length = max_seq_length
43
+ num_workers = 2
44
+
45
+ # I just reused our previous research code with some modifications.
46
+ dir_main = "./"
47
+ name_model_mtr = "ChemLlama_Medium_30m_vloss_val_loss=0.029_ep_epoch=04.ckpt"
48
+
49
+ dir_model_mtr = f"{dir_main}/SolLlama-mtr/{name_model_mtr}"
50
+
51
+ max_seq_length = 512
52
+
53
+ tokenizer = tokenizer_sl.fn_load_tokenizer_llama(
54
+ max_seq_length=max_seq_length,
55
+ )
56
+ max_length = max_seq_length
57
+ num_workers = 2
58
+
59
+ ## FT
60
+
61
+ ver_ft = 0
62
+ dir_model_ft_to_save = f"{dir_main}/SolLlama-mtr"
63
+ # name_model_ft = 'Solvent.pt'
64
+ name_model_ft = f"{solute_or_solvent}.pt"
65
+
66
+ # Load dataset for finetune
67
+ batch_size_for_train = batch_size_pair[0]
68
+ batch_size_for_valid = batch_size_pair[1]
69
+
70
+ data_module = datamodule_finetune_sl.CustomFinetuneDataModule(
71
+ solute_or_solvent=solute_or_solvent,
72
+ tokenizer=tokenizer,
73
+ max_seq_length=max_length,
74
+ batch_size_train=batch_size_for_train,
75
+ batch_size_valid=batch_size_for_valid,
76
+ # num_device=int(config.NUM_DEVICE) * config.NUM_WORKERS_MULTIPLIER,
77
+ num_device=num_workers,
78
+ )
79
+
80
+ data_module.prepare_data(smiles_str=smiles_str)
81
+ data_module.setup()
82
+ # no training happens in this app (train_df is None), so use a placeholder;
+ # steps_per_epoch only feeds the finetune model's LR scheduler
+ steps_per_epoch = 1
83
+
84
+ # Load model and optimizer for finetune
85
+ learning_rate = lr
86
+
87
+
88
+ model_mtr = chemllama_mtr.ChemLlama.load_from_checkpoint(dir_model_mtr)
89
+
90
+
91
+ model_ft = model_finetune_sl.CustomFinetuneModel(
92
+ model_mtr=model_mtr,
93
+ steps_per_epoch=steps_per_epoch,
94
+ warmup_epochs=1,
95
+ max_epochs=epochs,
96
+ learning_rate=learning_rate,
97
+ # dataset_dict=dataset_dict,
98
+ use_freeze=use_freeze,
99
+ )
100
+
101
+ # 'SolLlama_solute_vloss_val_loss=0.082_ep_epoch=06.pt'
102
+
103
+ trainer = L.Trainer(
104
+ default_root_dir=dir_model_ft_to_save,
105
+ # profiler=profiler,
106
+ # logger=csv_logger,
107
+ accelerator='auto',
108
+ devices='auto',
109
+ # accelerator='gpu',
110
+ # devices=[0],
111
+ min_epochs=1,
112
+ max_epochs=epochs,
113
+ precision=32,
114
+ # callbacks=[checkpoint_callback]
115
+ )
116
+
117
+
118
+ # Predict
119
+ local_model_ft = utils_sl.load_model_ft_with_epoch(
120
+ class_model_ft=model_ft,
121
+ target_epoch=epochs - 1, # assumption: load the checkpoint from the final epoch
122
+ dir_model_ft=dir_model_ft_to_save,
123
+ name_model_ft=name_model_ft
124
+ )
125
+
126
+ result = trainer.predict(local_model_ft, data_module)
127
+ result_pred = list()
128
+ result_label = list()
129
+ for bat in range(len(result)):
130
+ result_pred.append(result[bat][0].squeeze())
131
+ result_label.append(result[bat][1])
132
 
133
+ st.write(result_pred)
 
.ipynb_checkpoints/auto_evaluator_sl-checkpoint.py ADDED
@@ -0,0 +1,297 @@
1
+ import sys
2
+ import os
3
+ import re
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
+ import lightning as L
8
+ from lightning.pytorch.loggers import CSVLogger
9
+ from lightning.pytorch.profilers import PyTorchProfiler
10
+ from lightning.pytorch.callbacks import ModelCheckpoint
11
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
12
+
13
+ # Now you can import from models_mtr
14
+ # from chemllama_mtr import ChemLlama
15
+ import chemllama_mtr
16
+
17
+ # from .datamodule_finetune import CustomFinetuneDataModule
18
+ import datamodule_finetune_sl
19
+ # from .model_finetune import CustomFinetuneModel
20
+ import model_finetune_sl
21
+ import utils_sl
22
+
23
+ def auto_evaluator_level_2_sol(
24
+ dir_model_mtr,
25
+ # dir_model_mtr_ep_to_save:str,
26
+ dir_model_ft_to_save:str,
27
+ tokenizer,
28
+ max_length:int,
29
+ # molnet_dict:dict,
30
+ # list_dataset_to_finetune:list,
31
+ solute_or_solvent:str,
32
+ num_workers:int,
33
+ batch_size_pair=[32, 48],
34
+ lr=0.0001,
35
+ overwrite_level_2:bool=False,
36
+ epochs:int=7,
37
+ use_freeze:bool=True
38
+ ):
39
+
40
+ """
41
+ Evaluate the "one" pretrained MTR model through multiple finetuning benchmark datasets.
42
+
43
+ Parameters:
44
+ # - dir_model_mtr_ep_to_save (str): The pretrained model for MTR with epoch.
45
+ # EX with 0 epoch:
46
+ # /master_directory/pre_trained_model_MTR_name/model_MTR_with_epoch
47
+ - batch_size_pair: The pair of the train and valid(+test) batch size (e.g. [32, 48] which is [32, int(32*1.5)])
48
+ - overwrite_level_2 (bool): If a folder with the same "dir_model_mtr_ep_to_save" already exists, overwrite it.
49
+ Warning! This option only applies to "dir_model_mtr_ep_to_save". Its subdirectories and files will be overwritten!
50
+ """
51
+
52
+
53
+ assert not (os.path.exists(dir_model_ft_to_save) and not overwrite_level_2), f"You set 'overwrite_level_2' to False and '{dir_model_ft_to_save}' already exists. Check it again."
54
+
55
+
56
+ model_mtr = chemllama_mtr.ChemLlama.load_from_checkpoint(dir_model_mtr)
57
+
58
+ # # local_dataset_to_finetune is a key of molnet_dict
59
+ # list_local_finetuned_result = list()
60
+ # for local_dataset_to_finetune in list_dataset_to_finetune:
61
+
62
+ # dataset_dict = molnet_dict[local_dataset_to_finetune]
63
+ # dataset_dict["dataset_name"] = local_dataset_to_finetune
64
+
65
+ # dir_model_ft = f"{dir_model_mtr_ep_to_save}/{dataset_dict['dataset_name']}"
66
+ dir_model_ft = f"{dir_model_ft_to_save}"
67
+ # name_model_ft = utils_sol.model_ft_namer(dataset_dict['dataset_name'])
68
+ name_model_ft = f"SolLlama_{solute_or_solvent}"
69
+
70
+ # array_level_1, model_ft, data_loader_test
71
+ array_level_1 = auto_evaluator_level_1_sol(
72
+ model_mtr=model_mtr,
73
+ dir_model_ft=dir_model_ft,
74
+ name_model_ft=name_model_ft,
75
+ # dataset_dict=dataset_dict,
76
+ solute_or_solvent=solute_or_solvent,
77
+ tokenizer=tokenizer,
78
+ max_length=max_length,
79
+ num_workers=num_workers,
80
+ batch_size_pair=batch_size_pair,
81
+ lr=lr,
82
+ epochs=epochs,
83
+ use_freeze=use_freeze,
84
+ )
85
+
86
+ return array_level_1
87
+
88
+ # list_local_finetuned_result.append(array_level_1)
89
+
90
+ # array_level_2 = np.vstack(list_local_finetuned_result)
91
+ # array_level_2 shaped (number of epochs x len(list_dataset_to_finetune), number of columns at the bottom)
92
+ # dataset_name, task, RMSE, MAE, p_value mantissa, p_value exponent, epoch, loss, loss_ranking, metric_1_ranking
93
+
94
+ # return array_level_2
95
+
96
+ def auto_evaluator_level_1_sol(
97
+ model_mtr,
98
+ dir_model_ft:str,
99
+ name_model_ft:str,
100
+ # dataset_dict:dict,
101
+ solute_or_solvent:str,
102
+ tokenizer,
103
+ max_length:int,
104
+ num_workers:int, ##
105
+ batch_size_pair=[32, 48],
106
+ lr=0.0001,
107
+ epochs:int=7,
108
+ use_freeze:bool=True,
109
+ ):
110
+
111
+ """
112
+ Automate the entire process of preparing "one" finetuning dataset + finetuning + evaluation.
113
+ This is the step before the level-2 evaluation automation.
114
+
115
+ Parameters:
116
+ - model_mtr: The pretrained model for MTR.
117
+ - dir_model_ft (str): The directory where the model is to be stored.
118
+ - name_model_ft (str): The name given to the finetuned model.
119
+ An example of the directory of the finetuned model at epoch 0:
120
+ {dir_folder}/{name_model_ft}_ep_000
121
+ - batch_size_pair: The pair of the train and valid(+test) batch size (e.g. [32, 48] which is [32, int(32*1.5)])
122
+ """
123
+
124
+ csv_logger = CSVLogger(
125
+ save_dir=dir_model_ft,
126
+ name=name_model_ft,
127
+ version=0,
128
+ )
129
+
130
+ checkpoint_callback = ModelCheckpoint(
131
+ monitor='val_loss',
132
+ filename=name_model_ft + '_vloss_{val_loss:.3f}_ep_{epoch:02d}',
133
+ every_n_epochs=1,
134
+ save_top_k=-1,
135
+ enable_version_counter=False, # keep the version == 0
136
+ save_weights_only=True,
137
+ )
138
+ checkpoint_callback.FILE_EXTENSION = ".pt"
139
+
140
+ # Load dataset for finetune
141
+ batch_size_for_train = batch_size_pair[0]
142
+ batch_size_for_valid = batch_size_pair[1]
143
+
144
+ data_module = datamodule_finetune_sl.CustomFinetuneDataModule(
145
+ solute_or_solvent=solute_or_solvent,
146
+ tokenizer=tokenizer,
147
+ max_seq_length=max_length,
148
+ batch_size_train=batch_size_for_train,
149
+ batch_size_valid=batch_size_for_valid,
150
+ # num_device=int(config.NUM_DEVICE) * config.NUM_WORKERS_MULTIPLIER,
151
+ num_device=num_workers,
152
+ )
153
+ data_module.prepare_data()
154
+ data_module.setup()
155
+ steps_per_epoch = len(data_module.train_dataloader())
156
+
157
+ # Load model and optimizer for finetune
158
+ learning_rate = lr
159
+
160
+ model_ft = model_finetune_sl.CustomFinetuneModel(
161
+ model_mtr=model_mtr,
162
+ steps_per_epoch=steps_per_epoch,
163
+ warmup_epochs=1,
164
+ max_epochs=epochs,
165
+ learning_rate=learning_rate,
166
+ # dataset_dict=dataset_dict,
167
+ use_freeze=use_freeze,
168
+ )
169
+
170
+ trainer = L.Trainer(
171
+ default_root_dir=dir_model_ft,
172
+ # profiler=profiler,
173
+ logger=csv_logger,
174
+ accelerator='auto',
175
+ devices='auto',
176
+ # accelerator='gpu',
177
+ # devices=[0],
178
+ min_epochs=1,
179
+ max_epochs=epochs,
180
+ precision=32,
181
+ callbacks=[checkpoint_callback]
182
+ )
183
+ trainer.fit(model_ft, data_module)
184
+ trainer.validate(model_ft, data_module)
185
+
186
+ list_validation_loss = pd.read_csv(f"{dir_model_ft}/{name_model_ft}/version_0/metrics.csv", usecols=['val_loss'])['val_loss'].dropna().tolist()[:epochs]
187
+
188
+ # class_model_ft = CustomFinetuneModel
189
+ # Level 1 Automation - Evaluate the finetuned model at every epoch
190
+ array_level_1 = auto_evaluator_level_1_sub_sol(
191
+ class_model_ft=model_ft,
192
+ list_validation_loss=list_validation_loss,
193
+ dir_model_ft=dir_model_ft,
194
+ name_model_ft=name_model_ft,
195
+ data_module=data_module,
196
+ # dataset_dict=dataset_dict,
197
+ solute_or_solvent=solute_or_solvent,
198
+ trainer=trainer
199
+ )
200
+
201
+ return array_level_1
202
+
203
+ def auto_evaluator_level_1_sub_sol(
204
+ class_model_ft,
205
+ list_validation_loss,
206
+ dir_model_ft:str,
207
+ name_model_ft:str,
208
+ data_module,
209
+ # dataset_dict:dict,
210
+ solute_or_solvent:str,
211
+ trainer,
212
+ ):
213
+
214
+ """
215
+ Evaluate the finetuned model on a single finetuning dataset.
216
+
217
+ Guides for some parameters:
218
+ - model_mtr: The pretrained model for MTR.
219
+ - dir_model_ft (str): The directory where the model is to be stored.
220
+ - name_model_ft (str): The name given to the finetuned model.
221
+ An example of the directory of the finetuned model at epoch 0:
222
+ {dir_folder}/{name_model_ft}_ep_000
223
+ """
224
+
225
+ array_loss_ranking = utils_sl.rank_value_sol(
226
+ list_value=list_validation_loss,
227
+ # dataset_dict=dataset_dict,
228
+ is_loss=True,
229
+ )
230
+ # ranking : lower the better. ranking starting from 0
231
+
232
+ print("- Epoch starts from 0")
233
+ print("=======================================")
234
+
235
+ list_level_1 = list()
236
+ for ep in range(len(list_validation_loss)):
237
+
238
+ local_model_ft = utils_sl.load_model_ft_with_epoch(
239
+ class_model_ft=class_model_ft,
240
+ target_epoch=ep,
241
+ dir_model_ft=dir_model_ft,
242
+ name_model_ft=name_model_ft
243
+ )
244
+
245
+ result = trainer.predict(local_model_ft, data_module)
246
+ result_pred = list()
247
+ result_label = list()
248
+ for bat in range(len(result)):
249
+ result_pred.append(result[bat][0].squeeze())
250
+ result_label.append(result[bat][1])
251
+
252
+ list_local_model_ft_result = utils_sl.model_evalulator_sol(
253
+ array_predictions=np.vstack(result_pred),
254
+ array_labels=np.vstack(result_label),
255
+ # dataset_dict=dataset_dict,
256
+ solute_or_solvent=solute_or_solvent,
257
+ show_plot=False,
258
+ print_result=False,
259
+ )
260
+ # dataset_name, task, RMSE, MAE, p_value mantissa, p_value exponent
261
+
262
+ # add epoch (starting from 0) to the right
263
+ list_local_model_ft_result.append(ep)
264
+ # dataset_name, task, metric1 (RMSE or ROC-AUC), metric2 (MAE or None), p_value mantissa, p_value exponent, epoch
265
+
266
+ list_level_1.append(list_local_model_ft_result)
267
+ print("=======================================")
268
+ print("=======================================")
269
+
270
+ # to get the metric_1 ranking
271
+ array_level_1 = np.array(list_level_1)
272
+ array_metric_1 = array_level_1[:, 2].astype('float32')
273
+ array_metric_1_ranking = utils_sl.rank_value_sol(list_value=array_metric_1,
274
+ # dataset_dict=dataset_dict,
275
+ is_loss=False)
276
+
277
+ # add loss, and ranking of the loss value to the right
278
+ # reg: lower the better, class: higher the better
279
+ array_level_1 = np.hstack((list_level_1,
280
+ np.expand_dims(list_validation_loss, axis=1),
281
+ np.expand_dims(array_loss_ranking, axis=1),
282
+ np.expand_dims(array_metric_1_ranking, axis=1)))
283
+ # solute_or_solvent, RMSE, MAE, p_value mantissa, p_value exponent, epoch, loss, loss_ranking, metric_1_ranking
284
+
285
+ return array_level_1
286
+ #################################### EX #########################################
287
+ # list_column_names = ['solute_or_solvent',
288
+ # 'metric_1',
289
+ # 'metric_2',
290
+ # 'p_value_mantissa',
291
+ # 'p_value_exponent',
292
+ # 'epoch',
293
+ # 'loss',
294
+ # 'loss_ranking',
295
+ # 'metric_1_ranking']
296
+ # df_evaluation_level_1 = pd.DataFrame(array_level_1, columns=list_column_names)
297
+ #################################################################################
.ipynb_checkpoints/datamodule_finetune_sl-checkpoint.py ADDED
@@ -0,0 +1,120 @@
1
+ import lightning as L
2
+ import torch
3
+ import numpy as np
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from torch.utils.data import Dataset, DataLoader
8
+ from transformers import DataCollatorWithPadding
9
+
10
+
11
+ class CustomLlamaDatasetAbraham(Dataset):
12
+ def __init__(self, df, tokenizer, max_seq_length):
13
+ self.keys = df.iloc[:, 0] # 1D array
14
+ self.labels = df.iloc[:, 1:] # 2D array
15
+ self.tokenizer = tokenizer
16
+ self.max_seq_length = max_seq_length
17
+
18
+ def __len__(self):
19
+ return self.keys.shape[0]
20
+
21
+ def fn_token_encode(self, smiles):
22
+ return self.tokenizer(
23
+ smiles,
24
+ truncation=True,
25
+ padding="max_length",
26
+ max_length=self.max_seq_length,
27
+ )
28
+
29
+ def __getitem__(self, idx):
30
+ local_encoded = self.fn_token_encode(self.keys.iloc[idx])
31
+
32
+ return {
33
+ "input_ids": torch.tensor(local_encoded["input_ids"]),
34
+ "attention_mask": torch.tensor(local_encoded["attention_mask"]),
35
+ "labels": torch.tensor(self.labels.iloc[idx]),
36
+ }
37
+
38
+
39
+ class CustomFinetuneDataModule(L.LightningDataModule):
40
+ def __init__(
41
+ self,
42
+ solute_or_solvent,
43
+ tokenizer,
44
+ max_seq_length,
45
+ batch_size_train,
46
+ batch_size_valid,
47
+ num_device,
48
+ ):
49
+ super().__init__()
50
+
51
+ self.solute_or_solvent = solute_or_solvent
52
+ self.tokenizer = tokenizer
53
+ self.max_seq_length = max_seq_length
54
+ self.batch_size_train = batch_size_train
55
+ self.batch_size_valid = batch_size_valid
56
+ self.data_collator = DataCollatorWithPadding(self.tokenizer)
57
+ self.num_device = num_device
58
+
59
+
60
+ def prepare_data(self, smiles_str: str = None): # default None so the training pipeline can call prepare_data() without arguments
61
+ # self.list_df = load_abraham(self.solute_or_solvent)
62
+ self.smiles_str = smiles_str
63
+
64
+ def setup(self, stage=None):
65
+ # self.train_df, self.valid_df, self.test_df = self.list_df
66
+ self.train_df = None
67
+ self.valid_df = None
68
+ # wrap the single SMILES string in a one-row DataFrame (column 0 = SMILES, columns 1: = dummy targets);
+ # CustomLlamaDatasetAbraham indexes a DataFrame, not a raw string. Assumption: five targets, matching the regression head.
+ self.test_df = pd.DataFrame([[self.smiles_str] + [0.0] * 5])
69
+
70
+ def train_dataloader(self):
71
+ return DataLoader(
72
+ dataset=CustomLlamaDatasetAbraham(
73
+ self.train_df, self.tokenizer, self.max_seq_length,
74
+ ),
75
+ batch_size=self.batch_size_train,
76
+ num_workers=self.num_device,
77
+ collate_fn=self.data_collator,
78
+ shuffle=True,
79
+ )
80
+
81
+ def val_dataloader(self):
82
+ return DataLoader(
83
+ dataset=CustomLlamaDatasetAbraham(
84
+ self.valid_df, self.tokenizer, self.max_seq_length,
85
+ ),
86
+ batch_size=self.batch_size_valid,
87
+ num_workers=self.num_device,
88
+ collate_fn=self.data_collator,
89
+ shuffle=False,
90
+ )
91
+
92
+ def test_dataloader(self):
93
+ return DataLoader(
94
+ dataset=CustomLlamaDatasetAbraham(
95
+ self.test_df, self.tokenizer, self.max_seq_length,
96
+ ),
97
+ batch_size=self.batch_size_valid,
98
+ num_workers=self.num_device,
99
+ collate_fn=self.data_collator,
100
+ shuffle=False,
101
+ )
102
+
103
+ # It uses test_df
104
+ def predict_dataloader(self):
105
+ return DataLoader(
106
+ dataset=CustomLlamaDatasetAbraham(
107
+ self.test_df, self.tokenizer, self.max_seq_length,
108
+ ),
109
+ batch_size=self.batch_size_valid,
110
+ num_workers=self.num_device,
111
+ collate_fn=self.data_collator,
112
+ shuffle=False,
113
+ )
114
+
115
+
116
+
117
+
118
+
119
+
120
+
.ipynb_checkpoints/dict_dtype_slpy-checkpoint ADDED
@@ -0,0 +1,9 @@
1
+ dict_dtype = {
2
+ "solute_or_solvent": "str",
3
+ "metric_1": "float32",
4
+ "metric_2": "float32",
5
+ "epoch": "int32",
6
+ "loss": "float32",
7
+ "loss_ranking": "int32",
8
+ "metric_1_ranking": "int32",
9
+ }
.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1,12 @@
1
+ torch
2
+ transformers
3
+ lightning
4
+ lightning-bolts
5
+ numpy
6
+ pytorch-ignite
7
+ pytorch-lightning
8
+ pytorch-lightning-bolts
9
+ pytorch-warmup
10
+ scikit-learn
11
+ scipy
12
+ seaborn
.ipynb_checkpoints/run_auto_llama_cuda0-checkpoint.py ADDED
@@ -0,0 +1,122 @@
1
+ import sys
2
+ import os
3
+
4
+ # This means you will use the first GPU among the four GPUs in our case.
5
+ # "0", "1", "2", "3". Since FT dataset is small, using one GPU should be proper.
6
+ os.environ["CUDA_VISIBLE_DEVICES"]= "0"
7
+
8
+
9
+ import torch
10
+ import numpy as np
11
+ import pandas as pd
12
+ import warnings
13
+ import lightning as L
14
+ torch.set_float32_matmul_precision('high')
15
+
16
+ # Filter out FutureWarning and UnderReviewWarning messages from pl_bolts
17
+ warnings.filterwarnings("ignore", module="pl_bolts")
18
+
19
+ # Add the parent directory to sys.path
20
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
21
+
22
+ import tokenizer_sl
23
+ import auto_evaluator_sl
24
+
25
+ print(os.path.dirname(__file__))
26
+
27
+
28
+ torch.manual_seed(1004)
29
+ np.random.seed(1004)
30
+
31
+ print(os.getcwd())
32
+
33
+ """
34
+ Note 1 to Dr. Lang
35
+
36
+ I have checked that when we do not freeze the MTR model, the test loss keeps decreasing when I set the epochs to 7.
37
+ (At least for solute.) So you may try to run more epochs if you want. But solvent may already be overfitted, or will be soon, since it has only a few data points.
38
+
39
+ Using a learning rate bigger than the default setting is not recommended since we don't freeze the MTR model.
40
+ But lower lr could work.
41
+
42
+ Be careful with version control (ver_ft). Make sure you keep the same version for both 'solute' and 'solvent'; otherwise you will get confused.
43
+
44
+ The variable "dir_model_ft_to_save" is where the FT model gets saved.
45
+ The resulting csv files will be located at 'evaluations/ft_version_{ver_ft}/solute.csv and (or) solvent.csv'.
46
+
47
+ You can run this code by
48
+ cd ~/SolLlama
49
+ python run_auto_llama_cuda0.py
50
+
51
+ But make sure you are running this in your virtual environment with everything in requirements_cuda118.txt installed.
52
+ """
53
+
54
+
55
+ """
56
+ # You can run both 'solute' and 'solvent' in one run by doing the below
57
+ for solute_or_solvent in ['solute' ,'solvent']:
58
+ and indenting the REST of the code (everything except the solute_or_solvent assignment right below) to this SAME level; a minimal sketch follows this docstring.
59
+ """
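+ # A minimal sketch of that loop (hypothetical; it reuses the names defined below):
+ #
+ # for solute_or_solvent in ['solute', 'solvent']:
+ #     batch_size_pair = [64, 64] if solute_or_solvent == 'solute' else [10, 10]
+ #     array_level_2 = auto_evaluator_sl.auto_evaluator_level_2_sol(...)  # same arguments as the call below
+ #     df = pd.DataFrame(array_level_2, columns=list_column_names_level_2)
+ #     df.to_csv(f'{os.path.dirname(__file__)}/evaluations/ft_version_{ver_ft}/{solute_or_solvent}.csv', index=False)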
60
+ #### Hyper Parameters ##### <- You can control these parameters as you want
61
+ # solute_or_solvent = 'solvent'
62
+ solute_or_solvent = 'solute'
63
+ ver_ft = 0 # version control for FT model & evaluation data # Or it will overwrite the models and results
64
+ batch_size_pair = [64, 64] if solute_or_solvent == 'solute' else [10, 10] # [train, valid(test)]
65
+ # 'solvent' has a very small dataset, so I think 10 for train and 10 for valid(test) should be the maximum values.
66
+ lr = 0.0001
67
+ epochs = 7
68
+ use_freeze = False # Freeze the model or not # False means not freezing
69
+ overwrite_level_2 = True # If you don't want to overwrite the models and csv files, then change this to False
70
+ ###########################
71
+
72
+
73
+ # I just reused our previous research code with some modifications.
74
+ dir_main = "/home/ylee/SolLlama"
75
+ name_model_mtr = "ChemLlama_Medium_30m_vloss_val_loss=0.029_ep_epoch=04.ckpt"
76
+
77
+ dir_model_mtr = f"{dir_main}/model_mtr/{name_model_mtr}"
78
+
79
+ max_seq_length = 512
80
+
81
+ tokenizer = tokenizer_sl.fn_load_tokenizer_llama(
82
+ max_seq_length=max_seq_length,
83
+ )
84
+ max_length = max_seq_length
85
+ num_workers = 2
86
+
87
+ dir_model_ft_to_save = f"{dir_main}/save_models_ft/ft_version_{ver_ft}"
88
+
89
+ array_level_2 = auto_evaluator_sl.auto_evaluator_level_2_sol(
90
+ dir_model_mtr=dir_model_mtr,
91
+ dir_model_ft_to_save=dir_model_ft_to_save,
92
+ tokenizer=tokenizer,
93
+ max_length=max_seq_length,
94
+ solute_or_solvent=solute_or_solvent,
95
+ num_workers=num_workers,
96
+ batch_size_pair=batch_size_pair,
97
+ lr=lr,
98
+ overwrite_level_2=overwrite_level_2,
99
+ epochs=epochs,
100
+ use_freeze=use_freeze,
101
+ )
102
+
103
+ print(array_level_2.shape)
104
+ print(array_level_2)
105
+
106
+ list_column_names_level_2 = [
107
+ 'solute_or_solvent',
108
+ 'metric_1',
109
+ 'metric_2',
110
+ 'epoch',
111
+ 'loss',
112
+ 'loss_ranking',
113
+ 'metric_1_ranking'
114
+ ]
115
+
116
+ df_evaluation_level_2 = pd.DataFrame(array_level_2, columns=list_column_names_level_2)
117
+
118
+ os.makedirs(f'{os.path.dirname(__file__)}/evaluations/ft_version_{ver_ft}', exist_ok=True)
119
+ df_evaluation_level_2.to_csv(f'{os.path.dirname(__file__)}/evaluations/ft_version_{ver_ft}/{solute_or_solvent}.csv', index=False)
120
+
121
+
122
+
.ipynb_checkpoints/tokenizer_sl-checkpoint.py ADDED
@@ -0,0 +1,35 @@
1
+ from transformers import LlamaTokenizerFast
2
+ import json
3
+ import os
4
+
5
+
6
+ def fn_load_tokenizer_llama(
7
+ max_seq_length,
8
+ dir_tokenizer: str = "./tokenizer.json",
9
+ # dir_tokenizer:str = os.path.abspath(os.path.join(os.getcwd(), '..', "models_mtr/tokenizer.json")), # for JUP
10
+ add_eos_token:bool = True,
11
+ ):
12
+
13
+ tokenizer = LlamaTokenizerFast(
14
+ tokenizer_file=dir_tokenizer,
15
+ model_max_length=max_seq_length,
16
+ padding_side="right",
17
+ bos_token="<s>",
18
+ eos_token="</s>",
19
+ unk_token="<unk>",
20
+ add_eos_token=add_eos_token,
21
+ )
22
+ tokenizer.add_special_tokens({"pad_token": "<pad>", "sep_token": "</s>", "cls_token": "<s>", "mask_token":"<mask>"})
23
+ # tokenizer.add_special_tokens({"pad_token": "<pad>"})
24
+
25
+ return tokenizer
26
+
27
+ def fn_load_descriptor_list(
28
+ key_descriptor_list,
29
+ dir_descriptor_list,
30
+ ):
31
+
32
+ with open(dir_descriptor_list, "r") as js:
33
+ list_descriptor = json.load(js)[key_descriptor_list]
34
+
35
+ return list_descriptor
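+ # Example (assuming a tokenizer.json exists at the default path in the working directory):
+ #   tokenizer = fn_load_tokenizer_llama(max_seq_length=512)
+ #   encoded = tokenizer("CCO", truncation=True, padding="max_length", max_length=512)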
.ipynb_checkpoints/utils_sl-checkpoint.py ADDED
@@ -0,0 +1,114 @@
1
+ from sklearn.metrics import roc_curve, auc, roc_auc_score
2
+ from sklearn.metrics import mean_squared_error
3
+ from sklearn.metrics import r2_score
4
+ from sklearn.metrics import mean_absolute_error
5
+ from scipy.stats import spearmanr
6
+ import matplotlib.pyplot as plt
7
+
8
+ import numpy as np
9
+ import os
10
+
11
+ def model_evalulator_sol(
12
+ array_predictions,
13
+ array_labels,
14
+ # dataset_dict:dict,
15
+ solute_or_solvent:str,
16
+ show_plot:bool=True,
17
+ print_result:bool=True,
18
+ ):
19
+
20
+ if print_result:
21
+ print(f"Dataset : {solute_or_solvent}")
22
+ print("N:", array_labels.shape[0])
23
+
24
+ fig, ax = plt.subplots()
25
+
26
+ metric = mean_squared_error(array_labels, array_predictions, squared=False) #RMSE
27
+ r2 = r2_score(array_labels, array_predictions)
28
+ metric2 = mean_absolute_error(array_labels, array_predictions) # MAE
29
+ ax.scatter(array_labels, array_predictions)
30
+ ax.set_title("Scatter Plot of Labels vs Predictions")
31
+ ax.set_xlabel("Labels")
32
+ ax.set_ylabel("Predictions")
33
+
34
+ if print_result:
35
+ print("R2:", r2)
36
+ print("Root Mean Square Error:", metric)
37
+ print("Mean Absolute Error:", metric2)
38
+
39
+ # correlation, p_value = spearmanr(array_labels, array_predictions)
40
+
41
+ # if print_result:
42
+ # print("Spearman correlation:", correlation)
43
+ # print("p-value:", p_value)
44
+ # print("=======================================")
45
+
46
+ xmin, xmax = ax.get_xlim()
47
+ ax.set_ylim(xmin, xmax)
48
+
49
+ if not show_plot:
50
+ plt.ioff()
51
+ plt.clf()
52
+ plt.close()
53
+ else :
54
+ plt.show()
55
+
56
+ # metric 1 - ROC score (classification) | RMSE (regression)
57
+ # metric 2 - None (classification) | MAE ( regression)
58
+ round_decimal = 6
59
+ if metric2 is not None:
60
+ metric2 = round(metric2, round_decimal)
61
+
62
+ # list_p_value = str(p_value).split('e')
63
+ # p_value_mantissa = round(float(list_p_value[0]), round_decimal)
64
+ # if len(list_p_value) == 2:
65
+ # p_value_exponent = int(list_p_value[1])
66
+ # else:
67
+ # p_value_exponent = None
68
+
69
+ return [solute_or_solvent,
70
+ round(metric, round_decimal),
71
+ metric2]
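+ # -> [solute_or_solvent, RMSE, MAE]; auto_evaluator_level_1_sub_sol appends the epoch, loss and rankings to this list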
72
+ # return [solute_or_solvent,
73
+ # round(metric, round_decimal),
74
+ # metric2,
75
+ # p_value_mantissa,
76
+ # p_value_exponent]
77
+
78
+ # from .model_finetune import CustomFinetuneModel
79
+ # import model_finetune_sol
80
+ import torch
81
+ def load_model_ft_with_epoch(class_model_ft,
82
+ target_epoch:int,
83
+ dir_model_ft:str,
84
+ name_model_ft:str):
85
+ # dir_model_ft level 1
86
+ # ex /main/model_mtr/model_mtr_ep/dataset
87
+
88
+ dir_all_model_ft = f"{dir_model_ft}/{name_model_ft}/version_0/checkpoints/"
89
+ list_files_in_dir_model_ft = os.listdir(dir_all_model_ft)
90
+ # extension = '.ckpt'
91
+ extension = '.pt'
92
+ list_model_ft_in_the_dir = sorted(list_files_in_dir_model_ft, key=lambda x: float(x.split('=')[-1].split('.')[0]))
93
+
94
+ print(f"Loaded model with epoch {target_epoch}")
95
+ dir_target_model_ft = f"{dir_all_model_ft}/{list_model_ft_in_the_dir[target_epoch]}"
96
+
97
+ # class_model_ft.load_from_checkpoint(dir_target_model_ft)
98
+
99
+ loaded_state_dict = torch.load(dir_target_model_ft)
100
+ class_model_ft.load_state_dict(loaded_state_dict['state_dict'])
101
+
102
+ return class_model_ft # now is model_ft
103
+
104
+ from scipy.stats import rankdata
105
+ # rankdata does not consider decimal places!
106
+ def rank_value_sol(
107
+ list_value,
108
+ # dataset_dict:dict,
109
+ is_loss:bool=True,
110
+ ):
111
+
112
+ list_value = np.array(list_value)
113
+
114
+ return np.array(rankdata(list_value * 100000, method='min')) - 1
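+ # e.g. rank_value_sol([0.3, 0.1, 0.2]) -> array([2, 0, 1]); note that `is_loss` is currently unused,
+ # so metrics where higher is better are still ranked in ascending order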
app.py CHANGED
@@ -1,4 +1,133 @@
 
1
  import streamlit as st
2
 
3
- x = st.slider('Slect a value')
4
- st.write(x, 'squared is :', x * x)
 
1
+
2
  import streamlit as st
3
+ import subprocess
4
+
5
+ subprocess.run(['git', 'clone', 'https://huggingface.co/ttmn/SolLlama-mtr'])
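+ # assumption: the cloned ./SolLlama-mtr folder holds the MTR checkpoint and the finetuned .pt weights referenced further below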
6
+
7
+ import sys
8
+ import os
9
+ import torch
10
+ import numpy as np
11
+ import pandas as pd
12
+ import warnings
13
+ import lightning as L
14
+ torch.set_float32_matmul_precision('high')
15
+ warnings.filterwarnings("ignore", module="pl_bolts")
16
+
17
+ sys.path.append( '../')
18
+
19
+ import tokenizer_sl, datamodule_finetune_sl, model_finetune_sl, chemllama_mtr, utils_sl
20
+ import auto_evaluator_sl
21
+
22
+ torch.manual_seed(1004)
23
+ np.random.seed(1004)
24
+
25
+ smiles_str = st.text_area('Enter a SMILES string')
26
+
27
+ ###
28
+ solute_or_solvent = 'solute'
29
+ solute_or_solvent = st.selectbox('Solute or Solvent', ['Solute', 'Solvent'])
30
+ ver_ft = 0 # version control for FT model & evaluation data # Or it will overwrite the models and results
31
+ batch_size_pair = [64, 64] if solute_or_solvent == 'solute' else [10, 10] # [train, valid(test)]
32
+ # 'solvent' has a very small dataset, so I think 10 for train and 10 for valid(test) should be the maximum values.
33
+ lr = 0.0001
34
+ epochs = 7
35
+ use_freeze = False # Freeze the model or not # False means not freezing
36
+ overwrite_level_2 = True
37
+ ###
38
+ max_seq_length = 512
39
+ tokenizer = tokenizer_sl.fn_load_tokenizer_llama(
40
+ max_seq_length=max_seq_length,
41
+ )
42
+ max_length = max_seq_length
43
+ num_workers = 2
44
+
45
+ # I just reused our previous research code with some modifications.
46
+ dir_main = "./"
47
+ name_model_mtr = "ChemLlama_Medium_30m_vloss_val_loss=0.029_ep_epoch=04.ckpt"
48
+
49
+ dir_model_mtr = f"{dir_main}/SolLlama-mtr/{name_model_mtr}"
50
+
51
+ max_seq_length = 512
52
+
53
+ tokenizer = tokenizer_sl.fn_load_tokenizer_llama(
54
+ max_seq_length=max_seq_length,
55
+ )
56
+ max_length = max_seq_length
57
+ num_workers = 2
58
+
59
+ ## FT
60
+
61
+ ver_ft = 0
62
+ dir_model_ft_to_save = f"{dir_main}/SolLlama-mtr"
63
+ # name_model_ft = 'Solvent.pt'
64
+ name_model_ft = f"{solute_or_solvent}.pt"
65
+
66
+ # Load dataset for finetune
67
+ batch_size_for_train = batch_size_pair[0]
68
+ batch_size_for_valid = batch_size_pair[1]
69
+
70
+ data_module = datamodule_finetune_sl.CustomFinetuneDataModule(
71
+ solute_or_solvent=solute_or_solvent,
72
+ tokenizer=tokenizer,
73
+ max_seq_length=max_length,
74
+ batch_size_train=batch_size_for_train,
75
+ batch_size_valid=batch_size_for_valid,
76
+ # num_device=int(config.NUM_DEVICE) * config.NUM_WORKERS_MULTIPLIER,
77
+ num_device=num_workers,
78
+ )
79
+
80
+ data_module.prepare_data(smiles_str=smiles_str)
81
+ data_module.setup()
82
+ # no training happens in this app (train_df is None), so use a placeholder;
+ # steps_per_epoch only feeds the finetune model's LR scheduler
+ steps_per_epoch = 1
83
+
84
+ # Load model and optimizer for finetune
85
+ learning_rate = lr
86
+
87
+
88
+ model_mtr = chemllama_mtr.ChemLlama.load_from_checkpoint(dir_model_mtr)
89
+
90
+
91
+ model_ft = model_finetune_sl.CustomFinetuneModel(
92
+ model_mtr=model_mtr,
93
+ steps_per_epoch=steps_per_epoch,
94
+ warmup_epochs=1,
95
+ max_epochs=epochs,
96
+ learning_rate=learning_rate,
97
+ # dataset_dict=dataset_dict,
98
+ use_freeze=use_freeze,
99
+ )
100
+
101
+ # 'SolLlama_solute_vloss_val_loss=0.082_ep_epoch=06.pt'
102
+
103
+ trainer = L.Trainer(
104
+ default_root_dir=dir_model_ft_to_save,
105
+ # profiler=profiler,
106
+ # logger=csv_logger,
107
+ accelerator='auto',
108
+ devices='auto',
109
+ # accelerator='gpu',
110
+ # devices=[0],
111
+ min_epochs=1,
112
+ max_epochs=epochs,
113
+ precision=32,
114
+ # callbacks=[checkpoint_callback]
115
+ )
116
+
117
+
118
+ # Predict
119
+ local_model_ft = utils_sl.load_model_ft_with_epoch(
120
+ class_model_ft=model_ft,
121
+ target_epoch=epochs - 1, # assumption: load the checkpoint from the final epoch
122
+ dir_model_ft=dir_model_ft_to_save,
123
+ name_model_ft=name_model_ft
124
+ )
125
+
126
+ result = trainer.predict(local_model_ft, data_module)
127
+ result_pred = list()
128
+ result_label = list()
129
+ for bat in range(len(result)):
130
+ result_pred.append(result[bat][0].squeeze())
131
+ result_label.append(result[bat][1])
132
 
133
+ st.write(result_pred)
 
auto_evaluator_sl.py ADDED
@@ -0,0 +1,297 @@
1
+ import sys
2
+ import os
3
+ import re
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
+ import lightning as L
8
+ from lightning.pytorch.loggers import CSVLogger
9
+ from lightning.pytorch.profilers import PyTorchProfiler
10
+ from lightning.pytorch.callbacks import ModelCheckpoint
11
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
12
+
13
+ # Now you can import from models_mtr
14
+ # from chemllama_mtr import ChemLlama
15
+ import chemllama_mtr
16
+
17
+ # from .datamodule_finetune import CustomFinetuneDataModule
18
+ import datamodule_finetune_sl
19
+ # from .model_finetune import CustomFinetuneModel
20
+ import model_finetune_sl
21
+ import utils_sl
22
+
23
+ def auto_evaluator_level_2_sol(
24
+ dir_model_mtr,
25
+ # dir_model_mtr_ep_to_save:str,
26
+ dir_model_ft_to_save:str,
27
+ tokenizer,
28
+ max_length:int,
29
+ # molnet_dict:dict,
30
+ # list_dataset_to_finetune:list,
31
+ solute_or_solvent:str,
32
+ num_workers:int,
33
+ batch_size_pair=[32, 48],
34
+ lr=0.0001,
35
+ overwrite_level_2:bool=False,
36
+ epochs:int=7,
37
+ use_freeze:bool=True
38
+ ):
39
+
40
+ """
41
+ Evaluate the "one" pretrained MTR model through multiple finetuning benchmark datasets.
42
+
43
+ Parameters:
44
+ # - dir_model_mtr_ep_to_save (str): The pretrained model for MTR with epoch.
45
+ # EX with 0 epoch:
46
+ # /master_directory/pre_trained_model_MTR_name/model_MTR_with_epoch
47
+ - batch_size_pair: The pair of the train and valid(+test) batch size (e.g. [32, 48] which is [32, int(32*1.5)])
48
+ - overwrite_level_2 (bool): If a folder with the same "dir_model_mtr_ep_to_save" already exists, overwrite it.
49
+ Warning! This option only applies to "dir_model_mtr_ep_to_save". Its subdirectories and files will be overwritten!
50
+ """
51
+
52
+
53
+ assert not (os.path.exists(dir_model_ft_to_save) and not overwrite_level_2), f"You set 'overwrite_level_2' to False and '{dir_model_ft_to_save}' already exists. Check it again."
54
+
55
+
56
+ model_mtr = chemllama_mtr.ChemLlama.load_from_checkpoint(dir_model_mtr)
57
+
58
+ # # local_dataset_to_finetune is a key of molnet_dict
59
+ # list_local_finetuned_result = list()
60
+ # for local_dataset_to_finetune in list_dataset_to_finetune:
61
+
62
+ # dataset_dict = molnet_dict[local_dataset_to_finetune]
63
+ # dataset_dict["dataset_name"] = local_dataset_to_finetune
64
+
65
+ # dir_model_ft = f"{dir_model_mtr_ep_to_save}/{dataset_dict['dataset_name']}"
66
+ dir_model_ft = f"{dir_model_ft_to_save}"
67
+ # name_model_ft = utils_sol.model_ft_namer(dataset_dict['dataset_name'])
68
+ name_model_ft = f"SolLlama_{solute_or_solvent}"
69
+
70
+ # array_level_1, model_ft, data_loader_test
71
+ array_level_1 = auto_evaluator_level_1_sol(
72
+ model_mtr=model_mtr,
73
+ dir_model_ft=dir_model_ft,
74
+ name_model_ft=name_model_ft,
75
+ # dataset_dict=dataset_dict,
76
+ solute_or_solvent=solute_or_solvent,
77
+ tokenizer=tokenizer,
78
+ max_length=max_length,
79
+ num_workers=num_workers,
80
+ batch_size_pair=batch_size_pair,
81
+ lr=lr,
82
+ epochs=epochs,
83
+ use_freeze=use_freeze,
84
+ )
85
+
86
+ return array_level_1
87
+
88
+ # list_local_finetuned_result.append(array_level_1)
89
+
90
+ # array_level_2 = np.vstack(list_local_finetuned_result)
91
+ # array_level_2 shaped (number of epochs x len(list_dataset_to_finetune), number of columns at the bottom)
92
+ # dataset_name, task, RMSE, MAE, p_value mantissa, p_value exponent, epoch, loss, loss_ranking, metric_1_ranking
93
+
94
+ # return array_level_2
95
+
96
+ def auto_evaluator_level_1_sol(
97
+ model_mtr,
98
+ dir_model_ft:str,
99
+ name_model_ft:str,
100
+ # dataset_dict:dict,
101
+ solute_or_solvent:str,
102
+ tokenizer,
103
+ max_length:int,
104
+ num_workers:int, ##
105
+ batch_size_pair=[32, 48],
106
+ lr=0.0001,
107
+ epochs:int=7,
108
+ use_freeze:bool=True,
109
+ ):
110
+
111
+ """
112
+ Automate the entire process of preparing "one" finetuning dataset + finetuning + evaluation.
113
+ This is the step before the level-2 evaluation automation.
114
+
115
+ Parameters:
116
+ - model_mtr: The pretrained model for MTR.
117
+ - dir_model_ft (str): The directory where the model is to be stored.
118
+ - name_model_ft (str): The name given to the finetuned model.
119
+ An example of the directory of the finetuned model at epoch 0:
120
+ {dir_folder}/{name_model_ft}_ep_000
121
+ - batch_size_pair: The pair of the train and valid(+test) batch size (e.g. [32, 48] which is [32, int(32*1.5)])
122
+ """
123
+
124
+ csv_logger = CSVLogger(
125
+ save_dir=dir_model_ft,
126
+ name=name_model_ft,
127
+ version=0,
128
+ )
129
+
130
+ checkpoint_callback = ModelCheckpoint(
131
+ monitor='val_loss',
132
+ filename=name_model_ft + '_vloss_{val_loss:.3f}_ep_{epoch:02d}',
133
+ every_n_epochs=1,
134
+ save_top_k=-1,
135
+ enable_version_counter=False, # keep the version == 0
136
+ save_weights_only=True,
137
+ )
138
+ checkpoint_callback.FILE_EXTENSION = ".pt"
139
+
140
+ # Load dataset for finetune
141
+ batch_size_for_train = batch_size_pair[0]
142
+ batch_size_for_valid = batch_size_pair[1]
143
+
144
+ data_module = datamodule_finetune_sl.CustomFinetuneDataModule(
145
+ solute_or_solvent=solute_or_solvent,
146
+ tokenizer=tokenizer,
147
+ max_seq_length=max_length,
148
+ batch_size_train=batch_size_for_train,
149
+ batch_size_valid=batch_size_for_valid,
150
+ # num_device=int(config.NUM_DEVICE) * config.NUM_WORKERS_MULTIPLIER,
151
+ num_device=num_workers,
152
+ )
153
+ data_module.prepare_data()
154
+ data_module.setup()
155
+ steps_per_epoch = len(data_module.train_dataloader())
156
+
157
+ # Load model and optimizer for finetune
158
+ learning_rate = lr
159
+
160
+ model_ft = model_finetune_sl.CustomFinetuneModel(
161
+ model_mtr=model_mtr,
162
+ steps_per_epoch=steps_per_epoch,
163
+ warmup_epochs=1,
164
+ max_epochs=epochs,
165
+ learning_rate=learning_rate,
166
+ # dataset_dict=dataset_dict,
167
+ use_freeze=use_freeze,
168
+ )
169
+
170
+ trainer = L.Trainer(
171
+ default_root_dir=dir_model_ft,
172
+ # profiler=profiler,
173
+ logger=csv_logger,
174
+ accelerator='auto',
175
+ devices='auto',
176
+ # accelerator='gpu',
177
+ # devices=[0],
178
+ min_epochs=1,
179
+ max_epochs=epochs,
180
+ precision=32,
181
+ callbacks=[checkpoint_callback]
182
+ )
183
+ trainer.fit(model_ft, data_module)
184
+ trainer.validate(model_ft, data_module)
185
+
186
+ list_validation_loss = pd.read_csv(f"{dir_model_ft}/{name_model_ft}/version_0/metrics.csv", usecols=['val_loss'])['val_loss'].dropna().tolist()[:epochs]
187
+
188
+ # class_model_ft = CustomFinetuneModel
189
+ # Level 1 Automation - Evaluate the finetuned model at every epoch
190
+ array_level_1 = auto_evaluator_level_1_sub_sol(
191
+ class_model_ft=model_ft,
192
+ list_validation_loss=list_validation_loss,
193
+ dir_model_ft=dir_model_ft,
194
+ name_model_ft=name_model_ft,
195
+ data_module=data_module,
196
+ # dataset_dict=dataset_dict,
197
+ solute_or_solvent=solute_or_solvent,
198
+ trainer=trainer
199
+ )
200
+
201
+ return array_level_1
202
+
203
+ def auto_evaluator_level_1_sub_sol(
204
+ class_model_ft,
205
+ list_validation_loss,
206
+ dir_model_ft:str,
207
+ name_model_ft:str,
208
+ data_module,
209
+ # dataset_dict:dict,
210
+ solute_or_solvent:str,
211
+ trainer,
212
+ ):
213
+
214
+ """
215
+ Evaluate the finetuned model on a single finetuning dataset.
216
+
217
+ Guides for some parameters:
218
+ - model_mtr: The pretrained model for MTR.
219
+ - dir_model_ft (str): The directory where the model is to be stored.
220
+ - name_model_ft (str): The name given to the finetuned model.
221
+ An example of the directory of the finetuned model at epoch 0:
222
+ {dir_folder}/{name_model_ft}_ep_000
223
+ """
224
+
225
+ array_loss_ranking = utils_sl.rank_value_sol(
226
+ list_value=list_validation_loss,
227
+ # dataset_dict=dataset_dict,
228
+ is_loss=True,
229
+ )
230
+ # ranking : lower the better. ranking starting from 0
231
+
232
+ print("- Epoch starts from 0")
233
+ print("=======================================")
234
+
235
+ list_level_1 = list()
236
+ for ep in range(len(list_validation_loss)):
237
+
238
+ local_model_ft = utils_sl.load_model_ft_with_epoch(
239
+ class_model_ft=class_model_ft,
240
+ target_epoch=ep,
241
+ dir_model_ft=dir_model_ft,
242
+ name_model_ft=name_model_ft
243
+ )
244
+
245
+ result = trainer.predict(local_model_ft, data_module)
246
+ result_pred = list()
247
+ result_label = list()
248
+ for bat in range(len(result)):
249
+ result_pred.append(result[bat][0].squeeze())
250
+ result_label.append(result[bat][1])
251
+
252
+ list_local_model_ft_result = utils_sl.model_evalulator_sol(
253
+ array_predictions=np.vstack(result_pred),
254
+ array_labels=np.vstack(result_label),
255
+ # dataset_dict=dataset_dict,
256
+ solute_or_solvent=solute_or_solvent,
257
+ show_plot=False,
258
+ print_result=False,
259
+ )
260
+ # dataset_name, task, RMSE, MAE, p_value mantissa, p_value exponent
261
+
262
+ # add epoch (starting from 0) to the right
263
+ list_local_model_ft_result.append(ep)
264
+ # dataset_name, task, metric1 (RMSE or ROC-AUC), metric2 (MAE or None), p_value mantissa, p_value exponent, epoch
265
+
266
+ list_level_1.append(list_local_model_ft_result)
267
+ print("=======================================")
268
+ print("=======================================")
269
+
270
+ # to get the metric_1 ranking
271
+ array_level_1 = np.array(list_level_1)
272
+ array_metric_1 = array_level_1[:, 2].astype('float32')
273
+ array_metric_1_ranking = utils_sl.rank_value_sol(list_value=array_metric_1,
274
+ # dataset_dict=dataset_dict,
275
+ is_loss=False)
276
+
277
+ # add loss, and ranking of the loss value to the right
278
+ # reg: lower the better, class: higher the better
279
+ array_level_1 = np.hstack((list_level_1,
280
+ np.expand_dims(list_validation_loss, axis=1),
281
+ np.expand_dims(array_loss_ranking, axis=1),
282
+ np.expand_dims(array_metric_1_ranking, axis=1)))
283
+ # solute_or_solvent, RMSE, MAE, p_value mantissa, p_value exponent, epoch, loss, loss_ranking, metric_1_ranking
284
+
285
+ return array_level_1
286
+ #################################### EX #########################################
287
+ # list_column_names = ['solute_or_solvent',
288
+ # 'metric_1',
289
+ # 'metric_2',
290
+ # 'p_value_mantissa',
291
+ # 'p_value_exponent',
292
+ # 'epoch',
293
+ # 'loss',
294
+ # 'loss_ranking',
295
+ # 'metric_1_ranking']
296
+ # df_evaluation_level_1 = pd.DataFrame(array_level_1, columns=list_column_names)
297
+ #################################################################################
chemllama_mtr.py ADDED
@@ -0,0 +1,212 @@
1
+ import lightning as L
2
+ import torch
3
+ import torchmetrics
4
+
5
+ from torch import nn
6
+ from transformers import LlamaModel, LlamaConfig
7
+ from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
8
+
9
+ class ChemLlama(L.LightningModule):
10
+ def __init__(
11
+ self,
12
+ max_position_embeddings,
13
+ vocab_size,
14
+ pad_token_id,
15
+ bos_token_id,
16
+ eos_token_id,
17
+ steps_per_epoch=None, #
18
+ warmup_epochs=None, #
19
+ max_epochs=None, #
20
+ hidden_size=384,
21
+ intermediate_size=464,
22
+ num_labels=105,
23
+ attention_dropout=0.144,
24
+ num_hidden_layers=3,
25
+ num_attention_heads=12,
26
+ learning_rate=0.0001,
27
+ ):
28
+ super(ChemLlama, self).__init__()
29
+ self.save_hyperparameters()
30
+
31
+ self.hidden_size = hidden_size
32
+ self.intermediate_size = intermediate_size
33
+ self.num_labels = num_labels
34
+ self.vocab_size = vocab_size
35
+ self.pad_token_id = pad_token_id
36
+ self.bos_token_id = bos_token_id
37
+ self.eos_token_id = eos_token_id
38
+ self.steps_per_epoch = steps_per_epoch #
39
+ self.warmup_epochs = warmup_epochs #
40
+ self.max_epochs = max_epochs #
41
+ self.num_hidden_layers = num_hidden_layers
42
+ self.num_attention_heads = num_attention_heads
43
+ self.attention_dropout = attention_dropout
44
+ self.max_position_embeddings = max_position_embeddings
45
+ self.learning_rate = learning_rate
46
+
47
+ self.mae = torchmetrics.MeanAbsoluteError()
48
+ self.mse = torchmetrics.MeanSquaredError()
49
+
50
+ self.config_llama = LlamaConfig(
51
+ max_position_embeddings=self.max_position_embeddings,
52
+ vocab_size=self.vocab_size,
53
+ hidden_size=self.hidden_size,
54
+ intermediate_size=self.intermediate_size,
55
+ num_hidden_layers=self.num_hidden_layers,
56
+ num_attention_heads=self.num_attention_heads,
57
+ attention_dropout=self.attention_dropout,
58
+ pad_token_id=self.pad_token_id,
59
+ bos_token_id=self.bos_token_id,
60
+ eos_token_id=self.eos_token_id,
61
+ )
62
+
63
+ self.loss_fn = nn.L1Loss()
64
+
65
+ self.llama = LlamaModel(self.config_llama)
66
+ self.gelu = nn.GELU()
67
+ self.score = nn.Linear(self.hidden_size, self.num_labels)
68
+
69
+ def forward(self, input_ids, attention_mask, labels=None):
70
+
71
+ transformer_outputs = self.llama(
72
+ input_ids=input_ids, attention_mask=attention_mask
73
+ )
74
+
75
+ hidden_states = transformer_outputs[0]
76
+ hidden_states = self.gelu(hidden_states)
77
+ logits = self.score(hidden_states)
78
+
79
+ if input_ids is not None:
80
+ batch_size = input_ids.shape[0]
81
+ else:
82
+ batch_size = inputs_embeds.shape[0]
83
+
84
+ if self.config_llama.pad_token_id is None and batch_size != 1:
85
+ raise ValueError(
86
+ "Cannot handle batch sizes > 1 if no padding token is defined."
87
+ )
88
+ if self.config_llama.pad_token_id is None:
89
+ sequence_lengths = -1
90
+ else:
91
+ if input_ids is not None:
92
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
93
+ sequence_lengths = (
94
+ torch.eq(input_ids, self.config_llama.pad_token_id).int().argmax(-1)
95
+ - 1
96
+ )
97
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
98
+ sequence_lengths = sequence_lengths.to(logits.device)
99
+ else:
100
+ sequence_lengths = -1
101
+ # raise ValueError(len(sequence_lengths), sequence_lengths)
102
+
103
+ pooled_logits = logits[
104
+ torch.arange(batch_size, device=logits.device), sequence_lengths
105
+ ]
106
+ return pooled_logits
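+ # i.e. the regression output is read at each sequence's last non-pad token,
+ # the same pooling used by Hugging Face's LlamaForSequenceClassification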
107
+
108
+ def training_step(self, batch, batch_idx):
109
+
110
+ loss, logits, labels = self._common_step(batch=batch, batch_idx=batch_idx)
111
+
112
+ # mae = self.mae(logits, labels)
113
+ # mse = self.mse(logits, labels)
114
+ self.log_dict(
115
+ {
116
+ "train_loss": loss,
117
+ # "train_mae": mae,
118
+ # "train_mse": mse
119
+ },
120
+ on_step=True,
121
+ on_epoch=True,
122
+ prog_bar=True,
123
+ sync_dist=True,
124
+ # logger=True,
125
+ )
126
+ # on_step=True will use lots of computational resources
127
+
128
+ # return loss
129
+ return {"loss": loss, "logits": logits, "labels": labels}
130
+
131
+ def train_epoch_end(self, outputs):
132
+ # avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
133
+ scores = torch.cat([x["logits"] for x in outputs])
134
+ labels = torch.cat([x["labels"] for x in outputs])
135
+ self.log_dict(
136
+ {
137
+ "train_mae": self.mae(scores, labels),
138
+ "train_mse": self.mse(scores, labels)
139
+ },
140
+ on_step=True,
141
+ on_epoch=True,
142
+ prog_bar=True,
143
+ sync_dist=True,
144
+ )
145
+
146
+ def validation_step(self, batch, batch_idx):
147
+
148
+ loss, logits, labels = self._common_step(batch=batch, batch_idx=batch_idx)
149
+ # self.log("val_loss", loss)
150
+ self.log("val_loss", loss, sync_dist=True)
151
+ return loss
152
+
153
+ def test_step(self, batch, batch_idx):
154
+
155
+ loss, logits, labels = self._common_step(batch=batch, batch_idx=batch_idx)
156
+ # self.log("val_loss", loss)
157
+ self.log("test_loss", loss, sync_dist=True,)
158
+ return loss
159
+
160
+ def _common_step(self, batch, batch_idx):
161
+
162
+ logits = self.forward(
163
+ input_ids=batch["input_ids"].squeeze(),
164
+ attention_mask=batch["attention_mask"].squeeze(),
165
+ )
166
+
167
+ labels = batch["labels"].squeeze()
168
+ loss = self.loss_fn(logits, labels)
169
+
170
+ # print(f"logits : {logits.shape}")
171
+ # print(f"labels : {labels.shape}")
172
+
173
+ return loss, logits, labels
174
+
175
+ # def configure_optimizers(self): # Schedular here too!
176
+ # # since confiture_optimizers and the model are included in the same class.. self.parameters()
177
+ # return torch.optim.AdamW(
178
+ # params=self.parameters(),
179
+ # lr=self.learning_rate,
180
+ # betas=(0.9, 0.999),
181
+ # weight_decay=0.01,
182
+ # )
183
+
184
+ # # The below is for warm-up scheduler
185
+ # https://lightning.ai/forums/t/how-to-use-warmup-lr-cosineannealinglr-in-lightning/1980
186
+ # https://github.com/Lightning-AI/pytorch-lightning/issues/328
187
+ def configure_optimizers(self): # Scheduler here too!
188
+ # since configure_optimizers and the model are in the same class, use self.parameters()
189
+ optimizer = torch.optim.AdamW(
190
+ params=self.parameters(),
191
+ lr=self.learning_rate,
192
+ betas=(0.9, 0.999),
193
+ weight_decay=0.01,
194
+ )
195
+
196
+ # "warmup_epochs //4 only not max_epochs" will work
197
+ scheduler = LinearWarmupCosineAnnealingLR(
198
+ optimizer,
199
+ warmup_epochs=self.warmup_epochs*self.steps_per_epoch // max(torch.cuda.device_count(), 1), # // num_device; the max() guards against device_count() == 0 on CPU-only machines
200
+ max_epochs=self.max_epochs*self.steps_per_epoch // max(torch.cuda.device_count(), 1),
201
+ )
202
+
203
+ return {
204
+ "optimizer": optimizer,
205
+ "lr_scheduler": {
206
+ "scheduler": scheduler,
207
+ "interval": "step",
208
+ "frequency": 1,
209
+ "reduce_on_plateau": False,
210
+ "monitor": "val_loss",
211
+ }
212
+ }
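+ # Note: save_hyperparameters() in __init__ is what lets app.py / auto_evaluator_sl.py restore this model
+ # via ChemLlama.load_from_checkpoint(dir_model_mtr) without re-passing the constructor arguments.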
datamodule_finetune_sl.py ADDED
@@ -0,0 +1,120 @@
1
+ import lightning as L
2
+ import torch
3
+ import numpy as np
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from torch.utils.data import Dataset, DataLoader
8
+ from transformers import DataCollatorWithPadding
9
+
10
+
11
+ class CustomLlamaDatasetAbraham(Dataset):
12
+ def __init__(self, df, tokenizer, max_seq_length):
13
+ self.keys = df.iloc[:, 0] # 1D array
14
+ self.labels = df.iloc[:, 1:] # 2D array
15
+ self.tokenizer = tokenizer
16
+ self.max_seq_length = max_seq_length
17
+
18
+ def __len__(self):
19
+ return self.keys.shape[0]
20
+
21
+ def fn_token_encode(self, smiles):
22
+ return self.tokenizer(
23
+ smiles,
24
+ truncation=True,
25
+ padding="max_length",
26
+ max_length=self.max_seq_length,
27
+ )
28
+
29
+ def __getitem__(self, idx):
30
+ local_encoded = self.fn_token_encode(self.keys.iloc[idx])
31
+
32
+ return {
33
+ "input_ids": torch.tensor(local_encoded["input_ids"]),
34
+ "attention_mask": torch.tensor(local_encoded["attention_mask"]),
35
+ "labels": torch.tensor(self.labels.iloc[idx]),
36
+ }
37
+
38
+
39
+ class CustomFinetuneDataModule(L.LightningDataModule):
40
+ def __init__(
41
+ self,
42
+ solute_or_solvent,
43
+ tokenizer,
44
+ max_seq_length,
45
+ batch_size_train,
46
+ batch_size_valid,
47
+ num_device,
48
+ ):
49
+ super().__init__()
50
+
51
+ self.solute_or_solvent = solute_or_solvent
52
+ self.tokenizer = tokenizer
53
+ self.max_seq_length = max_seq_length
54
+ self.batch_size_train = batch_size_train
55
+ self.batch_size_valid = batch_size_valid
56
+ self.data_collator = DataCollatorWithPadding(self.tokenizer)
57
+ self.num_device = num_device
58
+
59
+
60
+ def prepare_data(self, smiles_str: str = None): # default None so the training pipeline can call prepare_data() without arguments
61
+ # self.list_df = load_abraham(self.solute_or_solvent)
62
+ self.smiles_str = smiles_str
63
+
64
+ def setup(self, stage=None):
65
+ # self.train_df, self.valid_df, self.test_df = self.list_df
66
+ self.train_df = None
67
+ self.valid_df = None
68
+ # wrap the single SMILES string in a one-row DataFrame (column 0 = SMILES, columns 1: = dummy targets);
+ # CustomLlamaDatasetAbraham indexes a DataFrame, not a raw string. Assumption: five targets, matching the regression head.
+ self.test_df = pd.DataFrame([[self.smiles_str] + [0.0] * 5])
69
+
70
+ def train_dataloader(self):
71
+ return DataLoader(
72
+ dataset=CustomLlamaDatasetAbraham(
73
+ self.train_df, self.tokenizer, self.max_seq_length,
74
+ ),
75
+ batch_size=self.batch_size_train,
76
+ num_workers=self.num_device,
77
+ collate_fn=self.data_collator,
78
+ shuffle=True,
79
+ )
80
+
81
+ def val_dataloader(self):
82
+ return DataLoader(
83
+ dataset=CustomLlamaDatasetAbraham(
84
+ self.valid_df, self.tokenizer, self.max_seq_length,
85
+ ),
86
+ batch_size=self.batch_size_valid,
87
+ num_workers=self.num_device,
88
+ collate_fn=self.data_collator,
89
+ shuffle=False,
90
+ )
91
+
92
+ def test_dataloader(self):
93
+ return DataLoader(
94
+ dataset=CustomLlamaDatasetAbraham(
95
+ self.test_df, self.tokenizer, self.max_seq_length,
96
+ ),
97
+ batch_size=self.batch_size_valid,
98
+ num_workers=self.num_device,
99
+ collate_fn=self.data_collator,
100
+ shuffle=False,
101
+ )
102
+
103
+ # It uses test_df
104
+ def predict_dataloader(self):
105
+ return DataLoader(
106
+ dataset=CustomLlamaDatasetAbraham(
107
+ self.test_df, self.tokenizer, self.max_seq_length,
108
+ ),
109
+ batch_size=self.batch_size_valid,
110
+ num_workers=self.num_device,
111
+ collate_fn=self.data_collator,
112
+ shuffle=False,
113
+ )
114
+
115
+
116
+
117
+
118
+
119
+
120
+
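+ # Prediction-time usage (a sketch of how app.py drives this module):
+ #   dm = CustomFinetuneDataModule('solute', tokenizer, 512, batch_size_train=64, batch_size_valid=64, num_device=2)
+ #   dm.prepare_data(smiles_str='CCO'); dm.setup()
+ #   preds = trainer.predict(model_ft, dm)   # predict_dataloader() serves test_df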
dict_dtype_slpy ADDED
@@ -0,0 +1,9 @@
1
+ dict_dtype = {
2
+ "solute_or_solvent": "str",
3
+ "metric_1": "float32",
4
+ "metric_2": "float32",
5
+ "epoch": "int32",
6
+ "loss": "float32",
7
+ "loss_ranking": "int32",
8
+ "metric_1_ranking": "int32",
9
+ }
model_finetune_sl.py ADDED
@@ -0,0 +1,152 @@
1
+ import torch
2
+ from torch import nn
3
+ import lightning as L
4
+ from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR
5
+ import numpy as np
6
+
7
+ class CustomFinetuneModel(L.LightningModule):
8
+ def __init__(
9
+ self,
10
+ model_mtr,
11
+ steps_per_epoch, #
12
+ warmup_epochs, #
13
+ max_epochs, #
14
+ learning_rate,
15
+ linear_param:int=64,
16
+ use_freeze:bool=True,
17
+ *args, **kwargs
18
+ ):
19
+ super(CustomFinetuneModel, self).__init__()
20
+ # self.save_hyperparameters()
21
+
22
+ self.model_mtr = model_mtr
23
+ if use_freeze:
24
+ self.model_mtr.freeze()
25
+ # for name, param in model_mtr.named_parameters():
26
+ # param.requires_grad = False
27
+ # print(name, param.requires_grad)
28
+
29
+ self.steps_per_epoch = steps_per_epoch
30
+ self.warmup_epochs = warmup_epochs
31
+ self.max_epochs = max_epochs
32
+ self.learning_rate = learning_rate
33
+
34
+ self.list_val_loss = list()
35
+
36
+ self.gelu = nn.GELU()
37
+ self.linear1 = nn.Linear(self.model_mtr.num_labels, linear_param)
38
+ self.linear2 = nn.Linear(linear_param, linear_param)
39
+ self.regression = nn.Linear(linear_param, 5)
40
+
41
+ self.loss_fn = nn.L1Loss()
42
+
43
+ def forward(self, input_ids, attention_mask, labels=None):
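+ # The MTR backbone output (last dim = model_mtr.num_labels) is passed through a GELU MLP: num_labels -> linear_param -> linear_param -> 5 regression outputs.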
44
+ x = self.model_mtr(input_ids=input_ids, attention_mask=attention_mask)
45
+ x = self.gelu(x)
46
+ x = self.linear1(x)
47
+ x = self.gelu(x)
48
+ x = self.linear2(x)
49
+ x = self.gelu(x)
50
+ x = self.regression(x)
51
+
52
+ return x
53
+
54
+ def training_step(self, batch, batch_idx):
55
+
56
+ loss, logits, labels = self._common_step(batch=batch, batch_idx=batch_idx)
57
+
58
+ self.log_dict(
59
+ {
60
+ "train_loss": loss,
61
+ },
62
+ on_step=True,
63
+ on_epoch=True,
64
+ prog_bar=True,
65
+ # sync_dist=True,
66
+ )
67
+
68
+ return {"loss": loss, "logits": logits, "labels": labels}
69
+
70
+ def validation_step(self, batch, batch_idx):
71
+
72
+ loss, logits, labels = self._common_step(batch=batch, batch_idx=batch_idx)
73
+ # self.log("val_loss", loss)
74
+ self.log("val_loss", loss, sync_dist=True)
75
+
76
+ return loss
77
+
78
+ def valid_epoch_end(self, outputs):
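+ # NOTE: Lightning never calls a hook named 'valid_epoch_end' (Lightning 2.x uses 'on_validation_epoch_end'), and validation_step above returns only the loss, so this aggregation is effectively unused.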
79
+ # avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
80
+ scores = torch.cat([x["logits"] for x in outputs])
81
+ labels = torch.cat([x["labels"] for x in outputs])
82
+ self.list_val_loss.append(self.loss_fn(scores, labels))
83
+ self.log_dict(
84
+ {
85
+ "list_val_loss": self.list_val_loss,
86
+ },
87
+ on_step=False,
88
+ on_epoch=True,
89
+ prog_bar=True,
90
+ # sync_dist=True,
91
+ )
92
+
93
+ # def get_val_loss_history(self):
94
+ # return np.array(self.list_val_loss).squeeze()
95
+
96
+ def test_step(self, batch, batch_idx):
97
+
98
+ loss, logits, labels = self._common_step(batch=batch, batch_idx=batch_idx)
99
+ # self.log("val_loss", loss)
100
+ self.log("test_loss", loss, sync_dist=True,)
101
+
102
+ return loss
103
+
104
+ def _common_step(self, batch, batch_idx):
105
+
106
+ logits = self.forward(
107
+ input_ids=batch["input_ids"].squeeze(),
108
+ attention_mask=batch["attention_mask"].squeeze(),
109
+ ).squeeze()
110
+
111
+ labels = batch["labels"]
112
+ loss = self.loss_fn(logits, labels)
113
+
114
+ return loss, logits, labels
115
+
116
+ def predict_step(self, batch, batch_idx):
117
+ loss, logits, labels = self._common_step(batch=batch, batch_idx=batch_idx)
118
+
119
+ return logits, labels
120
+
121
+ def configure_optimizers(self): # Scheduler here too!
122
+ # since configure_optimizers and the model live in the same class, we can simply pass self.parameters()
123
+ optimizer = torch.optim.AdamW(
124
+ params=self.parameters(),
125
+ lr=self.learning_rate,
126
+ betas=(0.9, 0.999),
127
+ weight_decay=0.01,
128
+ )
129
+
130
+ # "warmup_epochs // 4 only, not max_epochs" would also work
131
+ scheduler = LinearWarmupCosineAnnealingLR(
132
+ optimizer,
133
+ # warmup_epochs=self.warmup_epochs*self.steps_per_epoch // 4, # // num_device in this case
134
+ # max_epochs=self.max_epochs*self.steps_per_epoch // 4,
135
+ # Better not to use Multiple GPUs due to the smaller dataset size.
136
+ warmup_epochs=self.warmup_epochs*self.steps_per_epoch, # // num_device in this case
137
+ max_epochs=self.max_epochs*self.steps_per_epoch,
138
+ )
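+ # With 'interval': 'step' below, the scheduler is stepped every training step, matching the step-based warmup/max values above.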
139
+
140
+ return {
141
+ "optimizer": optimizer,
142
+ "lr_scheduler": {
143
+ "scheduler": scheduler,
144
+ "interval": "step",
145
+ "frequency": 1,
146
+ "reduce_on_plateau": False,
147
+ "monitor": "val_loss",
148
+ }
149
+ }
150
+
151
+
152
+
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ torch
2
+ transformers
3
+ lightning
4
+ lightning-bolts
5
+ numpy
6
+ pytorch-ignite
7
+ pytorch-lightning
8
+ pytorch-lightning-bolts
9
+ pytorch-warmup
10
+ scikit-learn
11
+ scipy
12
+ seaborn
run_auto_llama_cuda0.py ADDED
@@ -0,0 +1,122 @@
1
+ import sys
2
+ import os
3
+
4
+ # This means you will use the first GPU among the four GPUs in our case.
5
+ # "0", "1", "2", "3". Since the FT dataset is small, using one GPU should be sufficient.
6
+ os.environ["CUDA_VISIBLE_DEVICES"]= "0"
7
+
8
+
9
+ import torch
10
+ import numpy as np
11
+ import pandas as pd
12
+ import warnings
13
+ import lightning as L
14
+ torch.set_float32_matmul_precision('high')
15
+
16
+ # Filter out FutureWarning and UnderReviewWarning messages from pl_bolts
17
+ warnings.filterwarnings("ignore", module="pl_bolts")
18
+
19
+ # Add the parent directory to sys.path
20
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
21
+
22
+ import tokenizer_sl
23
+ import auto_evaluator_sl
24
+
25
+ print(os.path.dirname(__file__))
26
+
27
+
28
+ torch.manual_seed(1004)
29
+ np.random.seed(1004)
30
+
31
+ print(os.getcwd())
32
+
33
+ """
34
+ Note 1 to Dr. Lang
35
+
36
+ I have checked that when we do not freeze the MTR model, the test loss keeps decreasing when I set the epochs to 7.
37
+ (At least for solute.) So you may try to run more epochs if you want. But solvent may already be overfitted, or soon will be, since it has only a little data.
38
+
39
+ Using a learning rate larger than the default setting is not really recommended since we don't freeze the MTR model.
40
+ But lower lr could work.
41
+
42
+ Be careful with version control (ver_ft). Make sure you keep the same version for both 'solute' and 'solvent'; otherwise, you will get confused.
43
+
44
+ The variable "dir_model_ft_to_save" is where the FT model gets saved.
45
+ The result CSV files will be located at 'evaluations/ft_version_<ver_ft>/<solute or solvent>.csv'.
46
+
47
+ You can run this code by
48
+ cd ~/SolLlama
49
+ python run_auto_llama_cuda0.py
50
+
51
+ But make sure you are running this in your virtual environment with everything in requirements_cuda118.txt installed.
52
+ """
53
+
54
+
55
+ """
56
+ # You can run both 'solute' and 'solvent' in one run by doing the following (see the sketch right after this note):
57
+ for solute_or_solvent in ['solute' ,'solvent']:
58
+ and indent the REST of the code (except the solute_or_solvent assignment right below) to this (SAME) indentation level.
59
+ """
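+ # A minimal sketch of that loop (illustrative only; it reuses the variable names from this script):
+ #
+ #     for solute_or_solvent in ['solute', 'solvent']:
+ #         batch_size_pair = [64, 64] if solute_or_solvent == 'solute' else [10, 10]
+ #         ...  # the rest of this script, indented one level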
60
+ #### Hyper Parameters ##### <- You can control these parameters as you want
61
+ # solute_or_solvent = 'solvent'
62
+ solute_or_solvent = 'solute'
63
+ ver_ft = 0 # version control for the FT model & evaluation data # otherwise it will overwrite the existing models and results
64
+ batch_size_pair = [64, 64] if solute_or_solvent == 'solute' else [10, 10] # [train, valid(test)]
65
+ # 'solvent' has a very small dataset, so I think 10 for train and 10 for valid (test) should be the maximum values.
66
+ lr = 0.0001
67
+ epochs = 7
68
+ use_freeze = False # Freeze the MTR model or not # False means not freezing
69
+ overwrite_level_2 = True # If you don't want to overwrite the models and csv files, then change this to False
70
+ ###########################
71
+
72
+
73
+ # I just reused our previous research code with some modifications.
74
+ dir_main = "/home/ylee/SolLlama"
75
+ name_model_mtr = "ChemLlama_Medium_30m_vloss_val_loss=0.029_ep_epoch=04.ckpt"
76
+
77
+ dir_model_mtr = f"{dir_main}/model_mtr/{name_model_mtr}"
78
+
79
+ max_seq_length = 512
80
+
81
+ tokenizer = tokenizer_sl.fn_load_tokenizer_llama(
82
+ max_seq_length=max_seq_length,
83
+ )
84
+ max_length = max_seq_length
85
+ num_workers = 2
86
+
87
+ dir_model_ft_to_save = f"{dir_main}/save_models_ft/ft_version_{ver_ft}"
88
+
89
+ array_level_2 = auto_evaluator_sl.auto_evaluator_level_2_sol(
90
+ dir_model_mtr=dir_model_mtr,
91
+ dir_model_ft_to_save=dir_model_ft_to_save,
92
+ tokenizer=tokenizer,
93
+ max_length=max_seq_length,
94
+ solute_or_solvent=solute_or_solvent,
95
+ num_workers=num_workers,
96
+ batch_size_pair=batch_size_pair,
97
+ lr=lr,
98
+ overwrite_level_2=overwrite_level_2,
99
+ epochs=epochs,
100
+ use_freeze=use_freeze,
101
+ )
102
+
103
+ print(array_level_2.shape)
104
+ print(array_level_2)
105
+
106
+ list_column_names_level_2 = [
107
+ 'solute_or_solvent',
108
+ 'metric_1',
109
+ 'metric_2',
110
+ 'epoch',
111
+ 'loss',
112
+ 'loss_ranking',
113
+ 'metric_1_ranking'
114
+ ]
115
+
116
+ df_evaluation_level_2 = pd.DataFrame(array_level_2, columns=list_column_names_level_2)
117
+
118
+ os.makedirs(f'{os.path.dirname(__file__)}/evaluations/ft_version_{ver_ft}', exist_ok=True)
119
+ df_evaluation_level_2.to_csv(f'{os.path.dirname(__file__)}/evaluations/ft_version_{ver_ft}/{solute_or_solvent}.csv', index=False)
120
+
121
+
122
+
tokenizer.json ADDED
@@ -0,0 +1 @@
1
+ {"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"<pad>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":11,"special":true,"content":"<unk>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":12,"special":true,"content":"<s>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":13,"special":true,"content":"</s>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"ByteLevel","add_prefix_space":false,"trim_offsets":true},"post_processor":{"type":"RobertaProcessing","sep":["[SEP]",13],"cls":["[CLS]",12],"trim_offsets":true,"add_prefix_space":false},"decoder":{"type":"ByteLevel","add_prefix_space":true,"trim_offsets":true},"model":{"type":"BPE","dropout":null,"unk_token":null,"continuing_subword_prefix":"","end_of_word_suffix":"","fuse_unk":false,"vocab":{"<pad>":0,"[unused1]":1,"[unused2]":2,"[unused3]":3,"[unused4]":4,"[unused5]":5,"[unused6]":6,"[unused7]":7,"[unused8]":8,"[unused9]":9,"[unused10]":10,"<unk>":11,"<s>":12,"</s>":13,"<mask>":14,"c":15,"C":16,"(":17,")":18,"O":19,"1":20,"2":21,"=":22,"N":23,".":24,"n":25,"3":26,"F":27,"Cl":28,">>":29,"~":30,"-":31,"4":32,"[C@H]":33,"S":34,"[C@@H]":35,"[O-]":36,"Br":37,"#":38,"/":39,"[nH]":40,"[N+]":41,"s":42,"5":43,"o":44,"P":45,"[Na+]":46,"[Si]":47,"I":48,"[Na]":49,"[Pd]":50,"[K+]":51,"[K]":52,"[P]":53,"B":54,"[C@]":55,"[C@@]":56,"[Cl-]":57,"6":58,"[OH-]":59,"\\":60,"[N-]":61,"[Li]":62,"[H]":63,"[2H]":64,"[NH4+]":65,"[c-]":66,"[P-]":67,"[Cs+]":68,"[Li+]":69,"[Cs]":70,"[NaH]":71,"[H-]":72,"[O+]":73,"[BH4-]":74,"[Cu]":75,"7":76,"[Mg]":77,"[Fe+2]":78,"[n+]":79,"[Sn]":80,"[BH-]":81,"[Pd+2]":82,"[CH]":83,"[I-]":84,"[Br-]":85,"[C-]":86,"[Zn]":87,"[B-]":88,"[F-]":89,"[Al]":90,"[P+]":91,"[BH3-]":92,"[Fe]":93,"[C]":94,"[AlH4]":95,"[Ni]":96,"[SiH]":97,"8":98,"[Cu+2]":99,"[Mn]":100,"[AlH]":101,"[nH+]":102,"[AlH4-]":103,"[O-2]":104,"[Cr]":105,"[Mg+2]":106,"[NH3+]":107,"[S@]":108,"[Pt]":109,"[Al+3]":110,"[S@@]":111,"[S-]":112,"[Ti]":113,"[Zn+2]":114,"[PH]":115,"[NH2+]":116,"[Ru]":117,"[Ag+]":118,"[S+]":119,"[I+3]":120,"[NH+]":121,"[Ca+2]":122,"[Ag]":123,"9":124,"[Os]":125,"[Se]":126,"[SiH2]":127,"[Ca]":128,"[Ti+4]":129,"[Ac]":130,"[Cu+]":131,"[S]":132,"[Rh]":133,"[Cl+3]":134,"[cH-]":135,"[Zn+]":136,"[O]":137,"[Cl+]":138,"[SH]":139,"[H+]":140,"[Pd+]":141,"[se]":142,"[PH+]":143,"[I]":144,"[Pt+2]":145,"[C+]":146,"[Mg+]":147,"[Hg]":148,"[W]":149,"[SnH]":150,"[SiH3]":151,"[Fe+3]":152,"[NH]":153,"[Mo]":154,"[CH2+]":155,"%10":156,"[CH2-]":157,"[CH2]":158,"[n-]":159,"[Ce+4]":160,"[NH-]":161,"[Co]":162,"[I+]":163,"[PH2]":164,"[Pt+4]":165,"[Ce]":166,"[B]":167,"[Sn+2]":168,"[Ba+2]":169,"%11":170,"[Fe-3]":171,"[18F]":172,"[SH-]":173,"[Pb+2]":174,"[Os-2]":175,"[Zr+4]":176,"[N]":177,"[Ir]":178,"[Bi]":179,"[Ni+2]":180,"[P@]":181,"[Co+2]":182,"[s+]":183,"[As]":184,"[P+3]":185,"[Hg+2]":186,"[Yb+3]":187,"[CH-]":188,"[Zr+2]":189,"[Mn+2]":190,"[CH+]":191,"[In]":192,"[KH]":193,"[Ce+3]":194,"[Zr]":195,"[AlH2-]":196,"[OH2+]":197,"[Ti+3]":198,"[Rh+2]":199,"[Sb]":200,"[S-2]":201,"%12":202,"[P@@]":203,"[Si@H]":204,"[Mn+4]":205,"p":206,"[Ba]":207,"[NH2-]":208,"[Ge]":209,"[Pb+4]":210,"[Cr+3]":211,"[Au]":212,"[LiH]":213,"[Sc+3]":214,"[o+]":215,"[Rh-3]":216,"%13":217,"[Br]":218,"[Sb-]":219,"[S@+]":220,"[I+2]":221,"[Ar]":222,"[V]":223,"[Cu-]":224,"[Al-]":225,"[Te]":226,"[13c]":227,"[13C]":228,"[Cl]":229,"[PH4+]":230,"[SiH4]":231,"[te]":232,"[CH3-]":233,"[S@@+]":234,"[Rh+3]":235,"[SH+]":236,"[Bi+3]":2
37,"[Br+2]":238,"[La]":239,"[La+3]":240,"[Pt-2]":241,"[N@@]":242,"[PH3+]":243,"[N@]":244,"[Si+4]":245,"[Sr+2]":246,"[Al+]":247,"[Pb]":248,"[SeH]":249,"[Si-]":250,"[V+5]":251,"[Y+3]":252,"[Re]":253,"[Ru+]":254,"[Sm]":255,"*":256,"[3H]":257,"[NH2]":258,"[Ag-]":259,"[13CH3]":260,"[OH+]":261,"[Ru+3]":262,"[OH]":263,"[Gd+3]":264,"[13CH2]":265,"[In+3]":266,"[Si@@]":267,"[Si@]":268,"[Ti+2]":269,"[Sn+]":270,"[Cl+2]":271,"[AlH-]":272,"[Pd-2]":273,"[SnH3]":274,"[B+3]":275,"[Cu-2]":276,"[Nd+3]":277,"[Pb+3]":278,"[13cH]":279,"[Fe-4]":280,"[Ga]":281,"[Sn+4]":282,"[Hg+]":283,"[11CH3]":284,"[Hf]":285,"[Pr]":286,"[Y]":287,"[S+2]":288,"[Cd]":289,"[Cr+6]":290,"[Zr+3]":291,"[Rh+]":292,"[CH3]":293,"[N-3]":294,"[Hf+2]":295,"[Th]":296,"[Sb+3]":297,"%14":298,"[Cr+2]":299,"[Ru+2]":300,"[Hf+4]":301,"[14C]":302,"[Ta]":303,"[Tl+]":304,"[B+]":305,"[Os+4]":306,"[PdH2]":307,"[Pd-]":308,"[Cd+2]":309,"[Co+3]":310,"[S+4]":311,"[Nb+5]":312,"[123I]":313,"[c+]":314,"[Rb+]":315,"[V+2]":316,"[CH3+]":317,"[Ag+2]":318,"[cH+]":319,"[Mn+3]":320,"[Se-]":321,"[As-]":322,"[Eu+3]":323,"[SH2]":324,"[Sm+3]":325,"[IH+]":326,"%15":327,"[OH3+]":328,"[PH3]":329,"[IH2+]":330,"[SH2+]":331,"[Ir+3]":332,"[AlH3]":333,"[Sc]":334,"[Yb]":335,"[15NH2]":336,"[Lu]":337,"[sH+]":338,"[Gd]":339,"[18F-]":340,"[SH3+]":341,"[SnH4]":342,"[TeH]":343,"[Si@@H]":344,"[Ga+3]":345,"[CaH2]":346,"[Tl]":347,"[Ta+5]":348,"[GeH]":349,"[Br+]":350,"[Sr]":351,"[Tl+3]":352,"[Sm+2]":353,"[PH5]":354,"%16":355,"[N@@+]":356,"[Au+3]":357,"[C-4]":358,"[Nd]":359,"[Ti+]":360,"[IH]":361,"[N@+]":362,"[125I]":363,"[Eu]":364,"[Sn+3]":365,"[Nb]":366,"[Er+3]":367,"[123I-]":368,"[14c]":369,"%17":370,"[SnH2]":371,"[YH]":372,"[Sb+5]":373,"[Pr+3]":374,"[Ir+]":375,"[N+3]":376,"[AlH2]":377,"[19F]":378,"%18":379,"[Tb]":380,"[14CH]":381,"[Mo+4]":382,"[Si+]":383,"[BH]":384,"[Be]":385,"[Rb]":386,"[pH]":387,"%19":388,"%20":389,"[Xe]":390,"[Ir-]":391,"[Be+2]":392,"[C+4]":393,"[RuH2]":394,"[15NH]":395,"[U+2]":396,"[Au-]":397,"%21":398,"%22":399,"[Au+]":400,"[15n]":401,"[Al+2]":402,"[Tb+3]":403,"[15N]":404,"[V+3]":405,"[W+6]":406,"[14CH3]":407,"[Cr+4]":408,"[ClH+]":409,"b":410,"[Ti+6]":411,"[Nd+]":412,"[Zr+]":413,"[PH2+]":414,"[Fm]":415,"[N@H+]":416,"[RuH]":417,"[Dy+3]":418,"%23":419,"[Hf+3]":420,"[W+4]":421,"[11C]":422,"[13CH]":423,"[Er]":424,"[124I]":425,"[LaH]":426,"[F]":427,"[siH]":428,"[Ga+]":429,"[Cm]":430,"[GeH3]":431,"[IH-]":432,"[U+6]":433,"[SeH+]":434,"[32P]":435,"[SeH-]":436,"[Pt-]":437,"[Ir+2]":438,"[se+]":439,"[U]":440,"[F+]":441,"[BH2]":442,"[As+]":443,"[Cf]":444,"[ClH2+]":445,"[Ni+]":446,"[TeH3]":447,"[SbH2]":448,"[Ag+3]":449,"%24":450,"[18O]":451,"[PH4]":452,"[Os+2]":453,"[Na-]":454,"[Sb+2]":455,"[V+4]":456,"[Ho+3]":457,"[68Ga]":458,"[PH-]":459,"[Bi+2]":460,"[Ce+2]":461,"[Pd+3]":462,"[99Tc]":463,"[13C@@H]":464,"[Fe+6]":465,"[c]":466,"[GeH2]":467,"[10B]":468,"[Cu+3]":469,"[Mo+2]":470,"[Cr+]":471,"[Pd+4]":472,"[Dy]":473,"[AsH]":474,"[Ba+]":475,"[SeH2]":476,"[In+]":477,"[TeH2]":478,"[BrH+]":479,"[14cH]":480,"[W+]":481,"[13C@H]":482,"[AsH2]":483,"[In+2]":484,"[N+2]":485,"[N@@H+]":486,"[SbH]":487,"[60Co]":488,"[AsH4+]":489,"[AsH3]":490,"[18OH]":491,"[Ru-2]":492,"[Na-2]":493,"[CuH2]":494,"[31P]":495,"[Ti+5]":496,"[35S]":497,"[P@@H]":498,"[ArH]":499,"[Co+]":500,"[Zr-2]":501,"[BH2-]":502,"[131I]":503,"[SH5]":504,"[VH]":505,"[B+2]":506,"[Yb+2]":507,"[14C@H]":508,"[211At]":509,"[NH3+2]":510,"[IrH]":511,"[IrH2]":512,"[Rh-]":513,"[Cr-]":514,"[Sb+]":515,"[Ni+3]":516,"[TaH3]":517,"[Tl+2]":518,"[64Cu]":519,"[Tc]":520,"[Cd+]":521,"[1H]":522,"[15nH]":523,"[AlH2+]":524,"[FH+2]":525,"[BiH3]":526,"[Ru-]":
527,"[Mo+6]":528,"[AsH+]":529,"[BaH2]":530,"[BaH]":531,"[Fe+4]":532,"[229Th]":533,"[Th+4]":534,"[As+3]":535,"[NH+3]":536,"[P@H]":537,"[Li-]":538,"[7NaH]":539,"[Bi+]":540,"[PtH+2]":541,"[p-]":542,"[Re+5]":543,"[NiH]":544,"[Ni-]":545,"[Xe+]":546,"[Ca+]":547,"[11c]":548,"[Rh+4]":549,"[AcH]":550,"[HeH]":551,"[Sc+2]":552,"[Mn+]":553,"[UH]":554,"[14CH2]":555,"[SiH4+]":556,"[18OH2]":557,"[Ac-]":558,"[Re+4]":559,"[118Sn]":560,"[153Sm]":561,"[P+2]":562,"[9CH]":563,"[9CH3]":564,"[Y-]":565,"[NiH2]":566,"[Si+2]":567,"[Mn+6]":568,"[ZrH2]":569,"[C-2]":570,"[Bi+5]":571,"[24NaH]":572,"[Fr]":573,"[15CH]":574,"[Se+]":575,"[At]":576,"[P-3]":577,"[124I-]":578,"[CuH2-]":579,"[Nb+4]":580,"[Nb+3]":581,"[MgH]":582,"[Ir+4]":583,"[67Ga+3]":584,"[67Ga]":585,"[13N]":586,"[15OH2]":587,"[2NH]":588,"[Ho]":589,"[Cn]":590},"merges":[]}}
tokenizer_sl.py ADDED
@@ -0,0 +1,35 @@
1
+ from transformers import LlamaTokenizerFast
2
+ import json
3
+ import os
4
+
5
+
6
+ def fn_load_tokenizer_llama(
7
+ max_seq_length,
8
+ dir_tokenizer: str = "./tokenizer.json",
9
+ # dir_tokenizer:str = os.path.abspath(os.path.join(os.getcwd(), '..', "models_mtr/tokenizer.json")), # for JUP
10
+ add_eos_token:bool = True,
11
+ ):
12
+
13
+ tokenizer = LlamaTokenizerFast(
14
+ tokenizer_file=dir_tokenizer,
15
+ model_max_length=max_seq_length,
16
+ padding_side="right",
17
+ bos_token="<s>",
18
+ eos_token="</s>",
19
+ unk_token="<unk>",
20
+ add_eos_token=add_eos_token,
21
+ )
22
+ tokenizer.add_special_tokens({"pad_token": "<pad>", "sep_token": "</s>", "cls_token": "<s>", "mask_token":"<mask>"})
23
+ # tokenizer.add_special_tokens({"pad_token": "<pad>"})
24
+
25
+ return tokenizer
26
+
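+ # Loads the JSON file at dir_descriptor_list and returns the list stored under key_descriptor_list.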
27
+ def fn_load_descriptor_list(
28
+ key_descriptor_list,
29
+ dir_descriptor_list,
30
+ ):
31
+
32
+ with open(dir_descriptor_list, "r") as js:
33
+ list_descriptor = json.load(js)[key_descriptor_list]
34
+
35
+ return list_descriptor
utils_sl.py ADDED
@@ -0,0 +1,114 @@
1
+ from sklearn.metrics import roc_curve, auc, roc_auc_score
2
+ from sklearn.metrics import mean_squared_error
3
+ from sklearn.metrics import r2_score
4
+ from sklearn.metrics import mean_absolute_error
5
+ from scipy.stats import spearmanr
6
+ import matplotlib.pyplot as plt
7
+
8
+ import numpy as np
9
+ import os
10
+
11
+ def model_evalulator_sol(
12
+ array_predictions,
13
+ array_labels,
14
+ # dataset_dict:dict,
15
+ solute_or_solvent:str,
16
+ show_plot:bool=True,
17
+ print_result:bool=True,
18
+ ):
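+ # Computes RMSE, MAE, and R^2 for the predictions, optionally shows a labels-vs-predictions parity plot, and returns [solute_or_solvent, RMSE, MAE].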
19
+
20
+ if print_result:
21
+ print(f"Dataset : {solute_or_solvent}")
22
+ print("N:", array_labels.shape[0])
23
+
24
+ fig, ax = plt.subplots()
25
+
26
+ metric = mean_squared_error(array_labels, array_predictions, squared=False) #RMSE
27
+ r2 = r2_score(array_labels, array_predictions)
28
+ metric2 = mean_absolute_error(array_labels, array_predictions) # MAE
29
+ ax.scatter(array_labels, array_predictions)
30
+ ax.set_title("Scatter Plot of Labels vs Predictions")
31
+ ax.set_xlabel("Labels")
32
+ ax.set_ylabel("Predictions")
33
+
34
+ if print_result:
35
+ print("R2:", r2)
36
+ print("Root Mean Square Error:", metric)
37
+ print("Mean Absolute Error:", metric2)
38
+
39
+ # correlation, p_value = spearmanr(array_labels, array_predictions)
40
+
41
+ # if print_result:
42
+ # print("Spearman correlation:", correlation)
43
+ # print("p-value:", p_value)
44
+ # print("=======================================")
45
+
46
+ xmin, xmax = ax.get_xlim()
47
+ ax.set_ylim(xmin, xmax)
48
+
49
+ if not show_plot:
50
+ plt.ioff()
51
+ plt.clf()
52
+ plt.close()
53
+ else :
54
+ plt.show()
55
+
56
+ # metric 1 - ROC score (classification) | RMSE (regression)
57
+ # metric 2 - None (classification) | MAE (regression)
58
+ round_decimal = 6
59
+ if metric2 is not None:
60
+ metric2 = round(metric2, round_decimal)
61
+
62
+ # list_p_value = str(p_value).split('e')
63
+ # p_value_mantissa = round(float(list_p_value[0]), round_decimal)
64
+ # if len(list_p_value) == 2:
65
+ # p_value_exponent = int(list_p_value[1])
66
+ # else:
67
+ # p_value_exponent = None
68
+
69
+ return [solute_or_solvent,
70
+ round(metric, round_decimal),
71
+ metric2]
72
+ # return [solute_or_solvent,
73
+ # round(metric, round_decimal),
74
+ # metric2,
75
+ # p_value_mantissa,
76
+ # p_value_exponent]
77
+
78
+ # from .model_finetune import CustomFinetuneModel
79
+ # import model_finetune_sol
80
+ import torch
81
+ def load_model_ft_with_epoch(class_model_ft,
82
+ target_epoch:int,
83
+ dir_model_ft:str,
84
+ name_model_ft:str):
85
+ # dir_model_ft level 1
86
+ # ex /main/model_mtr/model_mtr_ep/dataset
87
+
88
+ dir_all_model_ft = f"{dir_model_ft}/{name_model_ft}/version_0/checkpoints/"
89
+ list_files_in_dir_model_ft = os.listdir(dir_all_model_ft)
90
+ # extension = '.ckpt'
91
+ extension = '.pt'
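+ # Sort the checkpoint files by the number that follows the last '=' in each filename, so they can be indexed by target_epoch below.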
92
+ list_model_ft_in_the_dir = sorted(list_files_in_dir_model_ft, key=lambda x: float(x.split('=')[-1].split('.')[0]))
93
+
94
+ print(f"Loaded model with epoch {target_epoch}")
95
+ dir_target_model_ft = f"{dir_all_model_ft}/{list_model_ft_in_the_dir[target_epoch]}"
96
+
97
+ # class_model_ft.load_from_checkpoint(dir_target_model_ft)
98
+
99
+ loaded_state_dict = torch.load(dir_target_model_ft)
100
+ class_model_ft.load_state_dict(loaded_state_dict['state_dict'])
101
+
102
+ return class_model_ft # now is model_ft
103
+
104
+ from scipy.stats import rankdata
105
+ # Note: rankdata handles float values directly; the *100000 scaling below is monotonic and does not change the ranking.
106
+ def rank_value_sol(
107
+ list_value,
108
+ # dataset_dict:dict,
109
+ is_loss:bool=True,
110
+ ):
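+ # Returns 0-based ranks of the values (smallest -> 0; ties share the minimum rank). The is_loss flag is currently unused here.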
111
+
112
+ list_value = np.array(list_value)
113
+
114
+ return np.array(rankdata(list_value * 100000, method='min')) - 1