Huhujingjing
committed on
Commit
•
2444fad
1
Parent(s):
9a0b5b4
Upload model
Browse files
- configuration_gcn.py +33 -0
- modeling_gcn.py +90 -46
configuration_gcn.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PretrainedConfig
|
2 |
+
from typing import List
|
3 |
+
class GCNConfig(PretrainedConfig):
|
4 |
+
model_type = "gcn"
|
5 |
+
|
6 |
+
def __init__(
|
7 |
+
self,
|
8 |
+
input_feature: int=64,
|
9 |
+
emb_input: int=20,
|
10 |
+
hidden_size: int=64,
|
11 |
+
n_layers: int=6,
|
12 |
+
num_classes: int=1,
|
13 |
+
|
14 |
+
smiles: List[str] = None,
|
15 |
+
processor_class: str = "SmilesProcessor",
|
16 |
+
**kwargs,
|
17 |
+
):
|
18 |
+
|
19 |
+
self.input_feature = input_feature # the dimension of input feature
|
20 |
+
self.emb_input = emb_input # the embedding dimension of input feature
|
21 |
+
self.hidden_size = hidden_size # the hidden size of GCN
|
22 |
+
self.n_layers = n_layers # the number of GCN layers
|
23 |
+
self.num_classes = num_classes # the number of output classes
|
24 |
+
|
25 |
+
self.smiles = smiles # process smiles
|
26 |
+
self.processor_class = processor_class
|
27 |
+
|
28 |
+
super().__init__(**kwargs)
|
29 |
+
|
30 |
+
|
31 |
+
if __name__ == "__main__":
|
32 |
+
gcn_config = GCNConfig(input_feature=64, emb_input=20, hidden_size=64, n_layers=6, num_classes=1, smiles=["C", "CC", "CCC"], processor_class="SmilesProcessor")
|
33 |
+
gcn_config.save_pretrained("custom-gcn")
|
modeling_gcn.py
CHANGED
@@ -3,14 +3,45 @@ import torch.nn as nn
|
|
3 |
import torch.nn.functional as F
|
4 |
from torch_scatter import scatter
|
5 |
from transformers import PreTrainedModel
|
6 |
-
|
7 |
import torch
|
8 |
from rdkit import Chem
|
9 |
from rdkit.Chem import AllChem
|
10 |
import torch
|
11 |
from torch_geometric.data import Data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
|
|
|
|
13 |
|
|
|
14 |
class SmilesDataset(torch.utils.data.Dataset):
|
15 |
def __init__(self, smiles):
|
16 |
self.smiles_list = smiles
|
@@ -145,36 +176,6 @@ class GCNNet(torch.nn.Module):
|
|
145 |
|
146 |
return x.squeeze(-1)
|
147 |
|
148 |
-
|
149 |
-
from transformers import PretrainedConfig
|
150 |
-
from typing import List
|
151 |
-
class GCNConfig(PretrainedConfig):
|
152 |
-
model_type = "gcn"
|
153 |
-
|
154 |
-
def __init__(
|
155 |
-
self,
|
156 |
-
input_feature: int=64,
|
157 |
-
emb_input: int=20,
|
158 |
-
hidden_size: int=64,
|
159 |
-
n_layers: int=6,
|
160 |
-
num_classes: int=1,
|
161 |
-
smiles: List[str] = None,
|
162 |
-
processor_class: str = "SmilesProcessor",
|
163 |
-
**kwargs,
|
164 |
-
):
|
165 |
-
|
166 |
-
self.input_feature = input_feature # the dimension of input feature
|
167 |
-
self.emb_input = emb_input # the embedding dimension of input feature
|
168 |
-
self.hidden_size = hidden_size # the hidden size of GCN
|
169 |
-
self.n_layers = n_layers # the number of GCN layers
|
170 |
-
self.num_classes = num_classes # the number of output classes
|
171 |
-
|
172 |
-
self.smiles = smiles # process smiles
|
173 |
-
self.processor_class = processor_class
|
174 |
-
|
175 |
-
super().__init__(**kwargs)
|
176 |
-
|
177 |
-
|
178 |
class GCNModel(PreTrainedModel):
|
179 |
config_class = GCNConfig
|
180 |
|
@@ -192,27 +193,70 @@ class GCNModel(PreTrainedModel):
|
|
192 |
smiles=config.smiles,
|
193 |
)
|
194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
def forward(self, tensor):
|
196 |
return self.model.forward_features(tensor)
|
197 |
|
198 |
-
def process_smiles(self, smiles):
|
199 |
-
|
200 |
|
|
|
201 |
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
if __name__ == "__main__":
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
# gcnd.model.load_state_dict(torch.load(r'G:\Trans_MXM\gcn_model\gcn.pt'))
|
212 |
-
# gcnd.save_pretrained("custom-gcn")
|
213 |
-
|
214 |
-
# gcnd1 = GCNModelForMolecularPrediction(gcn_config)
|
215 |
-
#
|
216 |
-
# gcnd1.model.load_state_dict(torch.load(r'G:\Trans_MXM\gcn_model\gcn.pt'))
|
217 |
-
# gcnd1.save_pretrained("custom-gcn")
|
218 |
|
|
|
3 |
import torch.nn.functional as F
|
4 |
from torch_scatter import scatter
|
5 |
from transformers import PreTrainedModel
|
6 |
+
from gcn_model.configuration_gcn import GCNConfig
|
7 |
import torch
|
8 |
from rdkit import Chem
|
9 |
from rdkit.Chem import AllChem
|
10 |
import torch
|
11 |
from torch_geometric.data import Data
|
12 |
+
import os
|
13 |
+
from transformers import PretrainedConfig
|
14 |
+
from typing import List
|
15 |
+
from torch_geometric.loader import DataLoader
|
16 |
+
from tqdm import tqdm
|
17 |
+
import pandas as pd
|
18 |
+
from transformers import AutoModel
|
19 |
+
class GCNConfig(PretrainedConfig):
|
20 |
+
model_type = "gcn"
|
21 |
+
|
22 |
+
def __init__(
|
23 |
+
self,
|
24 |
+
input_feature: int=64,
|
25 |
+
emb_input: int=20,
|
26 |
+
hidden_size: int=64,
|
27 |
+
n_layers: int=6,
|
28 |
+
num_classes: int=1,
|
29 |
+
|
30 |
+
smiles: List[str] = None,
|
31 |
+
processor_class: str = "SmilesProcessor",
|
32 |
+
**kwargs,
|
33 |
+
):
|
34 |
+
|
35 |
+
self.input_feature = input_feature # the dimension of input feature
|
36 |
+
self.emb_input = emb_input # the embedding dimension of input feature
|
37 |
+
self.hidden_size = hidden_size # the hidden size of GCN
|
38 |
+
self.n_layers = n_layers # the number of GCN layers
|
39 |
+
self.num_classes = num_classes # the number of output classes
|
40 |
|
41 |
+
self.smiles = smiles # process smiles
|
42 |
+
self.processor_class = processor_class
|
43 |
|
44 |
+
super().__init__(**kwargs)
|
45 |
class SmilesDataset(torch.utils.data.Dataset):
|
46 |
def __init__(self, smiles):
|
47 |
self.smiles_list = smiles
|
|
|
176 |
|
177 |
return x.squeeze(-1)
|
178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
class GCNModel(PreTrainedModel):
|
180 |
config_class = GCNConfig
|
181 |
|
|
|
193 |
smiles=config.smiles,
|
194 |
)
|
195 |
|
196 |
+
self.gcn_model = None
|
197 |
+
self.dataset = None
|
198 |
+
self.output = None
|
199 |
+
self.data_loader = None
|
200 |
+
self.pred_data = None
|
201 |
+
|
202 |
def forward(self, tensor):
|
203 |
return self.model.forward_features(tensor)
|
204 |
|
205 |
+
# def process_smiles(self, smiles):
|
206 |
+
# return self.process.get_data(smiles)
|
207 |
|
208 |
+
def predict_smiles(self, smiles, device: str='cpu', result_dir: str='./', **kwargs):
|
209 |
|
210 |
|
211 |
+
batch_size = kwargs.pop('batch_size', 1)
|
212 |
+
shuffle = kwargs.pop('shuffle', False)
|
213 |
+
drop_last = kwargs.pop('drop_last', False)
|
214 |
+
num_workers = kwargs.pop('num_workers', 0)
|
215 |
+
|
216 |
+
self.gcn_model = AutoModel.from_pretrained("Huhujingjing/custom-gcn", trust_remote_code=True).to(device)
|
217 |
+
self.gcn_model.eval()
|
218 |
+
|
219 |
+
self.dataset = self.process.get_data(smiles)
|
220 |
+
self.output = ""
|
221 |
+
self.output += ("predicted samples num: {}\n".format(len(self.dataset)))
|
222 |
+
self.output +=("predicted samples:{}\n".format(self.dataset[0]))
|
223 |
+
self.data_loader = DataLoader(self.dataset,
|
224 |
+
batch_size=batch_size,
|
225 |
+
shuffle=shuffle,
|
226 |
+
drop_last=drop_last,
|
227 |
+
num_workers=num_workers
|
228 |
+
)
|
229 |
+
self.pred_data = {
|
230 |
+
'smiles': [],
|
231 |
+
'pred': []
|
232 |
+
}
|
233 |
+
|
234 |
+
for batch in tqdm(self.data_loader):
|
235 |
+
batch = batch.to(device)
|
236 |
+
with torch.no_grad():
|
237 |
+
self.pred_data['smiles'] += batch['smiles']
|
238 |
+
self.pred_data['pred'] += self.gcn_model(batch).cpu().tolist()
|
239 |
+
|
240 |
+
pred = torch.tensor(self.pred_data['pred']).reshape(-1)
|
241 |
+
if device == 'cuda':
|
242 |
+
pred = pred.cpu().tolist()
|
243 |
+
self.pred_data['pred'] = pred
|
244 |
+
pred_df = pd.DataFrame(self.pred_data)
|
245 |
+
pred_df['pred'] = pred_df['pred'].apply(lambda x: round(x, 2))
|
246 |
+
self.output +=('-' * 40 + '\n'+'predicted result: \n'+'{}\n'.format(pred_df))
|
247 |
+
self.output +=('-' * 40)
|
248 |
+
|
249 |
+
pred_df.to_csv(os.path.join(result_dir, 'gcn.csv'), index=False)
|
250 |
+
self.output +=('\nsave predicted result to {}\n'.format(os.path.join(result_dir, 'gcn.csv')))
|
251 |
+
|
252 |
+
return self.output
|
253 |
+
|
254 |
|
255 |
if __name__ == "__main__":
|
256 |
+
gcn_config = GCNConfig(input_feature=64, emb_input=20, hidden_size=64, n_layers=6, num_classes=1,
|
257 |
+
smiles=["C", "CC", "CCC"], processor_class="SmilesProcessor")
|
258 |
+
|
259 |
+
gcnd = GCNModel(gcn_config)
|
260 |
+
gcnd.model.load_state_dict(torch.load(r'G:\Trans_MXM\gcn_model\gcn.pt'))
|
261 |
+
gcnd.save_pretrained("custom-gcn")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
|