saicharan2804 committed on
Commit
2727a59
·
1 Parent(s): 2e76d65

cleaned code

Browse files
__pycache__/molgenevalmetric.cpython-312.pyc CHANGED
Binary files a/__pycache__/molgenevalmetric.cpython-312.pyc and b/__pycache__/molgenevalmetric.cpython-312.pyc differ
 
app.py CHANGED
@@ -1,209 +1,38 @@
1
- import pandas as pd
2
- df = pd.read_csv('/Users/saicharan/chembl_10000.csv')
3
- from molgenevalmetric import SYBAscore
4
-
5
  import evaluate
6
- met = evaluate.load("saicharan2804/molgenevalmetric")
7
-
8
- ls= df['SMILES'].tolist()
9
- ls_gen = ls[0:500]
10
- ls_train = ls[500:1000]
11
-
12
- print('computing')
13
- # print(SYBAscore(gen=ls_gen))
14
- print(met.compute(gensmi = ls_gen, trainsmi = ls_train))
15
- # print(qed_metric(gen=ls_gen))
16
- # print(logP_metric(gen=ls_gen))
17
- # print(average_sascore(gen=ls_gen))
18
-
19
- # print(oracles(gen=ls_gen, train=ls_train))
20
 
21
- # import evaluate
22
- # from evaluate.utils import launch_gradio_widget
23
- # import gradio as gr
24
 
25
- # module = evaluate.load("saicharan2804/molgenevalmetric")
26
- # # launch_gradio_widget(module)
 
 
 
 
 
 
27
 
28
- # iface = gr.Interface(
29
- # fn = module,
30
- # inputs=[
31
- # gr.File(label="Generated SMILES"),
32
- # gr.File(label="Training Data", value=None),
33
- # ],
34
- # outputs="text"
35
- # )
36
 
37
- # iface.launch()
38
 
39
  # import pandas as pd
 
 
40
 
41
- # df = pd.read_csv('/home/saicharan/Downloads/chembl.csv')
42
-
43
- # df = df.rename(columns={'canonical_smiles': 'SMILES'})
44
-
45
- # df = df[0:10000]
46
-
47
- # print(df[['SMILES']].to_csv('/home/saicharan/Downloads/chembl_10000.csv'))
48
- # from SCScore import SCScorer
49
-
50
-
51
- # '''
52
- # This is a standalone, importable SCScorer model. It does not have tensorflow as a
53
- # dependency and is a more attractive option for deployment. The calculations are
54
- # fast enough that there is no real reason to use GPUs (via tf) instead of CPUs (via np)
55
- # '''
56
-
57
- # import numpy as np
58
- # import time
59
- # import rdkit.Chem as Chem
60
- # import rdkit.Chem.AllChem as AllChem
61
- # import json
62
- # import gzip
63
- # import six
64
-
65
- # import os
66
- # project_root = os.path.dirname(os.path.dirname(__file__))
67
-
68
- # score_scale = 5.0
69
- # min_separation = 0.25
70
-
71
- # FP_len = 1024
72
- # FP_rad = 2
73
-
74
- # def sigmoid(x):
75
- # return 1 / (1 + np.exp(-x))
76
-
77
- # class SCScorer():
78
- # def __init__(self, score_scale=score_scale):
79
- # self.vars = []
80
- # self.score_scale = score_scale
81
- # self._restored = False
82
-
83
- # def restore(self, weight_path=os.path.join('model.ckpt-10654.as_numpy.json.gz'), FP_rad=FP_rad, FP_len=FP_len):
84
- # self.FP_len = FP_len; self.FP_rad = FP_rad
85
- # self._load_vars(weight_path)
86
- # # print('Restored variables from {}'.format(weight_path))
87
-
88
- # if 'uint8' in weight_path or 'counts' in weight_path:
89
- # def mol_to_fp(self, mol):
90
- # if mol is None:
91
- # return np.array((self.FP_len,), dtype=np.uint8)
92
- # fp = AllChem.GetMorganFingerprint(mol, self.FP_rad, useChirality=True) # uitnsparsevect
93
- # fp_folded = np.zeros((self.FP_len,), dtype=np.uint8)
94
- # for k, v in six.iteritems(fp.GetNonzeroElements()):
95
- # fp_folded[k % self.FP_len] += v
96
- # return np.array(fp_folded)
97
- # else:
98
- # def mol_to_fp(self, mol):
99
- # if mol is None:
100
- # return np.zeros((self.FP_len,), dtype=np.float32)
101
- # return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, self.FP_rad, nBits=self.FP_len,
102
- # useChirality=True), dtype=np.bool_)
103
- # self.mol_to_fp = mol_to_fp
104
-
105
- # self._restored = True
106
- # return self
107
-
108
- # def smi_to_fp(self, smi):
109
- # if not smi:
110
- # return np.zeros((self.FP_len,), dtype=np.float32)
111
- # return self.mol_to_fp(self, Chem.MolFromSmiles(smi))
112
-
113
- # def apply(self, x):
114
- # if not self._restored:
115
- # raise ValueError('Must restore model weights!')
116
- # # Each pair of vars is a weight and bias term
117
- # for i in range(0, len(self.vars), 2):
118
- # last_layer = (i == len(self.vars)-2)
119
- # W = self.vars[i]
120
- # b = self.vars[i+1]
121
- # x = np.matmul(x, W) + b
122
- # if not last_layer:
123
- # x = x * (x > 0) # ReLU
124
- # x = 1 + (score_scale - 1) * sigmoid(x)
125
- # return x
126
-
127
- # def get_score_from_smi(self, smi='', v=False):
128
- # if not smi:
129
- # return ('', 0.)
130
- # fp = np.array((self.smi_to_fp(smi)), dtype=np.float32)
131
- # if sum(fp) == 0:
132
- # if v: print('Could not get fingerprint?')
133
- # cur_score = 0.
134
- # else:
135
- # # Run
136
- # cur_score = self.apply(fp)
137
- # if v: print('Score: {}'.format(cur_score))
138
- # mol = Chem.MolFromSmiles(smi)
139
- # if mol:
140
- # smi = Chem.MolToSmiles(mol, isomericSmiles=True, kekuleSmiles=True)
141
- # else:
142
- # smi = ''
143
- # return (smi, cur_score)
144
-
145
- # def get_avg_score(self, smis):
146
- # """
147
- # Compute the average score for a list of SMILES strings.
148
-
149
- # Args:
150
- # smis (list of str): A list of SMILES strings.
151
-
152
- # Returns:
153
- # float: The average score of the given SMILES strings.
154
- # """
155
- # if not smis: # Check if the list is empty
156
- # return 0.0
157
-
158
- # total_score = 0.0
159
- # valid_smiles_count = 0
160
-
161
- # for smi in smis:
162
- # _, score = self.get_score_from_smi(smi)
163
- # if score > 0: # Assuming only positive scores are valid
164
- # total_score += score
165
- # valid_smiles_count += 1
166
-
167
- # # Avoid division by zero
168
- # if valid_smiles_count == 0:
169
- # return 0.0
170
- # else:
171
- # return total_score / valid_smiles_count
172
-
173
- # def _load_vars(self, weight_path):
174
- # if weight_path.endswith('pickle'):
175
- # import pickle
176
- # with open(weight_path, 'rb') as fid:
177
- # self.vars = pickle.load(fid)
178
- # self.vars = [x.tolist() for x in self.vars]
179
- # elif weight_path.endswith('json.gz'):
180
- # with gzip.GzipFile(weight_path, 'r') as fin: # 4. gzip
181
- # json_bytes = fin.read() # 3. bytes (i.e. UTF-8)
182
- # json_str = json_bytes.decode('utf-8') # 2. string (i.e. JSON)
183
- # self.vars = json.loads(json_str)
184
- # self.vars = [np.array(x) for x in self.vars]
185
-
186
-
187
-
188
-
189
-
190
- # from myscscore.SCScore import SCScorer
191
- # import pandas as pd
192
-
193
- # model = SCScorer()
194
- # model.restore()
195
- # # import evaluate
196
- # # molgenevalmetric = evaluate.load("saicharan2804/molgenevalmetric")
197
-
198
- # df = pd.read_csv('/home/saicharan/Downloads/chembl_10000.csv')
199
-
200
  # ls= df['SMILES'].tolist()
201
- # ls_gen = ls[0:5000]
202
- # ls_train = ls[5000:10000]
203
 
204
  # print('computing')
205
- # average_score = model.get_avg_score(ls_gen)
 
 
 
 
 
206
 
207
- # # Print the average score
208
- # print('Average score:', average_score)
209
- # # print(molgenevalmetric.compute(gensmi = ls_gen, trainsmi = ls_train))
 
 
 
 
 
1
  import evaluate
2
+ from evaluate.utils import launch_gradio_widget
3
+ import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ module = evaluate.load("saicharan2804/molgenevalmetric")
6
+ # launch_gradio_widget(module)
 
7
 
8
+ iface = gr.Interface(
9
+ fn = module.compute,
10
+ inputs=[
11
+ gr.File(label="Generated SMILES"),
12
+ gr.File(label="Training Data", value=None),
13
+ ],
14
+ outputs="text"
15
+ )
16
 
17
+ iface.launch()
 
 
 
 
 
 
 
18
 
 
19
 
20
  # import pandas as pd
21
+ # from molgenevalmetric import penalized_logp
22
+ # import evaluate
23
 
24
+ # df = pd.read_csv('/Users/saicharan/chembl_10000.csv')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # ls= df['SMILES'].tolist()
26
+ # ls_gen = ls[0:500]
27
+ # ls_train = ls[500:1000]
28
 
29
  # print('computing')
30
+ # print(penalized_logp(gen=ls_gen))
31
+ # print(SYBAscore(gen=ls_gen))
32
+ # print(qed_metric(gen=ls_gen))
33
+ # print(logP_metric(gen=ls_gen))
34
+ # print(average_sascore(gen=ls_gen))
35
+ # print(oracles(gen=ls_gen, train=ls_train))
36
 
37
+ # met = evaluate.load("saicharan2804/molgenevalmetric")
38
+ # print(met.compute(gensmi = ls_gen, trainsmi = ls_train))
 
molgenevalmetric.py CHANGED
@@ -2,39 +2,24 @@
2
  import evaluate
3
  import datasets
4
  import pandas as pd
5
- from tdc import Evaluator
6
- from tdc import Oracle
7
- from rdkit.Chem.QED import qed
8
- from rdkit.Chem.Crippen import MolLogP
9
- import os
10
- from collections import Counter
11
- from functools import partial
12
  import numpy as np
13
- import pandas as pd
14
  import scipy.sparse
15
  import torch
 
 
 
 
 
 
 
16
  from rdkit import Chem
17
- from rdkit.Chem import AllChem
18
  from rdkit.Chem import MACCSkeys
19
  from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect as Morgan
20
  from rdkit.Chem.QED import qed
21
- from rdkit.Chem.Scaffolds import MurckoScaffold
22
- from rdkit.Chem import Descriptors
23
- from multiprocessing import Pool
24
- from collections import UserList, defaultdict
25
- import numpy as np
26
- import pandas as pd
27
- from rdkit import rdBase
28
  from rdkit.Contrib.SA_Score import sascorer
29
- import sys
30
- from rdkit.Chem import RDConfig
31
- import os
32
- import pandas as pd
33
- from fcd_torch import FCD
34
- from syba.syba import SybaClassifier
35
 
 
36
  from myscscore.SCScore import SCScorer
37
- import warnings
38
 
39
 
40
  def get_mol(smiles_or_mol):
@@ -196,7 +181,7 @@ def calculate_sa_score(smiles):
196
  Returns:
197
  - float: SA score of the molecule, or None if the molecule couldn't be created.
198
  """
199
- mol = Chem.MolFromSmiles(smiles)
200
  if mol:
201
  return sascorer.calculateScore(mol)
202
  else:
@@ -431,7 +416,7 @@ def qed_metric(gen):
431
  qed_scores = []
432
  for smiles in gen:
433
  try:
434
- mol = Chem.MolFromSmiles(smiles)
435
  if mol: # Ensure molecule is valid
436
  qed_scores.append(qed(mol))
437
  except Exception as e:
@@ -461,7 +446,7 @@ def logP_metric(gen):
461
  logP_values = []
462
  for smiles in gen:
463
  try:
464
- mol = Chem.MolFromSmiles(smiles)
465
  if mol: # Ensure molecule is valid
466
  logP_values.append(MolLogP(mol))
467
  except Exception as e:
@@ -473,45 +458,24 @@ def logP_metric(gen):
473
  else:
474
  return 0.0 # Return 0 or suitable value if no valid molecules are processed
475
 
476
-
477
- def oracles(gen, train):
478
-
479
  """
480
- Computes scores from various oracles for a list of generated molecules.
481
 
482
  Parameters:
483
- - gen (List[str]): List of generated SMILES strings.
484
- - train (List[str]): List of training set SMILES strings.
485
 
486
  Returns:
487
- - Dict[str, Any]: A dictionary with oracle names as keys and their corresponding scores as values.
488
- """
489
-
490
- result = {}
491
-
492
- # oracle_list = [
493
- # 'QED', 'MPO', 'GSK3B', 'JNK3',
494
- # 'DRD2', 'LogP', 'Rediscovery', 'Similarity',
495
- # 'Median', 'Isomers', 'Valsartan_SMARTS', 'Hop'
496
- # ]
497
-
498
- oracle_list = ['QED', 'LogP', 'SA']
499
-
500
- for oracle_name in oracle_list:
501
- # print(oracle_name)
502
- oracle = Oracle(name=oracle_name)
503
- if oracle_name in ['Rediscovery', 'MPO', 'Similarity', 'Median', 'Isomers', 'Hop']:
504
- score = oracle(gen)
505
- if isinstance(score, dict):
506
- score = {key: sum(values)/len(values) for key, values in score.items()}
507
- else:
508
- score = oracle(gen)
509
- if isinstance(score, list):
510
- score = sum(score) / len(score)
511
 
512
- result[f"{oracle_name}"] = score
513
-
514
- return result
515
 
516
 
517
 
@@ -533,33 +497,7 @@ Returns:
533
 
534
 
535
  _CITATION = """
536
- @article{DBLP:journals/corr/abs-1811-12823,
537
- author = {Daniil Polykovskiy and
538
- Alexander Zhebrak and
539
- Benjam{\'{\i}}n S{\'{a}}nchez{-}Lengeling and
540
- Sergey Golovanov and
541
- Oktai Tatanov and
542
- Stanislav Belyaev and
543
- Rauf Kurbanov and
544
- Aleksey Artamonov and
545
- Vladimir Aladinskiy and
546
- Mark Veselov and
547
- Artur Kadurin and
548
- Sergey I. Nikolenko and
549
- Al{\'{a}}n Aspuru{-}Guzik and
550
- Alex Zhavoronkov},
551
- title = {Molecular Sets {(MOSES):} {A} Benchmarking Platform for Molecular
552
- Generation Models},
553
- journal = {CoRR},
554
- volume = {abs/1811.12823},
555
- year = {2018},
556
- url = {http://arxiv.org/abs/1811.12823},
557
- eprinttype = {arXiv},
558
- eprint = {1811.12823},
559
- timestamp = {Fri, 26 Nov 2021 15:34:30 +0100},
560
- biburl = {https://dblp.org/rec/journals/corr/abs-1811-12823.bib},
561
- bibsource = {dblp computer science bibliography, https://dblp.org}
562
- }
563
  """
564
 
565
 
@@ -582,7 +520,7 @@ class molgenevalmetric(evaluate.Metric):
582
  }
583
  ),
584
 
585
- reference_urls=["https://github.com/molecularsets/moses", "https://tdcommons.ai/functions/oracles/"],
586
  )
587
 
588
  def _compute(self, gensmi, trainsmi):
@@ -595,46 +533,11 @@ class molgenevalmetric(evaluate.Metric):
595
  metrics['FCD'] = fcd_metric(gen = gensmi, train = trainsmi)
596
  metrics['QED'] = qed_metric(gen=gensmi)
597
  metrics['LogP'] = logP_metric(gen=gensmi)
 
598
  metrics['SA'] = average_sascore(gen=gensmi)
599
- metrics['SCS'] = synthetic_complexity_score(gen=gensmi)
600
  metrics['SYBA'] = SYBAscore(gen=gensmi)
601
- metrics['Oracles'] = oracles(gen = gensmi, train = trainsmi)
602
 
603
  return metrics
604
 
605
-
606
- # generated_smiles = [s for s in generated_smiles if s != '']
607
-
608
- # evaluator = Evaluator(name = 'KL_Divergence')
609
- # KL_Divergence = evaluator(generated_smiles, train_smiles)
610
-
611
- # Results.update({
612
- # "KL_Divergence": KL_Divergence,
613
- # })
614
-
615
-
616
- # oracle_list = [
617
- # 'QED', 'SA', 'MPO', 'GSK3B', 'JNK3',
618
- # 'DRD2', 'LogP', 'Rediscovery', 'Similarity',
619
- # 'Median', 'Isomers', 'Valsartan_SMARTS', 'Hop'
620
- # ]
621
-
622
- # for oracle_name in oracle_list:
623
- # oracle = Oracle(name=oracle_name)
624
- # if oracle_name in ['Rediscovery', 'MPO', 'Similarity', 'Median', 'Isomers', 'Hop']:
625
- # score = oracle(generated_smiles)
626
- # if isinstance(score, dict):
627
- # score = {key: sum(values)/len(values) for key, values in score.items()}
628
- # else:
629
- # score = oracle(generated_smiles)
630
- # if isinstance(score, list):
631
- # score = sum(score) / len(score)
632
-
633
- # Results.update({f"{oracle_name}": score})
634
-
635
- # # keys_to_remove = ["FCD/TestSF", "SNN/TestSF", "Frag/TestSF", "Scaf/TestSF"]
636
- # # for key in keys_to_remove:
637
- # # Results.pop(key, None)
638
-
639
- # return {"results": Results}
640
-
 
2
  import evaluate
3
  import datasets
4
  import pandas as pd
 
 
 
 
 
 
 
5
  import numpy as np
 
6
  import scipy.sparse
7
  import torch
8
+ import warnings
9
+ from multiprocessing import Pool
10
+ from functools import partial
11
+ from fcd_torch import FCD
12
+
13
+ from tdc import Oracle
14
+ from rdkit.Chem.Crippen import MolLogP
15
  from rdkit import Chem
 
16
  from rdkit.Chem import MACCSkeys
17
  from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect as Morgan
18
  from rdkit.Chem.QED import qed
 
 
 
 
 
 
 
19
  from rdkit.Contrib.SA_Score import sascorer
 
 
 
 
 
 
20
 
21
+ from syba.syba import SybaClassifier
22
  from myscscore.SCScore import SCScorer
 
23
 
24
 
25
  def get_mol(smiles_or_mol):
 
181
  Returns:
182
  - float: SA score of the molecule, or None if the molecule couldn't be created.
183
  """
184
+ mol = get_mol(smiles)
185
  if mol:
186
  return sascorer.calculateScore(mol)
187
  else:
 
416
  qed_scores = []
417
  for smiles in gen:
418
  try:
419
+ mol = get_mol(smiles)
420
  if mol: # Ensure molecule is valid
421
  qed_scores.append(qed(mol))
422
  except Exception as e:
 
446
  logP_values = []
447
  for smiles in gen:
448
  try:
449
+ mol = get_mol(smiles)
450
  if mol: # Ensure molecule is valid
451
  logP_values.append(MolLogP(mol))
452
  except Exception as e:
 
458
  else:
459
  return 0.0 # Return 0 or suitable value if no valid molecules are processed
460
 
461
+ def penalized_logp(gen):
 
 
462
  """
463
+ Computes the average penalized logP value, as scored by PyTDC's 'LogP' oracle, for a list of SMILES strings.
464
 
465
  Parameters:
466
+ - gen (List[str]): List of SMILES strings representing the molecules.
 
467
 
468
  Returns:
469
+ - float: Average penalized logP value for the list of molecules.
470
+ """
471
+ oracle = Oracle('LogP')
472
+
473
+ score = oracle(gen)
474
+ if isinstance(score, list):
475
+ score = sum(score) / len(score)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
 
477
+ return score
478
+
 
479
 
480
 
481
 
 
497
 
498
 
499
  _CITATION = """
500
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  """
502
 
503
 
 
520
  }
521
  ),
522
 
523
+ reference_urls=["https://github.com/molecularsets/moses", "https://tdcommons.ai/functions/oracles/", "https://github.com/lich-uct/syba", "https://github.com/connorcoley/scscore"],
524
  )
525
 
526
  def _compute(self, gensmi, trainsmi):
 
533
  metrics['FCD'] = fcd_metric(gen = gensmi, train = trainsmi)
534
  metrics['QED'] = qed_metric(gen=gensmi)
535
  metrics['LogP'] = logP_metric(gen=gensmi)
536
+ metrics['Penalized LogP'] = penalized_logp(gen=gensmi)
537
  metrics['SA'] = average_sascore(gen=gensmi)
538
+ metrics['SCScore'] = synthetic_complexity_score(gen=gensmi)
539
  metrics['SYBA'] = SYBAscore(gen=gensmi)
540
+ # metrics['Oracles'] = oracles(gen = gensmi, train = trainsmi)
541
 
542
  return metrics
543