eduardosoares99 vshirasuna committed on
Commit
e131702
1 Parent(s): ce89322

Add smiles normalization (#11)

Browse files

- Added smiles normalization (d823faeb77bf4d6ea7f0795d5cebd5ff2fb32344)


Co-authored-by: Victor Yukio Shirasuna <vshirasuna@users.noreply.huggingface.co>

smi-ted/inference/smi_ted_large/load.py CHANGED
@@ -19,6 +19,12 @@ from transformers import BertTokenizer
19
  import numpy as np
20
  import pandas as pd
21
 
 
 
 
 
 
 
22
  # Standard library
23
  from functools import partial
24
  import regex as re
@@ -29,6 +35,17 @@ from tqdm import tqdm
29
  tqdm.pandas()
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
32
  class MolTranBertTokenizer(BertTokenizer):
33
  def __init__(self, vocab_file: str = '',
34
  do_lower_case=False,
@@ -476,9 +493,13 @@ class Smi_ted(nn.Module):
476
  if self.is_cuda_available:
477
  self.encoder.cuda()
478
  self.decoder.cuda()
 
 
 
 
479
 
480
  # tokenizer
481
- idx, mask = self.tokenize(smiles)
482
 
483
  ###########
484
  # Encoder #
@@ -547,6 +568,7 @@ class Smi_ted(nn.Module):
547
 
548
  # handle single str or a list of str
549
  smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
 
550
  n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
551
 
552
  # process in batches
 
19
  import numpy as np
20
  import pandas as pd
21
 
22
+ # Chemistry
23
+ from rdkit import Chem
24
+ from rdkit.Chem import PandasTools
25
+ from rdkit.Chem import Descriptors
26
+ PandasTools.RenderImagesInAllDataFrames(True)
27
+
28
  # Standard library
29
  from functools import partial
30
  import regex as re
 
35
  tqdm.pandas()
36
 
37
 
38
# function to canonicalize SMILES
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return the RDKit-canonicalized form of a SMILES string.

    Parameters:
        smi: input SMILES string to normalize.
        canonical: forwarded to ``Chem.MolToSmiles`` (canonical output order).
        isomeric: forwarded as ``isomericSmiles`` (keep stereo/isotope info).

    Returns:
        The normalized SMILES string, or ``None`` if the input cannot be
        parsed or serialized by RDKit.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # RDKit signals an unparsable SMILES by returning None rather than
        # raising — check explicitly instead of relying on a downstream error.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not silently swallowed; any RDKit failure still yields None.
        return None
47
+
48
+
49
  class MolTranBertTokenizer(BertTokenizer):
50
  def __init__(self, vocab_file: str = '',
51
  do_lower_case=False,
 
493
  if self.is_cuda_available:
494
  self.encoder.cuda()
495
  self.decoder.cuda()
496
+
497
+ # handle single str or a list of str
498
+ smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
499
+ smiles = smiles.apply(normalize_smiles)
500
 
501
  # tokenizer
502
+ idx, mask = self.tokenize(smiles.to_list())
503
 
504
  ###########
505
  # Encoder #
 
568
 
569
  # handle single str or a list of str
570
  smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
571
+ smiles = smiles.apply(normalize_smiles)
572
  n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
573
 
574
  # process in batches
smi-ted/inference/smi_ted_light/load.py CHANGED
@@ -19,6 +19,12 @@ from transformers import BertTokenizer
19
  import numpy as np
20
  import pandas as pd
21
 
 
 
 
 
 
 
22
  # Standard library
23
  from functools import partial
24
  import regex as re
@@ -29,6 +35,17 @@ from tqdm import tqdm
29
  tqdm.pandas()
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
32
  class MolTranBertTokenizer(BertTokenizer):
33
  def __init__(self, vocab_file: str = '',
34
  do_lower_case=False,
@@ -476,9 +493,13 @@ class Smi_ted(nn.Module):
476
  if self.is_cuda_available:
477
  self.encoder.cuda()
478
  self.decoder.cuda()
 
 
 
 
479
 
480
  # tokenizer
481
- idx, mask = self.tokenize(smiles)
482
 
483
  ###########
484
  # Encoder #
@@ -547,6 +568,7 @@ class Smi_ted(nn.Module):
547
 
548
  # handle single str or a list of str
549
  smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
 
550
  n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
551
 
552
  # process in batches
 
19
  import numpy as np
20
  import pandas as pd
21
 
22
+ # Chemistry
23
+ from rdkit import Chem
24
+ from rdkit.Chem import PandasTools
25
+ from rdkit.Chem import Descriptors
26
+ PandasTools.RenderImagesInAllDataFrames(True)
27
+
28
  # Standard library
29
  from functools import partial
30
  import regex as re
 
35
  tqdm.pandas()
36
 
37
 
38
# function to canonicalize SMILES
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return the RDKit-canonicalized form of a SMILES string.

    Parameters:
        smi: input SMILES string to normalize.
        canonical: forwarded to ``Chem.MolToSmiles`` (canonical output order).
        isomeric: forwarded as ``isomericSmiles`` (keep stereo/isotope info).

    Returns:
        The normalized SMILES string, or ``None`` if the input cannot be
        parsed or serialized by RDKit.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # RDKit signals an unparsable SMILES by returning None rather than
        # raising — check explicitly instead of relying on a downstream error.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not silently swallowed; any RDKit failure still yields None.
        return None
47
+
48
+
49
  class MolTranBertTokenizer(BertTokenizer):
50
  def __init__(self, vocab_file: str = '',
51
  do_lower_case=False,
 
493
  if self.is_cuda_available:
494
  self.encoder.cuda()
495
  self.decoder.cuda()
496
+
497
+ # handle single str or a list of str
498
+ smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
499
+ smiles = smiles.apply(normalize_smiles)
500
 
501
  # tokenizer
502
+ idx, mask = self.tokenize(smiles.to_list())
503
 
504
  ###########
505
  # Encoder #
 
568
 
569
  # handle single str or a list of str
570
  smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
571
+ smiles = smiles.apply(normalize_smiles)
572
  n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
573
 
574
  # process in batches