saicharan2804 committed on
Commit
f23bcf0
1 Parent(s): 1fc0c38

Added token IDs

Browse files
Files changed (3) hide show
  1. SmilesPeTokenizer.py +12 -8
  2. app.py +1 -1
  3. chembl_smiles_tokenizer.txt +0 -0
SmilesPeTokenizer.py CHANGED
@@ -1,12 +1,16 @@
1
- import codecs
2
- from SmilesPE.tokenizer import *
3
 
4
- def smilespe_tokenizer(smiles_string):
 
 
5
 
6
- spe_vob = codecs.open('chembl_smiles_tokenizer30000.txt')
7
- spe = SPE_Tokenizer(spe_vob)
8
 
9
- tokenized = spe.tokenize(smiles_string)
10
-
11
- return tokenized
12
 
 
 
 
 
 
1
+ from tokenizers import Tokenizer
 
2
 
3
+ def bpe_tokenizer(smiles_string):
4
+ # Load the tokenizer from the saved file
5
+ tokenizer = Tokenizer.from_file("chembl_bpe_tokenizer.json")
6
 
7
+ # Tokenize the SMILES string
8
+ encoded_output = tokenizer.encode(smiles_string)
9
 
10
+ # To get the tokenized output as text
11
+ tokens_text = encoded_output.tokens
 
12
 
13
+ # To get the corresponding token IDs
14
+ token_ids = encoded_output.ids
15
+
16
+ return tokens_text, token_ids
app.py CHANGED
@@ -6,7 +6,7 @@ iface = gr.Interface(
6
  inputs=[
7
  gr.Textbox(label="SMILES"),
8
  ],
9
- outputs="text"
10
  )
11
 
12
  iface.launch()
 
6
  inputs=[
7
  gr.Textbox(label="SMILES"),
8
  ],
9
+ outputs=["text", "text"]
10
  )
11
 
12
  iface.launch()
chembl_smiles_tokenizer.txt ADDED
File without changes