Spaces:

saicharan2804
/

SmilesPeTokenizer

Runtime error

saicharan2804 committed on Feb 27, 2024

Commit

6b58f50

1 Parent(s): d235aee

Added token IDs

Files changed (2) hide show

SmilesPeTokenizer.py CHANGED Viewed

@@ -1,12 +1,33 @@
 import codecs
 from SmilesPE.tokenizer import *
-def smilespe_tokenizer(smiles_string):
-    spe_vob = codecs.open('chembl_smiles_tokenizer30000.txt')
     spe = SPE_Tokenizer(spe_vob)
     tokenized = spe.tokenize(smiles_string)
-    return tokenized

 import codecs
 from SmilesPE.tokenizer import *
def load_vocabulary_to_dict(vocabulary_path):
    """Build a token -> ID mapping from a whitespace-delimited vocabulary file.

    The first whitespace-separated field of each line is used as the token
    and the zero-based line number as its ID. Blank or whitespace-only lines
    are skipped instead of raising ``IndexError``; they still count towards
    the line number, so IDs of the remaining lines are unaffected.

    If the same first field occurs on several lines, the last occurrence
    wins.

    NOTE(review): this assumes the first field of a line is the complete
    token. If the file is a list of merge pairs ("a b" per line), the first
    field is only the left half of a merged token -- confirm the file format.

    Args:
        vocabulary_path: Path to the UTF-8 encoded vocabulary file.

    Returns:
        dict mapping each token string to its integer line index.
    """
    vocab_dict = {}
    with codecs.open(vocabulary_path, 'r', 'utf-8') as file:
        for index, line in enumerate(file):
            fields = line.split()
            if not fields:
                # Empty line (e.g. a trailing newline at end of file).
                continue
            vocab_dict[fields[0]] = index
    return vocab_dict
def smilespe_tokenizer(smiles_string, vocab_dict):
    """Tokenize a SMILES string with SmilesPE and map its tokens to IDs.

    Args:
        smiles_string: SMILES text to tokenize.
        vocab_dict: Mapping from token string to integer ID, e.g. the result
            of ``load_vocabulary_to_dict``.

    Returns:
        A ``(tokenized, token_ids)`` tuple. ``tokenized`` is the value
        returned by ``SPE_Tokenizer.tokenize``, passed through unchanged.
        ``token_ids`` holds the IDs of the tokens found in ``vocab_dict``,
        in order; tokens missing from ``vocab_dict`` are skipped.
    """
    # Use a context manager so the vocabulary file is closed on every call;
    # SmilesPE reads the merge table while the tokenizer is constructed, so
    # the handle is not needed afterwards.
    with codecs.open('chembl_smiles_tokenizer30000.txt', 'r', 'utf-8') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    tokenized = spe.tokenize(smiles_string)

    # SmilesPE returns the tokens as one space-separated string. Iterating
    # that string directly would look up single characters, not tokens, so
    # split it first. Any other iterable is taken as a token sequence.
    if isinstance(tokenized, str):
        tokens = tokenized.split()
    else:
        tokens = list(tokenized)

    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]
    return tokenized, token_ids
+# Load the vocabulary into a dictionary
+# vocab_path = 'chembl_smiles_tokenizer30000.txt'
+# vocab_dict = load_vocabulary_to_dict(vocab_path)
+# # Example usage
+# smiles_string = 'Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1'
+# tokens, token_ids = smilespe_tokenizer(smiles_string, vocab_dict)
+# print("Tokens:", tokens)
+# print("Token IDs:", token_ids)

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ iface = gr.Interface(
     inputs=[
         gr.Textbox(label="SMILES"),
     ],
-    outputs="text"
 )
 iface.launch()

     inputs=[
         gr.Textbox(label="SMILES"),
     ],
+    outputs=["text", "text"]
 )
 iface.launch()