Spaces:
Runtime error
Runtime error
saicharan2804
committed on
Commit
•
6b58f50
1
Parent(s):
d235aee
Added token IDs
Browse files- SmilesPeTokenizer.py +25 -4
- app.py +1 -1
SmilesPeTokenizer.py
CHANGED
@@ -1,12 +1,33 @@
|
|
1 |
import codecs
|
2 |
from SmilesPE.tokenizer import *
|
3 |
|
4 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
|
|
|
|
7 |
spe = SPE_Tokenizer(spe_vob)
|
8 |
-
|
|
|
9 |
tokenized = spe.tokenize(smiles_string)
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
|
|
|
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import codecs
|
2 |
from SmilesPE.tokenizer import *
|
3 |
|
4 |
+
def load_vocabulary_to_dict(vocabulary_path):
    """Build a token -> ID mapping from a vocabulary file.

    The file is read as UTF-8, one entry per line. The first
    whitespace-separated field of each line is taken as the token, and the
    zero-based line number is used as its ID.

    Blank or whitespace-only lines are skipped (they previously raised
    ``IndexError``). They still count towards the line numbering, so the IDs
    of all other tokens are the same as before. If a token appears as the
    first field on several lines, the last occurrence wins.

    Args:
        vocabulary_path: Path to the vocabulary text file.

    Returns:
        dict mapping each token string to its integer ID.
    """
    vocab_dict = {}
    with codecs.open(vocabulary_path, 'r', 'utf-8') as file:
        for index, line in enumerate(file):
            fields = line.split()
            if not fields:
                # Nothing to index on an empty line (e.g. trailing newline).
                continue
            # Assuming the first item on the line is the token.
            vocab_dict[fields[0]] = index
    return vocab_dict
|
11 |
|
12 |
+
def smilespe_tokenizer(smiles_string, vocab_dict,
                       vocabulary_path='chembl_smiles_tokenizer30000.txt'):
    """Tokenize a SMILES string with SmilesPE and map the tokens to IDs.

    Args:
        smiles_string: The SMILES string to tokenize.
        vocab_dict: Mapping from token string to integer ID (for example the
            result of ``load_vocabulary_to_dict``).
        vocabulary_path: Path to the SPE vocabulary/codes file handed to
            ``SPE_Tokenizer``. Defaults to the file name that was previously
            hard-coded, so existing two-argument calls behave the same.

    Returns:
        A ``(tokenized, token_ids)`` tuple. ``tokenized`` is whatever
        ``SPE_Tokenizer.tokenize`` returned, unchanged. ``token_ids`` is the
        list of IDs for the tokens that are present in ``vocab_dict``; tokens
        missing from ``vocab_dict`` are silently dropped.
    """
    # Use a context manager so the vocabulary file handle is closed instead
    # of leaking on every call.
    # NOTE(review): this relies on SPE_Tokenizer reading the codes eagerly in
    # its constructor -- confirm against the installed SmilesPE version.
    with codecs.open(vocabulary_path, 'r', 'utf-8') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    # Tokenize the SMILES string.
    tokenized = spe.tokenize(smiles_string)

    # SmilesPE returns the tokens as one space-separated string; iterating
    # that string directly would walk characters, not tokens. Split it first,
    # while still accepting an already-split sequence.
    if isinstance(tokenized, str):
        tokens = tokenized.split()
    else:
        tokens = list(tokenized)

    # Convert tokens to IDs using the vocab_dict.
    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]

    return tokenized, token_ids
|
24 |
|
25 |
+
# Load the vocabulary into a dictionary
|
26 |
+
# vocab_path = 'chembl_smiles_tokenizer30000.txt'
|
27 |
+
# vocab_dict = load_vocabulary_to_dict(vocab_path)
|
28 |
|
29 |
+
# # Example usage
|
30 |
+
# smiles_string = 'Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1'
|
31 |
+
# tokens, token_ids = smilespe_tokenizer(smiles_string, vocab_dict)
|
32 |
+
# print("Tokens:", tokens)
|
33 |
+
# print("Token IDs:", token_ids)
|
app.py
CHANGED
@@ -6,7 +6,7 @@ iface = gr.Interface(
|
|
6 |
inputs=[
|
7 |
gr.Textbox(label="SMILES"),
|
8 |
],
|
9 |
-
outputs="text"
|
10 |
)
|
11 |
|
12 |
iface.launch()
|
|
|
6 |
inputs=[
|
7 |
gr.Textbox(label="SMILES"),
|
8 |
],
|
9 |
+
outputs=["text", "text"]
|
10 |
)
|
11 |
|
12 |
iface.launch()
|