Spaces:
Sleeping
Sleeping
saicharan2804
committed on
Commit
·
1cd9d39
1
Parent(s):
cf89a64
Code change
Browse files- AtomwiseTokenizer.py +20 -0
- app.py +2 -4
AtomwiseTokenizer.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def atomwise_tokenizer(smi, exclusive_tokens = None):
    """
    Tokenize a SMILES molecule at atom-level.

    Tokenization rules:
        (1) 'Br' and 'Cl' are two-character tokens.
        (2) A bracketed symbol such as '[C@@H]' or '[nH]' is a single token.
        (3) Ring-bond labels of the form '%NN' are a single token; every other
            recognised atom, bond, branch or digit symbol is a one-character
            token.

    Characters that the pattern does not recognise are skipped silently.

    Args:
        smi: The SMILES string to tokenize.
        exclusive_tokens: The specific bracketed symbols you want to keep,
            e.g. ['[C@@H]', '[nH]']. Every other bracketed token is replaced
            by '[UNK]'. When it is `None` (the default) or empty, no
            replacement is done.

    Returns:
        A list of token strings in the order they appear in `smi`.
    """
    import re

    # Raw string so that the regex escapes (\[, \(, \., ...) are not parsed as
    # invalid Python string escapes, which emit a SyntaxWarning on recent
    # Python versions. `\\` matches one literal backslash (a SMILES bond).
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"

    # With a single capture group, findall already returns a list of strings.
    tokens = re.findall(pattern, smi)

    if not exclusive_tokens:
        return tokens

    # Only bracketed tokens are candidates for masking; everything else is
    # passed through untouched.
    return [
        '[UNK]' if tok.startswith('[') and tok not in exclusive_tokens else tok
        for tok in tokens
    ]
|
app.py
CHANGED
@@ -1,7 +1,5 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
|
3 |
-
|
4 |
-
return "Hello " + name + "!!"
|
5 |
-
|
6 |
-
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
|
7 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from AtomwiseTokenizer import atomwise_tokenizer
|
3 |
|
4 |
+
# Wrap atomwise_tokenizer in a Gradio interface with two text inputs and one
# text output.
# NOTE(review): presumably the second text box supplies `exclusive_tokens` as a
# plain string, whereas the tokenizer's docstring describes a list - confirm
# this is the intended behaviour.
iface = gr.Interface(fn=atomwise_tokenizer, inputs=["text", "text"], outputs="text")
|
|
|
|
|
|
|
5 |
# Start the Gradio app defined above.
iface.launch()
|