Spaces:
Sleeping
Sleeping
saicharan2804
committed on
Commit
•
a843a07
1
Parent(s):
9089432
Added tokens_to_mer
Browse files- KmerTokenizer.py +7 -1
KmerTokenizer.py
CHANGED
@@ -19,6 +19,9 @@ def atomwise_tokenizer(smi, exclusive_tokens = None):
|
|
19 |
tokens[i] = '[UNK]'
|
20 |
return tokens
|
21 |
|
|
|
|
|
|
|
22 |
def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last = False, exclusive_tokens = None):
|
23 |
units = atomwise_tokenizer(smiles, exclusive_tokens = exclusive_tokens) #collect all the atom-wise tokens from the SMILES
|
24 |
if ngram == 1:
|
@@ -29,4 +32,7 @@ def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last = False, exclusive_tok
|
|
29 |
if remove_last:
|
30 |
if len(tokens[-1]) < ngram: #truncate last whole k-mer if the length of the last k-mers is less than ngram.
|
31 |
tokens = tokens[:-1]
|
32 |
-
return tokens
|
|
|
|
|
|
|
|
19 |
tokens[i] = '[UNK]'
|
20 |
return tokens
|
21 |
|
22 |
+
def tokens_to_mer(toks):
    """Merge an iterable of atom-wise tokens back into one k-mer string.

    Parameters
    ----------
    toks : iterable of str
        Tokens (e.g. produced by the atom-wise tokenizer) to concatenate.

    Returns
    -------
    str
        The tokens joined end-to-end with no separator.
    """
    merged = ''.join(tok for tok in toks)
    return merged
|
24 |
+
|
25 |
def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last = False, exclusive_tokens = None):
|
26 |
units = atomwise_tokenizer(smiles, exclusive_tokens = exclusive_tokens) #collect all the atom-wise tokens from the SMILES
|
27 |
if ngram == 1:
|
|
|
32 |
if remove_last:
|
33 |
if len(tokens[-1]) < ngram: #truncate last whole k-mer if the length of the last k-mers is less than ngram.
|
34 |
tokens = tokens[:-1]
|
35 |
+
return tokens
|
36 |
+
|
37 |
+
|
38 |
+
# Smoke test: tokenize a long peptide SMILES string with the default
# settings (ngram=4, stride=1) and print the resulting k-mer tokens.
# NOTE(review): this runs on import; consider guarding with
# `if __name__ == "__main__":` — confirm intended usage.
print(kmer_tokenizer('CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@@H](N)CCSC)[C@@H](C)O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](CC(N)=O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CCCN=C(N)N)C(N)=O'))
|