saicharan2804 committed
Commit: a843a07
Parent(s): 9089432

Added tokens_to_mer

Files changed (1):
  KmerTokenizer.py  +7 -1
KmerTokenizer.py CHANGED
@@ -19,6 +19,9 @@ def atomwise_tokenizer(smi, exclusive_tokens = None):
                     tokens[i] = '[UNK]'
     return tokens

+def tokens_to_mer(toks):
+    return ''.join(toks)
+
 def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last = False, exclusive_tokens = None):
     units = atomwise_tokenizer(smiles, exclusive_tokens = exclusive_tokens) #collect all the atom-wise tokens from the SMILES
     if ngram == 1:
@@ -29,4 +32,7 @@ def kmer_tokenizer(smiles, ngram=4, stride=1, remove_last = False, exclusive_tok
     if remove_last:
         if len(tokens[-1]) < ngram: #truncate last whole k-mer if the length of the last k-mers is less than ngram.
             tokens = tokens[:-1]
-    return tokens
+    return tokens
+
+
+print(kmer_tokenizer('CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@@H](N)CCSC)[C@@H](C)O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](CC(N)=O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CCCN=C(N)N)C(N)=O'))
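
For reference, a minimal usage sketch of the new helper. This assumes KmerTokenizer.py is importable from the working directory; the short SMILES, the 4-token slice, and the example token list in the comments are illustrative and not taken from the commit:

from KmerTokenizer import atomwise_tokenizer, kmer_tokenizer, tokens_to_mer

smi = 'CC(=O)Oc1ccccc1C(=O)O'                  # aspirin, used here only as a short example
units = atomwise_tokenizer(smi)                # atom-wise tokens, e.g. ['C', 'C', '(', '=', 'O', ')', ...]
print(tokens_to_mer(units[0:4]))               # joins a window of atom-wise tokens into one k-mer string, e.g. 'CC(='
print(kmer_tokenizer(smi, ngram=4, stride=1))  # expected: list of overlapping 4-mers over the whole SMILES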