Jonghyun Lee committed on
Commit
bb74661
1 Parent(s): 3ca94a7

add tokenizer

Browse files
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "[SOS]", "eos_token": "[EOS]", "unk_token": "[UNK]", "pad_token": "[PAD]", "mask_token": "[MASK]"}
tokenizer.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[SOS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[EOS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[PAD]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":3,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":4,"special":true,"content":"[MASK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":5,"special":true,"content":"[CLS_1]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":6,"special":true,"content":"[CLS_2]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":7,"special":true,"content":"[CLS_3]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":8,"special":true,"content":"[CLS_4]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":9,"special":true,"content":"[CLS_5]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":10,"special":true,"content":"[CLS_6]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Split","pattern":{"Regex":"\n Si|Mg|Ca|Fe|As|Al|Cl|Br|[#%\\)\\(\\+\\-1032547698:=@CBFIHONPS\\[\\]icosn]|/|\\\\\n"},"behavior":"Isolated","invert":false},"post_processor":{"type":"TemplateProcessing","single":[{"SpecialToken":{"id":"[SOS]","type_id":0}},{"Sequence":{"id":"A","type_id":0}},{"SpecialToken":{"id":"[EOS]","type_id":0}}],"pair":[{"Sequence":{"id":"A","type_id":0}},{"Sequence":{"id":"B","type_id":1}}],"special_tokens":{"[EOS]":{"id":"[EOS]","ids":[1],"tokens":["[EOS]"]},"[SOS]":{"id":"[SOS]","ids":[0],"tokens":["[SOS]"]}}},"decoder":null,"model":{"type":"WordLevel","vocab":{"[SOS]":0,"[EOS]":1,"[PAD]":2,"[UNK]":3,"[MASK]":4,"[CLS_1]":5,"[CLS_2]":6,"[CLS_3]":7,"[CLS_4]":8,"[CLS_5]":9,"[CLS_6]":10,"c":11,"C":12,")":13,"(":14,"O":15,"1":16,"2":17,"=":18,"N":19,"@":20,"[":21,"]":22,"n":23,"3":24,"H":25,"F":26,"4":27,"-":28,"S":29,"Cl":30,"/":31,"s":32,"o":33,"5":34,"+":35,"#":36,".":37,"Br":38,"\\":39,"6":40,"P":41,"I":42,"7":43,"a":44,"%":45,"8":46,"B":47,"i":48,"9":49,"e":50,"0":51,"K":52,"L":53,"As":54,"Z":55,"Ca":56,"Te":57,"Mg":58,"Al":59,"te":60,"Ag":61,"p":62,"r":63,"Rb":64,"At":65,"b":66,"Ra":67,"Xe":68,"Kr":69},"unk_token":"[UNK]"}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_max_length": 512, "padding_side": "right", "truncation_side": "left", "bos_token": "[SOS]", "eos_token": "[EOS]", "pad_token": "[PAD]", "unk_token": "[UNK]", "mask_token": "[MASK]", "special_tokens_map_file": "tokenizer/special_tokens_map.json", "name_or_path": "tokenizer", "tokenizer_class": "PreTrainedTokenizerFast"}