ncfrey committed on
Commit
0164ca1
1 Parent(s): 8423575

add tokenizer

Browse files
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {}
tokenizer.json ADDED
@@ -0,0 +1 @@
 
1
+ {"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[PAD]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[CLS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":3,"special":true,"content":"[SEP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":4,"special":true,"content":"[MASK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Split","pattern":{"Regex":"\\[(.*?)\\]"},"behavior":"Isolated","invert":false},"post_processor":{"type":"TemplateProcessing","single":[{"SpecialToken":{"id":"[CLS]","type_id":0}},{"Sequence":{"id":"A","type_id":0}},{"SpecialToken":{"id":"[SEP]","type_id":0}}],"pair":[{"SpecialToken":{"id":"[CLS]","type_id":0}},{"Sequence":{"id":"A","type_id":0}},{"SpecialToken":{"id":"[SEP]","type_id":0}},{"Sequence":{"id":"B","type_id":1}},{"SpecialToken":{"id":"[SEP]","type_id":1}}],"special_tokens":{"[CLS]":{"id":"[CLS]","ids":[2],"tokens":["[CLS]"]},"[SEP]":{"id":"[SEP]","ids":[3],"tokens":["[SEP]"]}}},"decoder":{"type":"WordPiece","prefix":"##","cleanup":true},"model":{"type":"WordPiece","unk_token":"[UNK]","continuing_subword_prefix":"##","max_input_chars_per_word":100,"vocab":{"[UNK]":0,"[PAD]":1,"[CLS]":2,"[SEP]":3,"[MASK]":4,"#":5,"+":6,"-":7,"1":8,"2":9,"3":10,"4":11,"=":12,"A":13,"B":14,"C":15,"D":16,"E":17,"F":18,"G":19,"H":20,"I":21,"K":22,"L":23,"M":24,"N":25,"O":26,"P":27,"R":28,"S":29,"T":30,"U":31,"V":32,"W":33,"X":34,"Y":35,"Z":36,"[":37,"]":38,"_":39,"a":40,"b":41,"c":42,"d":43,"e":44,"f":45,"g":46,"h":47,"i":48,"l":49,"m":50,"n":51,"o":52,"p":53,"r":54,"s":55,"t":56,"u":57,"x":58,"y":59,"##T":60,"##c":61,"##e":62,"##x":63,"##p":64,"##l":65,"##]":66,"##m":67,"###":68,"##S":69,"##-":70,"##Z":71,"##r":72,"##=":
73,"##P":74,"##A":75,"##s":76,"##H":77,"##I":78,"##3":79,"##N":80,"##b":81,"##G":82,"##2":83,"##R":84,"##a":85,"##+":86,"##Y":87,"##M":88,"##n":89,"##i":90,"##u":91,"##f":92,"##U":93,"##B":94,"##F":95,"##C":96,"##d":97,"##X":98,"##O":99,"##t":100,"##h":101,"##o":102,"##1":103,"##_":104,"##E":105,"##g":106,"##D":107,"##y":108,"##W":109,"##V":110,"##L":111,"##4":112,"##K":113,"##C]":114,"[C]":115,"##1]":116,"[B":117,"[Br":118,"##ch":119,"##an":120,"[Bran":121,"[Branch":122,"[=":123,"##1_":124,"[Branch1_":125,"[=C]":126,"##Ri":127,"##ng":128,"##Ring":129,"[Ring":130,"##2]":131,"[Ring1]":132,"[Branch1_1]":133,"##O]":134,"##N]":135,"[Branch1_2]":136,"[N]":137,"##2_":138,"[Branch2_":139,"[O]":140,"[=O]":141,"[Ring2]":142,"##l]":143,"[Branch2_1]":144,"##xp":145,"##3]":146,"##exp":147,"##expl]":148,"[=N]":149,"##S]":150,"[S]":151,"[Branch1_3]":152,"[N":153,"##+expl]":154,"[NH":155,"[F":156,"[F]":157,"[Branch2_2]":158,"[Branch2_3]":159,"[#":160,"[C":161,"[Cl]":162,"[#C]":163,"##-expl]":164,"[P":165,"[P]":166,"[O":167,"[O-expl]":168,"[NH+expl]":169,"[E":170,"##l=":171,"##xpl=":172,"[Expl=":173,"[Expl=Ring":174,"##2+expl]":175,"[NH2+expl]":176,"[Expl=Ring1]":177,"[Br]":178,"[NHexpl]":179,"##3+expl]":180,"[NH3+expl]":181,"[N+expl]":182,"[=N":183,"[#N]":184,"[Expl=Ring2]":185,"[=NH":186,"[=S]":187,"[=NH+expl]":188,"[S":189,"[Si":190,"[Siexpl]":191,"[I":192,"[I]":193,"[=N+expl]":194,"[=NH2+expl]":195,"[B]":196,"[N-expl]":197,"[=N-expl]":198,"[S-expl]":199,"[P+expl]":200,"##Hexpl]":201,"##eexpl]":202,"[#N":203,"[#N+expl]":204,"[C-expl]":205,"[=P":206,"[=P]":207,"[Seexpl]":208,"[CHexpl]":209,"##3_":210,"[Branch3_":211,"[Branch3_1]":212,"##H2":213,"##H2expl]":214,"##nexpl]":215,"[Snexpl]":216,"[SiHexpl]":217,"[SiH2expl]":218,"[A":219,"[B-expl]":220,"[S+expl]":221,"##sexpl]":222,"[Asexpl]":223,"[G":224,"##3expl]":225,"[T":226,"[=S":227,"[Geexpl]":228,"[Teexpl]":229,"##H3expl]":230,"[SiH3expl]":231,"[Al":232,"##O+expl]":233,"[C+expl]":234,"[IH2expl]":235,"[=O+expl]":236,"[Alexpl]":237
,"##C-expl]":238,"[=P+expl]":239,"[Ring3]":240,"[#C-expl]":241,"[H":242,"[Hg":243,"[Hgexpl]":244,"[O+expl]":245,"##bexpl]":246,"[=Seexpl]":247,"[CH2expl]":248,"##iexpl]":249,"[PHexpl]":250,"[#P":251,"[#P]":252,"##Wexpl]":253,"##H2-expl]":254,"[I+expl]":255,"[Se":256,"[I-expl]":257,"[=S+expl]":258,"[Sbexpl]":259,"[Pbexpl]":260,"##H+expl]":261,"[Cl":262,"##+3expl]":263,"[=C":264,"[=Wexpl]":265,"[CH2-expl]":266,"[Cl+3expl]":267,"[=Siexpl]":268,"##aexpl]":269,"[=B":270,"[PH+expl]":271,"[Biexpl]":272,"[=B]":273,"[Sn":274,"[Gaexpl]":275,"##H-expl]":276,"[=C+expl]":277,"[Inexpl]":278,"[=A":279,"[=I":280,"[SHexpl]":281,"[#S]":282,"##2expl]":283,"[=Asexpl]":284,"##+2expl]":285,"[=SHexpl]":286,"##uexpl]":287,"[SeHexpl]":288,"##rexpl]":289,"[=IH2expl]":290,"[=R":291,"[Se-expl]":292,"##lexpl]":293,"[As":294,"[Ge":295,"[=Ruexpl]":296,"[BHexpl]":297,"##Vexpl]":298,"[CH-expl]":299,"[=T":300,"[Tlexpl]":301,"##oexpl]":302,"[=Crexpl]":303,"[BH-expl]":304,"[=Vexpl]":305,"[Se+expl]":306,"[=C-expl]":307,"[NH-expl]":308,"[BH2-expl]":309,"[Wexpl]":310,"[=Teexpl]":311,"##e+expl]":312,"##H3":313,"[BH3":314,"[BH3-expl]":315,"[As+expl]":316,"##Moexpl]":317,"[Branch3_2]":318,"[#S":319,"[Sn+expl]":320,"[GeHexpl]":321,"[Al-expl]":322,"[=Si":323,"[R":324,"[=Z":325,"##texpl]":326,"[SnHexpl]":327,"[=SiHexpl]":328,"[AsHexpl]":329,"[Sn+2expl]":330,"[GeH2expl]":331,"[CH+expl]":332,"[Al+expl]":333,"##eHexpl]":334,"[Te+expl]":335,"[Hg+expl]":336,"[=F":337,"[SnH2expl]":338,"[=G":339,"[=PHexpl]":340,"[=Ptexpl]":341,"[=Snexpl]":342,"[#Siexpl]":343,"##r+2expl]":344,"##H2+expl]":345,"##hexpl]":346,"[=Moexpl]":347,"[=I]":348,"[=Feexpl]":349,"[Moexpl]":350,"[OH+expl]":351,"[Si+expl]":352,"[=Sbexpl]":353,"[Cl+expl]":354,"[P-expl]":355,"[Expl=Ring3]":356,"[Sb":357,"[Z":358,"[=Zr+2expl]":359,"[TeHexpl]":360,"[Vexpl]":361,"[#O+expl]":362,"[=Zrexpl]":363,"[=Geexpl]":364,"[M":365,"[X":366,"##dexpl]":367,"[=O":368,"[Poexpl]":369,"[PH2+expl]":370,"[Al+2expl]":371,"[SnH3expl]":372,"[Sn+3expl]":373,"[=Rhexpl]":374,"[#SH
expl]":375,"[Ruexpl]":376,"[Bi":377,"[=Se+expl]":378,"[Br+2expl]":379,"[Feexpl]":380,"[Crexpl]":381,"[I+2expl]":382,"[#P+expl]":383,"[GeH3expl]":384,"[=H":385,"[#Wexpl]":386,"[Pb":387,"[Si-expl]":388,"[Cl+2expl]":389,"[=B-expl]":390,"[=Auexpl]":391,"[=Pdexpl]":392,"[=Alexpl]":393,"[Xeexpl]":394,"##Uexpl]":395,"[PH2-expl]":396,"[I+3expl]":397,"[K":398,"##cexpl]":399,"##-2expl]":400,"##f+2expl]":401,"[#Moexpl]":402,"[=Niexpl]":403,"[=SH+expl]":404,"[=Inexpl]":405,"[=Ti":406,"[=Tiexpl]":407,"[Reexpl]":408,"[=Znexpl]":409,"[Sb+expl]":410,"[Zrexpl]":411,"[Mnexpl]":412,"[Kexpl]":413,"##gexpl]":414,"[=M":415,"[Ptexpl]":416,"[SiH-expl]":417,"[IH3expl]":418,"[Atexpl]":419,"[Tiexpl]":420,"[Taexpl]":421,"[=Coexpl]":422,"[AsH2expl]":423,"[=Fe+expl]":424,"[=GeHexpl]":425,"[=Osexpl]":426,"[=OH+expl]":427,"[BiH2expl]":428,"[=Ti+2expl]":429,"[L":430,"##Yexpl]":431,"##u+expl]":432,"[Naexpl]":433,"[Osexpl]":434,"[=Nbexpl]":435,"[IHexpl]":436,"[Auexpl]":437,"[=Cuexpl]":438,"[=Irexpl]":439,"[=Ru+expl]":440,"[Liexpl]":441,"[t":442,"##mexpl]":443,"##Ybexpl]":444,"##n-expl]":445,"##4expl]":446,"[Csexpl]":447,"[Caexpl]":448,"[Coexpl]":449,"[OH2+expl]":450,"[In-expl]":451,"[AlHexpl]":452,"[=Biexpl]":453,"[=Taexpl]":454,"[=SiH2expl]":455,"[Raexpl]":456,"[Rhexpl]":457,"[SbHexpl]":458,"[Zr+2expl]":459,"[XeHexpl]":460,"[PbHexpl]":461,"[teexpl]":462,"##sHexpl]":463,"##H4expl]":464,"##a-expl]":465,"##B]":466,"##o+2expl]":467,"[=W":468,"[=Uexpl]":469,"[Niexpl]":470,"[#G":471,"[#C":472,"[#B]":473,"[Cuexpl]":474,"[Prexpl]":475,"[Os":476,"[=Pbexpl]":477,"[Tl":478,"[=S-expl]":479,"[Hf+2expl]":480,"[=Tcexpl]":481,"[Ru":482,"[=Gaexpl]":483,"[Sb-expl]":484,"[Znexpl]":485,"[=Hf+2expl]":486,"[=Hgexpl]":487,"[Pb+2expl]":488,"[=Mo+2expl]":489,"[Uexpl]":490,"##-3expl]":491,"##Reexpl]":492,"##i+expl]":493,"##fexpl]":494,"##o+expl]":495,"##W+expl]":496,"[=U":497,"[=Ybexpl]":498,"[#T":499,"[#Yexpl]":500,"[#Reexpl]":501,"[#W+expl]":502,"[CH2+expl]":503,"[PH-expl]":504,"[Euexpl]":505,"[Ir":506,"[Branch3_3]":507,"[
Ga-expl]":508,"[Thexpl]":509,"[Tcexpl]":510,"[=AsHexpl]":511,"[Rh":512,"[Rbexpl]":513,"[Mn":514,"[Bi+2expl]":515,"[=Hfexpl]":516,"[=Mnexpl]":517,"[=WHexpl]":518,"[#Geexpl]":519,"[W":520,"[Yexpl]":521,"[Ybexpl]":522,"##eH2expl]":523,"##r+expl]":524,"##I]":525,"##a+expl]":526,"##Mo+expl]":527,"##i-expl]":528,"##iHexpl]":529,"##Osexpl]":530,"##Euexpl]":531,"##g+expl]":532,"##VHexpl]":533,"[Baexpl]":534,"[Br+expl]":535,"[Nbexpl]":536,"[Ni-expl]":537,"[Fe":538,"[Fe+expl]":539,"[#Uexpl]":540,"[#I]":541,"[#Mo+expl]":542,"[#Osexpl]":543,"[Cr+expl]":544,"[Pt":545,"[SH2expl]":546,"[SH+expl]":547,"[IH2":548,"[Irexpl]":549,"[=P-expl]":550,"[Agexpl]":551,"[TeH2expl]":552,"[=SeHexpl]":553,"[AlH2expl]":554,"[=CH-expl]":555,"[=Bi+expl]":556,"[Sn-expl]":557,"[=Reexpl]":558,"[=Tlexpl]":559,"[#S-expl]":560,"[#Sbexpl]":561,"[#SiHexpl]":562,"[SbH2expl]":563,"[Mg+expl]":564,"[Bi+expl]":565,"[#Crexpl]":566,"[TlHexpl]":567,"[TlH2expl]":568,"[=UHexpl]":569,"[Rh+expl]":570,"[Mn-2expl]":571,"[IH2+3expl]":572,"[U":573,"[VHexpl]":574,"##mH3expl]":575,"##r-2expl]":576,"##AsHexpl]":577,"##s+2expl]":578,"##H+3expl]":579,"##Hoexpl]":580,"##b+expl]":581,"##+4expl]":582,"##YHexpl]":583,"##nH2expl]":584,"##uH3expl]":585,"##f+4expl]":586,"##d-expl]":587,"##hH2expl]":588,"##o-3expl]":589,"##Dy":590,"##Laexpl]":591,"[Ba+expl]":592,"[=Yexpl]":593,"[=Euexpl]":594,"[=VHexpl]":595,"[=YHexpl]":596,"[Noexpl]":597,"[Ni+expl]":598,"[#Euexpl]":599,"[#AsHexpl]":600,"[#Hoexpl]":601,"[#Dy":602,"[#Laexpl]":603,"[Ceexpl]":604,"[Cdexpl]":605,"[Co-3expl]":606,"[PH2expl]":607,"[Paexpl]":608,"[Pmexpl]":609,"[PH4expl]":610,"[Pd-expl]":611,"[EuH3expl]":612,"[Srexpl]":613,"[Smexpl]":614,"[SmH3expl]":615,"[IH4expl]":616,"[IH+3expl]":617,"[#Ndexpl]":618,"[=PH2expl]":619,"[=PH+expl]":620,"[Acexpl]":621,"[Amexpl]":622,"[Gdexpl]":623,"[Tmexpl]":624,"[Ta-expl]":625,"[ThH2expl]":626,"[=Sb+expl]":627,"[=SnH2expl]":628,"[AlH-expl]":629,"[Al-2expl]":630,"[Hoexpl]":631,"[Hf+4expl]":632,"[Hg-expl]":633,"[HgHexpl]":634,"[#Prexpl]":635,"[
=Ceexpl]":636,"[=Cdexpl]":637,"[=Baexpl]":638,"[=Agexpl]":639,"[AsH4expl]":640,"[=Thexpl]":641,"[=Tmexpl]":642,"[#Seexpl]":643,"[#SH-expl]":644,"[=Si+expl]":645,"[=FeHexpl]":646,"[Zr-2expl]":647,"[Mtexpl]":648,"[Mo+expl]":649,"[=Os+2expl]":650,"[BiHexpl]":651,"[PbH2expl]":652,"[PbH4expl]":653,"[=Ti+expl]":654,"[=W-expl]":655,"[#Gdexpl]":656,"[#Ceexpl]":657,"[#Coexpl]":658,"[Os+expl]":659,"[OsH2expl]":660,"[Os+2expl]":661,"[Os-3expl]":662,"[RuH2expl]":663,"[Ru+3expl]":664,"[Ru+2expl]":665,"[Ru-2expl]":666,"[=U+2expl]":667,"[#Teexpl]":668,"[#Tbexpl]":669,"[#Ta+expl]":670,"[Ir+3expl]":671,"[Ir-2expl]":672,"[Ir-3expl]":673,"[RhHexpl]":674,"[MnHexpl]":675,"[WH2expl]":676,"[W+2expl]":677,"[Fe-expl]":678,"[Fe-2expl]":679,"[Pt-expl]":680,"[Pt-2expl]":681,"[U+3expl]":682,"[#Dyexpl]":683}}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"special_tokens_map_file": "/data1/groups/ai4bio_shared/molecules/tokenizers/pubchem10M_tokenizer/special_tokens_map.json", "name_or_path": "/data1/groups/ai4bio_shared/molecules/tokenizers/pubchem10M_tokenizer/", "tokenizer_class": "PreTrainedTokenizerFast"}