pparasurama committed on
Commit
dc58658
1 Parent(s): 8ed89d1

add tokenizer

Browse files
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"pad_token": "[PAD]", "mask_token": "[MASK]"}
tokenizer.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[CLS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SEP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":3,"special":true,"content":"[PAD]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":4,"special":true,"content":"[MASK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"WhitespaceSplit"},"post_processor":{"type":"BertProcessing","sep":["[SEP]",2],"cls":["[CLS]",1]},"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[UNK]":0,"[CLS]":1,"[SEP]":2,"[PAD]":3,"[MASK]":4,"!":5,"\"":6,"#":7,"%":8,"&":9,"'":10,"(":11,")":12,"*":13,"+":14,",":15,"-":16,".":17,"/":18,"0":19,"1":20,"2":21,"3":22,"4":23,"5":24,"6":25,"7":26,"8":27,"9":28,":":29,";":30,"=":31,"?":32,"@":33,"A":34,"B":35,"C":36,"D":37,"E":38,"F":39,"G":40,"H":41,"I":42,"J":43,"K":44,"L":45,"M":46,"N":47,"O":48,"P":49,"Q":50,"R":51,"S":52,"T":53,"U":54,"V":55,"W":56,"X":57,"Y":58,"Z":59,"[":60,"\\":61,"]":62,"`":63,"a":64,"b":65,"c":66,"d":67,"e":68,"f":69,"g":70,"h":71,"i":72,"j":73,"k":74,"l":75,"m":76,"n":77,"o":78,"p":79,"q":80,"r":81,"s":82,"t":83,"u":84,"v":85,"w":86,"x":87,"y":88,"z":89,"}":90,"~":91,"€":92,"‚":93,"ƒ":94,"„":95,"†":96,"‡":97,"ˆ":98,"‰":99,"‹":100,"Œ":101,"‘":102,"“":103,"–":104,"—":105,"˜":106,"™":107,"š":108,"›":109,"œ":110,"ž":111,"Ÿ":112,"¡":113,"¢":114,"£":115,"¤":116,"¥":117,"§":118,"¨":119,"©":120,"«":121,"­":122,"®":123,"¯":124,"°":125,"±":126,"²":127,"³":128,"¶":129,"·":130,"¸":131,"¹":132,"º":133,"»":134,"¼":135,"½":136,"¾":137,"¿":138,"À":139,"Á":140,"Â":141,"Ã":142,"Ä":143,"Å":144,"Æ":1
45,"Ç":146,"È":147,"É":148,"Ê":149,"Ë":150,"Ì":151,"Í":152,"Î":153,"Ï":154,"Ð":155,"Ñ":156,"Ò":157,"Ó":158,"Ô":159,"Õ":160,"Ö":161,"Ø":162,"Ù":163,"Ú":164,"Û":165,"Ü":166,"Ý":167,"ß":168,"à":169,"á":170,"â":171,"ã":172,"ä":173,"å":174,"æ":175,"ç":176,"è":177,"é":178,"ê":179,"ë":180,"ì":181,"í":182,"î":183,"ï":184,"ð":185,"ñ":186,"ò":187,"ó":188,"ô":189,"õ":190,"ö":191,"ø":192,"ù":193,"ú":194,"ü":195,"ý":196,"þ":197,"Ÿ":198,"Μ":199,"ER":200,"an":201,"ar":202,"AN":203,"ON":204,"AR":205,"er":206,"el":207,"IN":208,"LL":209,"en":210,"ri":211,"EN":212,"th":213,"in":214,"on":215,"ch":216,"LE":217,"OR":218,"le":219,"RO":220,"am":221,"CH":222,"RI":223,"ES":224,"mar":225,"jo":226,"SON":227,"li":228,"st":229,"EZ":230,"da":231,"ST":232,"ra":233,"AL":234,"AM":235,"ia":236,"EL":237,"ro":238,"es":239,"il":240,"al":241,"RE":242,"re":243,"sh":244,"CO":245,"AS":246,"CK":247,"TH":248,"TT":249,"ILL":250,"mi":251,"la":252,"ELL":253,"ph":254,"IS":255,"ber":256,"is":257,"RA":258,"MAN":259,"ma":260,"ie":261,"and":262,"ic":263,"ne":264,"or":265,"AND":266,"se":267,"UR":268,"OL":269,"bert":270,"mich":271,"GU":272,"na":273,"car":274,"ine":275,"MI":276,"ly":277,"MA":278,"LA":279,"tt":280,"vi":281,"char":282,"pa":283,"rist":284,"OS":285,"ING":286,"ry":287,"jam":288,"ael":289,"ARD":290,"BER":291,"AD":292,"joh":293,"robert":294,"ART":295,"ka":296,"EY":297,"UN":298,"MC":299,"john":300,"IL":301,"ju":302,"OW":303,"sa":304,"michael":305,"be":306,"ja":307,"ALL":308,"ce":309,"SH":310,"lo":311,"IA":312,"do":313,"wil":314,"DE":315,"james":316,"ara":317,"de":318,"AT":319,"OM":320,"jose":321,"christ":322,"ED":323,"LEY":324,"RU":325,"AV":326,"AU":327,"TON":328,"LO":329,"IT":330,"ta":331,"willi":332,"AY":333,"ste":334,"ric":335,"ol":336,"vid":337,"ge":338,"david":339,"dan":340,"PER":341,"NE":342,"william":343,"ina":344,"di":345,"ALE":346,"im":347,"ni":348,"mary":349,"GH":350,"ERS":351,"OD":352,"us":353,"ald":354,"ed":355,"cy":356,"FF":357,"INS":358,"VER":359,"ana":360,"ren":361,"ca":362,"pat":363,"BRO":364,"S
CH":365,"ter":366,"QU":367,"ANT":368,"ILLI":369,"CE":370,"JO":371,"je":372,"CA":373,"ley":374,"iel":375,"lin":376,"KI":377,"patric":378,"bri":379,"MOR":380,"ran":381,"don":382,"AMS":383,"SMI":384,"US":385,"co":386,"HO":387,"BA":388,"SMITH":389,"ane":390,"MON":391,"chard":392,"BO":393,"richard":394,"HN":395,"ET":396,"RY":397,"ther":398,"DRI":399,"CAR":400,"ke":401,"KER":402,"HER":403,"za":404,"NER":405,"ela":406,"les":407,"joseph":408,"son":409,"BAR":410,"fer":411,"eli":412,"gar":413,"STE":414,"fre":415,"ORD":416,"GO":417,"IER":418,"yl":419,"uel":420,"maria":421,"daniel":422,"KE":423,"BE":424,"MO":425,"oma":426,"vin":427,"ica":428,"GE":429,"GAR":430,"jen":431,"anne":432,"GUEZ":433,"omas":434,"ann":435,"JOHN":436,"WILLI":437,"len":438,"LI":439,"bar":440,"MAR":441,"DER":442,"fran":443,"lau":444,"thomas":445,"PH":446,"oph":447,"sha":448,"MART":449,"SI":450,"RODRI":451,"patricia":452,"lu":453,"TER":454,"ROS":455,"RAN":456,"RODRIGUEZ":457,"MER":458,"thy":459,"LER":460,"HAR":461,"WILLIAMS":462,"IR":463,"tte":464,"ale":465,"opher":466,"BUR":467,"dy":468,"DO":469,"steph":470,"DI":471,"NI":472,"lie":473,"ard":474,"andra":475,"deb":476,"beth":477,"christopher":478,"kar":479,"MARTIN":480,"pau":481,"JOHNSON":482,"mel":483,"WE":484,"jes":485,"lyn":486,"yn":487,"ang":488,"BROW":489,"barb":490,"linda":491,"barbara":492,"GON":493,"SE":494,"charles":495,"jenni":496,"GER":497,"ton":498,"BU":499},"merges":["E R","a n","a r","A N","O N","A R","e r","e l","I N","L L","e n","r i","E N","t h","i n","o n","c h","L E","O R","l e","R O","a m","C H","R I","E S","m ar","j o","S ON","l i","s t","E Z","d a","S T","r a","A L","A M","i a","E L","r o","e s","i l","a l","R E","r e","s h","C O","A S","C K","T H","T T","I LL","m i","l a","E LL","p h","I S","b er","i s","R A","M AN","m a","i e","an d","i c","n e","o r","AN D","s e","U R","O L","ber t","mi ch","G U","n a","c ar","in e","M I","l y","M A","L A","t t","v i","ch ar","p a","ri st","O S","IN G","r y","j am","a el","AR D","B ER","A D","jo 
h","ro bert","AR T","k a","E Y","U N","M C","joh n","I L","j u","O W","s a","mich ael","b e","j a","A LL","c e","S H","l o","I A","d o","w il","D E","jam es","ar a","d e","A T","O M","jo se","ch rist","E D","LE Y","R U","A V","A U","T ON","L O","I T","t a","wil li","A Y","st e","ri c","o l","vi d","g e","da vid","d an","P ER","N E","willi am","in a","d i","A LE","i m","n i","mar y","G H","ER S","O D","u s","al d","e d","c y","F F","IN S","V ER","an a","r en","c a","pa t","B RO","S CH","t er","Q U","AN T","ILL I","C E","J O","j e","C A","le y","i el","l in","K I","pat ric","b ri","M OR","r an","d on","AM S","S MI","U S","c o","H O","B A","SMI TH","an e","M ON","char d","B O","ri chard","H N","E T","R Y","th er","D RI","C AR","k e","K ER","H ER","z a","N ER","el a","le s","jose ph","s on","B AR","f er","el i","g ar","ST E","f re","OR D","G O","I ER","y l","u el","mar ia","dan iel","K E","B E","M O","o ma","v in","ic a","G E","G AR","j en","an ne","GU EZ","oma s","an n","JO HN","W ILLI","l en","L I","b ar","M AR","D ER","f ran","la u","th omas","P H","o ph","sh a","M ART","S I","RO DRI","patric ia","l u","T ER","RO S","R AN","RODRI GUEZ","M ER","th y","L ER","H AR","WILLI AMS","I R","tt e","a le","oph er","B UR","d y","D O","ste ph","D I","N I","li e","ar d","and ra","de b","be th","christ opher","k ar","MART IN","pa u","JOHN SON","m el","W E","j es","ly n","y n","an g","BRO W","bar b","lin da","barb ara","G ON","S E","char les","jen ni","G ER","t on","B U"]}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"return_special_tokens_mask": true, "tokenizer_class": "PreTrainedTokenizerFast"}