flboehm committed on
Commit 513b8a0
1 Parent(s): e68d230

add tokenizer

added_tokens.json ADDED
@@ -0,0 +1 @@
+ {"bill clinton": 30568, "bohr": 30641, "lucas parker": 30584, "jason khalipa": 30585, "hubble": 30625, "las vegas": 30523, "the korean war": 30555, "the tonight show": 30598, "the world health organization": 30666, "kathy newman": 30607, "imf": 30665, "lenny bruce": 30599, "the state department": 30653, "devil's island": 30531, "palm beach": 30663, "world war ii": 30552, "kkk": 30597, "nist": 30678, "broadmoor": 30541, "spacex": 30622, "u.s.": 30522, "nobel prize": 30574, "jeffrey epstein": 30626, "world war one": 30534, "the federal reserve": 30583, "hong kong": 30587, "clintons": 30638, "ice cube": 30571, "crossfit games": 30575, "cosby": 30591, "gacy": 30651, "the soviet union": 30586, "george bush": 30613, "pan am": 30624, "spinks": 30646, "the white house": 30634, "national geographic": 30549, "bohemian grove": 30669, "biden": 30635, "new mexico": 30594, "new york times": 30593, "hillary clinton": 30614, "social security": 30529, "french guiana": 30533, "bill cosby": 30596, "marxists": 30636, "ruger": 30539, "the south china sea": 30652, "great britain": 30577, "giza": 30656, "the indian ocean": 30544, "sam briggs": 30588, "seaworld": 30592, "the united nations": 30618, "saddam hussein": 30615, "froning": 30573, "rothschilds": 30674, "roseanne": 30644, "oprah": 30620, "the world trade center": 30551, "the cold war": 30553, "alex jones": 30604, "prince andrew": 30673, "the united states": 30524, "new york city": 30554, "goodfellas": 30600, "dostoyevsky": 30608, "farrakhan": 30676, "north american": 30610, "world war two": 30536, "crossfit": 30562, "south america": 30532, "bill gates": 30565, "the great pyramid": 30655, "jordan peterson": 30558, "the federal government": 30617, "opie": 30632, "brian shaw": 30567, "nagumo": 30525, "paul vario": 30648, "mahabharata": 30659, "los angeles": 30545, "ben smith": 30572, "hitchens": 30639, "nra": 30556, "ben shapiro": 30603, "silicon valley": 30579, "north korea": 30590, "anderson cooper": 30627, "george knapp": 30680, "lazar": 30679, "rich froning": 30570, "amanda barnhart": 30580, "darpa": 30547, "native american": 30612, "british columbia": 30569, "david icke": 30645, "los alamos": 30661, "donald trump": 30601, "trudeau": 30602, "vagos": 30654, "mexicans": 30621, "new york": 30535, "walmart": 30540, "john wick": 30527, "san diego": 30537, "paul mcguire": 30664, "joe biden": 30637, "san francisco": 30560, "rockefellers": 30671, "the central intelligence agency": 30672, "mike tyson": 30628, "bob lazar": 30623, "comey": 30616, "arthur benjamin": 30657, "9/11": 30550, "new zealand": 30557, "fort knox": 30561, "barack obama": 30582, "al-qaeda": 30633, "mark zuckerberg": 30668, "grissom": 30681, "joey diaz": 30595, "the crossfit games": 30564, "barnhart": 30581, "glock": 30528, "caitlyn jenner": 30629, "henry hill": 30647, "the big bang": 30631, "illuminati": 30667, "the bilderberg group": 30670, "xerox": 30658, "the sahara desert": 30660, "freemasons": 30611, "washington dc": 30548, "sam harris": 30606, "albert einstein": 30640, "jimmy burke": 30649, "easter island": 30526, "the north pole": 30576, "new jersey": 30530, "9 11": 30677, "voynich": 30642, "north america": 30538, "the catholic church": 30630, "monsanto": 30578, "the middle east": 30605, "john wayne gacy": 30650, "the new york times": 30609, "economy lube": 30546, "the united states of america": 30542, "karina": 30662, "camille leblanc-bazinet": 30589, "south africa": 30563, "joe rogan": 30559, "elon musk": 30566, "charles bonnet": 30643, "united states": 30543, "the supreme court": 
30619, "flat earth": 30675}
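The IDs above start at 30522, directly after the 30,522-entry base vocabulary of bert-base-uncased (IDs 0–30521), which is consistent with the roughly 160 phrases having been appended with the tokenizer's add_tokens API. A minimal sketch of that workflow (an assumption about how the file was produced, not taken from this commit; the phrase list and save path are placeholders):

```python
# Sketch: appending multi-word phrases to a bert-base-uncased tokenizer so
# that save_pretrained emits an added_tokens.json like the one in this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(len(tokenizer))  # 30522 before anything is added

# Hypothetical subset of the phrases added in this commit.
new_phrases = ["u.s.", "joe rogan", "elon musk", "the united states"]
num_added = tokenizer.add_tokens(new_phrases)
print(num_added)  # 4

# Each phrase now maps to a single ID above the base vocabulary.
print(tokenizer.convert_tokens_to_ids("joe rogan"))  # >= 30522

# save_pretrained writes added_tokens.json alongside vocab.txt,
# special_tokens_map.json and tokenizer_config.json; the exact file set
# depends on the transformers version and slow vs. fast tokenizer.
tokenizer.save_pretrained("./bert-with-added-phrases")  # hypothetical path
```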
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff