C10X
/

v23

Model card Files Files and versions

v23 / tokenizer_config.json

C10X's picture

Update tokenizer_config.json

67a21d5 verified 1 day ago

history blame contribute delete

4.07 kB

	{
	"backend": "tokenizers",
	"bos_token": "<\|startoftext\|>",
	"chatml_role_tokens": [
	"<\|system\|>",
	"<\|user\|>",
	"<\|assistant\|>",
	"<\|developer\|>",
	"<\|tool\|>",
	"<\|function\|>"
	],
	"clean_up_tokenization_spaces": false,
	"eos_token": "<\|im_end\|>",
	"expected_vocab_size": 16384,
	"falcon_latex_tokens": [
	"\\blindtext",
	"\\newpage",
	"\\boxed",
	"\\framebox",
	"\\fbox",
	"\\tag",
	"\\nonumber",
	"\\item",
	"\\centering",
	"\\caption",
	"\\includegraphics",
	"\\label",
	"\\multicolumn",
	"\\cline",
	"\\hline",
	"\\end",
	"\\begin",
	"\\tableofcontents",
	"\\maketitle",
	"\\date",
	"\\author",
	"\\title",
	"\\chapter",
	"\\subsubsection",
	"\\subsection",
	"\\section",
	"\\noindent",
	"\\newline",
	"\\par",
	"\\ddot",
	"\\dot",
	"\\bar",
	"\\iff",
	"\\implies",
	"\\neg",
	"\\lor",
	"\\land",
	"\\qquad",
	"\\quad",
	"\\arctan",
	"\\arccos",
	"\\arcsin",
	"\\cot",
	"\\sec",
	"\\csc",
	"\\tan",
	"\\cos",
	"\\sin",
	"\\widetilde",
	"\\widehat",
	"\\complement",
	"\\varnothing",
	"\\tilde",
	"\\hat",
	"\\vec",
	"\\mathfrak",
	"\\mathcal",
	"\\mathbb",
	"\\emptyset",
	"\\bot",
	"\\aleph",
	"\\Re",
	"\\Im",
	"\\ell",
	"\\hbar",
	"\\exists",
	"\\forall",
	"\\partial",
	"\\nabla",
	"\\infty",
	"\\binom",
	"\\overline",
	"\\sqrt",
	"\\tfrac",
	"\\dfrac",
	"\\frac",
	"\\textsc",
	"\\textsf",
	"\\texttt",
	"\\emph",
	"\\underline",
	"\\textit",
	"\\textbf",
	"\\bigoplus",
	"\\bigcap",
	"\\bigcup",
	"\\prod",
	"\\sum",
	"\\oint",
	"\\iiint",
	"\\iint",
	"\\int",
	"\\right",
	"\\left",
	"\\Bigr",
	"\\Bigl",
	"\\bigr",
	"\\bigl",
	"\\rceil",
	"\\lceil",
	"\\rfloor",
	"\\lfloor",
	"\\rangle",
	"\\langle",
	"\\overleftarrow",
	"\\overleftrightarrow",
	"\\overrightarrow",
	"\\Longleftrightarrow",
	"\\longleftrightarrow",
	"\\longrightarrow",
	"\\longleftarrow",
	"\\Longrightarrow",
	"\\Longleftarrow",
	"\\gets",
	"\\to",
	"\\mapsto",
	"\\Updownarrow",
	"\\Downarrow",
	"\\Uparrow",
	"\\updownarrow",
	"\\downarrow",
	"\\uparrow",
	"\\Leftrightarrow",
	"\\Rightarrow",
	"\\Leftarrow",
	"\\leftrightarrow",
	"\\rightarrow",
	"\\leftarrow",
	"\\perp",
	"\\propto",
	"\\ni",
	"\\notin",
	"\\in",
	"\\supseteq",
	"\\supset",
	"\\sqsupseteq",
	"\\sqsubseteq",
	"\\subseteq",
	"\\subset",
	"\\cong",
	"\\approx",
	"\\simeq",
	"\\sim",
	"\\equiv",
	"\\neq",
	"\\geq",
	"\\leq",
	"\\oslash",
	"\\otimes",
	"\\ominus",
	"\\oplus",
	"\\wedge",
	"\\vee",
	"\\sqcup",
	"\\sqcap",
	"\\uplus",
	"\\cup",
	"\\cap",
	"\\cdot",
	"\\bullet",
	"\\circ",
	"\\star",
	"\\ast",
	"\\div",
	"\\times",
	"\\mp",
	"\\pm",
	"\\omega",
	"\\psi",
	"\\chi",
	"\\varphi",
	"\\varsigma",
	"\\phi",
	"\\upsilon",
	"\\tau",
	"\\sigma",
	"\\rho",
	"\\pi",
	"\\xi",
	"\\nu",
	"\\mu",
	"\\lambda",
	"\\kappa",
	"\\iota",
	"\\theta",
	"\\eta",
	"\\zeta",
	"\\epsilon",
	"\\delta",
	"\\gamma",
	"\\beta",
	"\\alpha"
	],
	"fresh_training_vocab_note": "This tokenizer intentionally repurposes obsolete Harmony/control token IDs 2,3,4,7,8,9 as ChatML role tokens. It is intended for from-scratch base-model training; do not use it with old checkpoints or old tokenized caches.",
	"hf_compatibility_note": "Set tokenizer_class to PreTrainedTokenizerFast for vanilla Hugging Face loading; original source used a custom TokenizersBackend name.",
	"is_local": true,
	"local_files_only": false,
	"model_input_names": [
	"input_ids",
	"attention_mask"
	],
	"model_max_length": 1000000000000000019884624838656,
	"pad_token": "<\|im_end\|>",
	"tokenizer_class": "PreTrainedTokenizerFast"
	}