File size: 18,968 Bytes

07b428c

/*
 * tokenizer_reader.h — HuggingFace tokenizer.json Parser
 *
 * Extracts vocabulary, merge rules, and special token IDs from
 * HuggingFace tokenizer.json files for embedding into GGUF.
 *
 * Supports: LLaMA/Mistral BPE tokenizers (sentencepiece-derived)
 */

#ifndef TOKENIZER_READER_H
#define TOKENIZER_READER_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define TOK_MAX_TOKENS    256000   /* Max supported vocab size      */
#define TOK_MAX_MERGES    512000   /* Max supported merge rules     */
#define TOK_MAX_TOKEN_LEN 512      /* Max length of a single token  */

/* Token types matching GGUF tokenizer.ggml.token_type */
typedef enum {
    TOK_TYPE_NORMAL    = 1,
    TOK_TYPE_UNKNOWN   = 2,
    TOK_TYPE_CONTROL   = 3,
    TOK_TYPE_USER_DEF  = 4,
    TOK_TYPE_UNUSED    = 5,
    TOK_TYPE_BYTE      = 6
} TokenType;

typedef struct {
    char   **tokens;         /* Token strings indexed by ID            */
    float   *scores;         /* Token scores/priorities                */
    int32_t *token_types;    /* Token type enum per token              */
    int32_t  vocab_size;     /* Total vocabulary size                  */

    char   **merges;         /* BPE merge rule strings                 */
    int32_t  n_merges;       /* Number of merge rules                  */

    int32_t  bos_id;         /* Beginning of sequence token ID         */
    int32_t  eos_id;         /* End of sequence token ID               */
    int32_t  unk_id;         /* Unknown token ID                       */
    int32_t  pad_id;         /* Padding token ID (-1 if none)          */

    char     model_type[32]; /* "llama", "gpt2", etc.                  */
} TokenizerData;

/* ═══════════════════════════════════════════════════════════════════
 * JSON HELPER — Minimal extraction utilities
 *
 * These are NOT a general JSON parser — they target the specific
 * structure of HuggingFace tokenizer.json files.
 * ═══════════════════════════════════════════════════════════════════ */

/* Skip whitespace */
static inline const char *tok_skip_ws(const char *p) {
    while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++;
    return p;
}

/* Extract a JSON string value starting at the opening quote.
 * Handles basic escape sequences. Returns pointer after closing quote.
 * Copies unescaped string into buf. */
static const char *tok_extract_string(const char *p, char *buf, int buflen)
{
    if (*p != '"') return NULL;
    p++;  /* skip opening quote */

    int i = 0;
    while (*p && *p != '"' && i < buflen - 1) {
        if (*p == '\\' && p[1]) {
            p++;
            switch (*p) {
                case '"':  buf[i++] = '"'; break;
                case '\\': buf[i++] = '\\'; break;
                case '/':  buf[i++] = '/'; break;
                case 'n':  buf[i++] = '\n'; break;
                case 'r':  buf[i++] = '\r'; break;
                case 't':  buf[i++] = '\t'; break;
                case 'u': {
                    /* Parse \uXXXX unicode escape */
                    if (p[1] && p[2] && p[3] && p[4]) {
                        unsigned int cp = 0;
                        char hex[5] = {p[1], p[2], p[3], p[4], 0};
                        cp = (unsigned int)strtoul(hex, NULL, 16);
                        p += 4;
                        /* Encode as UTF-8 */
                        if (cp < 0x80) {
                            buf[i++] = (char)cp;
                        } else if (cp < 0x800) {
                            if (i + 1 < buflen - 1) {
                                buf[i++] = (char)(0xC0 | (cp >> 6));
                                buf[i++] = (char)(0x80 | (cp & 0x3F));
                            }
                        } else {
                            if (i + 2 < buflen - 1) {
                                buf[i++] = (char)(0xE0 | (cp >> 12));
                                buf[i++] = (char)(0x80 | ((cp >> 6) & 0x3F));
                                buf[i++] = (char)(0x80 | (cp & 0x3F));
                            }
                        }
                    }
                    break;
                }
                default: buf[i++] = *p; break;
            }
        } else {
            buf[i++] = *p;
        }
        p++;
    }
    buf[i] = '\0';

    if (*p == '"') p++;  /* skip closing quote */
    return p;
}

/* Find a key in JSON and return pointer to its value */
static const char *tok_find_key(const char *json, const char *key)
{
    char search[TOK_MAX_TOKEN_LEN + 4];
    snprintf(search, sizeof(search), "\"%s\"", key);

    const char *p = strstr(json, search);
    if (!p) return NULL;

    p += strlen(search);
    p = tok_skip_ws(p);
    if (*p == ':') p++;
    p = tok_skip_ws(p);
    return p;
}

/* ═══════════════════════════════════════════════════════════════════
 * VOCAB PARSER — Extract "model": { "vocab": { ... } }
 * ═══════════════════════════════════════════════════════════════════ */

static int tok_parse_vocab(const char *json, TokenizerData *td)
{
    /* Find "vocab" key inside "model" object */
    const char *model_p = tok_find_key(json, "model");
    if (!model_p) return -1;

    /* Extract model type */
    const char *type_p = tok_find_key(model_p, "type");
    if (type_p) {
        char type_buf[64];
        tok_extract_string(type_p, type_buf, sizeof(type_buf));
        if (strcasecmp(type_buf, "BPE") == 0) {
            strcpy(td->model_type, "llama");
        } else {
            strncpy(td->model_type, type_buf, sizeof(td->model_type) - 1);
        }
    }

    /* Find "vocab": { */
    const char *vocab_p = tok_find_key(model_p, "vocab");
    if (!vocab_p || *vocab_p != '{') return -1;
    vocab_p++;  /* skip '{' */

    /* Parse each "token_string": id pair */
    char token_buf[TOK_MAX_TOKEN_LEN];
    int max_id = -1;

    /* First pass: count entries and find max ID */
    const char *scan = vocab_p;
    int count = 0;
    while (*scan && *scan != '}') {
        scan = tok_skip_ws(scan);
        if (*scan == ',') { scan++; continue; }
        if (*scan != '"') break;

        /* Skip key */
        char dummy[TOK_MAX_TOKEN_LEN];
        scan = tok_extract_string(scan, dummy, sizeof(dummy));
        if (!scan) break;
        scan = tok_skip_ws(scan);
        if (*scan == ':') scan++;
        scan = tok_skip_ws(scan);

        /* Read value (integer) */
        int id = (int)strtol(scan, (char **)&scan, 10);
        if (id > max_id) max_id = id;
        count++;
    }

    if (count == 0 || max_id < 0) return -1;

    td->vocab_size = max_id + 1;

    /* Allocate arrays */
    td->tokens = (char **)calloc(td->vocab_size, sizeof(char *));
    td->scores = (float *)calloc(td->vocab_size, sizeof(float));
    td->token_types = (int32_t *)calloc(td->vocab_size, sizeof(int32_t));

    /* Initialize with defaults */
    for (int i = 0; i < td->vocab_size; i++) {
        td->tokens[i] = strdup("");
        td->scores[i] = 0.0f;
        td->token_types[i] = TOK_TYPE_NORMAL;
    }

    /* Second pass: fill in tokens */
    scan = vocab_p;
    while (*scan && *scan != '}') {
        scan = tok_skip_ws(scan);
        if (*scan == ',') { scan++; continue; }
        if (*scan != '"') break;

        scan = tok_extract_string(scan, token_buf, sizeof(token_buf));
        if (!scan) break;
        scan = tok_skip_ws(scan);
        if (*scan == ':') scan++;
        scan = tok_skip_ws(scan);

        int id = (int)strtol(scan, (char **)&scan, 10);

        if (id >= 0 && id < td->vocab_size) {
            free(td->tokens[id]);
            td->tokens[id] = strdup(token_buf);
            /* Score = negative index for BPE ordering (higher ID = lower priority) */
            td->scores[id] = -(float)id;
        }
    }

    return 0;
}

/* ═══════════════════════════════════════════════════════════════════
 * MERGES PARSER — Extract "model": { "merges": [ ... ] }
 * ═══════════════════════════════════════════════════════════════════ */

static int tok_parse_merges(const char *json, TokenizerData *td)
{
    const char *model_p = tok_find_key(json, "model");
    if (!model_p) return -1;

    const char *merges_p = tok_find_key(model_p, "merges");
    if (!merges_p || *merges_p != '[') return -1;
    merges_p++;  /* skip '[' */

    /* Allocate with growth pattern — start with 64k slots */
    int capacity = 65536;
    td->merges = (char **)calloc(capacity, sizeof(char *));
    td->n_merges = 0;

    /* Extract merge strings */
    const char *scan = merges_p;
    char merge_buf[TOK_MAX_TOKEN_LEN * 2];
    while (*scan && *scan != ']' && td->n_merges < TOK_MAX_MERGES) {
        scan = tok_skip_ws(scan);
        if (*scan == ',') { scan++; continue; }
        if (*scan != '"') { scan++; continue; }

        scan = tok_extract_string(scan, merge_buf, sizeof(merge_buf));
        if (!scan) break;

        /* Grow if needed */
        if (td->n_merges >= capacity) {
            capacity *= 2;
            td->merges = (char **)realloc(td->merges, capacity * sizeof(char *));
        }

        td->merges[td->n_merges] = strdup(merge_buf);
        td->n_merges++;
    }

    return 0;
}

/* ═══════════════════════════════════════════════════════════════════
 * SPECIAL TOKENS — Extract from "added_tokens" array
 * ═══════════════════════════════════════════════════════════════════ */

static void tok_parse_added_tokens(const char *json, TokenizerData *td)
{
    const char *added_p = tok_find_key(json, "added_tokens");
    if (!added_p || *added_p != '[') return;
    added_p++;

    /* Scan through the array of objects */
    while (*added_p && *added_p != ']') {
        added_p = tok_skip_ws(added_p);
        if (*added_p == ',') { added_p++; continue; }
        if (*added_p != '{') { added_p++; continue; }

        /* Find end of this object */
        const char *obj_start = added_p;
        int depth = 1;
        added_p++;
        while (*added_p && depth > 0) {
            if (*added_p == '{') depth++;
            if (*added_p == '}') depth--;
            added_p++;
        }

        /* Extract content and id from this object */
        char content[TOK_MAX_TOKEN_LEN] = "";
        int id = -1;
        int is_special = 0;

        const char *id_p = tok_find_key(obj_start, "id");
        if (id_p) id = (int)strtol(id_p, NULL, 10);

        const char *content_p = tok_find_key(obj_start, "content");
        if (content_p && *content_p == '"')
            tok_extract_string(content_p, content, sizeof(content));

        const char *special_p = tok_find_key(obj_start, "special");
        if (special_p) {
            is_special = (strncmp(special_p, "true", 4) == 0);
        }

        /* Mark special tokens */
        if (id >= 0 && id < td->vocab_size) {
            if (is_special) {
                td->token_types[id] = TOK_TYPE_CONTROL;
            }
            /* Update token string if needed */
            if (content[0] && (!td->tokens[id] || !td->tokens[id][0])) {
                free(td->tokens[id]);
                td->tokens[id] = strdup(content);
            }
        }
    }
}

/* ═══════════════════════════════════════════════════════════════════
 * SPECIAL TOKEN IDs — Extract from tokenizer_config.json
 * ═══════════════════════════════════════════════════════════════════ */

static void tok_parse_config(const char *config_json, TokenizerData *td)
{
    /* Look for bos_token, eos_token, unk_token content strings */
    /* Then find their IDs in the vocab */

    /* Search for token content in the config */
    struct { const char *key; int32_t *id_ptr; const char *default_content; } specials[] = {
        {"bos_token", &td->bos_id, "<s>"},
        {"eos_token", &td->eos_id, "</s>"},
        {"unk_token", &td->unk_id, "<unk>"},
        {NULL, NULL, NULL}
    };

    for (int s = 0; specials[s].key; s++) {
        const char *p = tok_find_key(config_json, specials[s].key);
        if (!p) {
            /* Try to find in vocab by default content */
            for (int i = 0; i < td->vocab_size; i++) {
                if (td->tokens[i] && strcmp(td->tokens[i], specials[s].default_content) == 0) {
                    *specials[s].id_ptr = i;
                    break;
                }
            }
            continue;
        }

        /* The value might be a string directly or an object with "content" */
        if (*p == '"') {
            char content[TOK_MAX_TOKEN_LEN];
            tok_extract_string(p, content, sizeof(content));
            /* Find this content in vocab */
            for (int i = 0; i < td->vocab_size; i++) {
                if (td->tokens[i] && strcmp(td->tokens[i], content) == 0) {
                    *specials[s].id_ptr = i;
                    break;
                }
            }
        } else if (*p == '{') {
            /* Object with "content" field */
            const char *cp = tok_find_key(p, "content");
            if (cp && *cp == '"') {
                char content[TOK_MAX_TOKEN_LEN];
                tok_extract_string(cp, content, sizeof(content));
                for (int i = 0; i < td->vocab_size; i++) {
                    if (td->tokens[i] && strcmp(td->tokens[i], content) == 0) {
                        *specials[s].id_ptr = i;
                        break;
                    }
                }
            }
        }
    }
}

/* ═══════════════════════════════════════════════════════════════════
 * MAIN API — Load tokenizer from directory
 * ═══════════════════════════════════════════════════════════════════ */

static char *tok_read_file(const char *path)
{
    FILE *f = fopen(path, "rb");
    if (!f) return NULL;

    fseek(f, 0, SEEK_END);
    long size = ftell(f);
    fseek(f, 0, SEEK_SET);

    char *buf = (char *)malloc(size + 1);
    if (!buf) { fclose(f); return NULL; }

    fread(buf, 1, size, f);
    buf[size] = '\0';
    fclose(f);
    return buf;
}

static TokenizerData *tok_load(const char *tokenizer_json_path,
                                const char *config_json_path)
{
    TokenizerData *td = (TokenizerData *)calloc(1, sizeof(TokenizerData));
    if (!td) return NULL;

    td->bos_id = 1;
    td->eos_id = 2;
    td->unk_id = 0;
    td->pad_id = -1;
    strcpy(td->model_type, "llama");

    /* Read tokenizer.json */
    char *json = tok_read_file(tokenizer_json_path);
    if (!json) {
        fprintf(stderr, "  WARNING: Could not read '%s'\n", tokenizer_json_path);
        free(td);
        return NULL;
    }

    /* Parse vocab */
    if (tok_parse_vocab(json, td) != 0) {
        fprintf(stderr, "  WARNING: Failed to parse vocab from tokenizer.json\n");
        free(json);
        free(td);
        return NULL;
    }

    /* Parse merges */
    tok_parse_merges(json, td);

    /* Parse added tokens (special tokens) */
    tok_parse_added_tokens(json, td);

    /* Detect byte tokens: <0x00> through <0xFF> */
    for (int i = 0; i < td->vocab_size; i++) {
        if (td->tokens[i] && td->tokens[i][0] == '<' &&
            td->tokens[i][1] == '0' && td->tokens[i][2] == 'x' &&
            strlen(td->tokens[i]) == 6 && td->tokens[i][5] == '>') {
            td->token_types[i] = TOK_TYPE_BYTE;
        }
    }

    free(json);

    /* Read config if available */
    if (config_json_path) {
        char *config = tok_read_file(config_json_path);
        if (config) {
            tok_parse_config(config, td);
            free(config);
        }
    }

    return td;
}

static void tok_free(TokenizerData *td)
{
    if (!td) return;
    if (td->tokens) {
        for (int i = 0; i < td->vocab_size; i++)
            free(td->tokens[i]);
        free(td->tokens);
    }
    if (td->merges) {
        for (int i = 0; i < td->n_merges; i++)
            free(td->merges[i]);
        free(td->merges);
    }
    free(td->scores);
    free(td->token_types);
    free(td);
}

/* Print summary */
static void tok_print_summary(const TokenizerData *td)
{
    printf("  ╔═══════════════════════════════════════════════════════════════╗\n");
    printf("  ║  Tokenizer                                                  ║\n");
    printf("  ╠═══════════════════════════════════════════════════════════════╣\n");
    printf("  ║  Model:            %-40s ║\n", td->model_type);
    printf("  ║  Vocab size:       %-40d ║\n", td->vocab_size);
    printf("  ║  Merges:           %-40d ║\n", td->n_merges);
    printf("  ║  BOS token:        %-3d  %-36s ║\n", td->bos_id,
           (td->bos_id >= 0 && td->bos_id < td->vocab_size) ? td->tokens[td->bos_id] : "");
    printf("  ║  EOS token:        %-3d  %-36s ║\n", td->eos_id,
           (td->eos_id >= 0 && td->eos_id < td->vocab_size) ? td->tokens[td->eos_id] : "");
    printf("  ║  UNK token:        %-3d  %-36s ║\n", td->unk_id,
           (td->unk_id >= 0 && td->unk_id < td->vocab_size) ? td->tokens[td->unk_id] : "");
    printf("  ╚═══════════════════════════════════════════════════════════════╝\n\n");
}

#endif /* TOKENIZER_READER_H */