// llm_mutil_npu/src/tokenizer.cpp
// Tokenizer support for the C++ aclnn eager-mode inference of Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU.
#include "tokenizer.h"
#include <array>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <memory>
#include <sstream>
#include <unistd.h>
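// Expected vocab file layout (as read below): a uint32 token count, then for each
// token id one {uint32 byte_length, byte_length raw bytes} record, in host byte order.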
bool Tokenizer::load(const std::string& vocab_bin_path) {
    std::ifstream f(vocab_bin_path, std::ios::binary);
    if (!f) {
        fprintf(stderr, "Tokenizer: cannot open %s\n", vocab_bin_path.c_str());
        return false;
    }
    uint32_t num;
    f.read((char*)&num, sizeof(num));
    if (!f) return false;
    id_to_bytes_.resize(num);
    for (uint32_t i = 0; i < num; i++) {
        uint32_t len;
        f.read((char*)&len, sizeof(len));
        if (!f) return false;
        id_to_bytes_[i].resize(len);
        if (len > 0) {
            f.read(id_to_bytes_[i].data(), len);
            if (!f) return false;
        }
    }
    return true;
}
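// Map a single token id back to its raw byte string; out-of-range ids decode to "".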
std::string Tokenizer::decode(int id) const {
    if (id < 0 || (size_t)id >= id_to_bytes_.size()) return "";
    return id_to_bytes_[id];
}
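// Concatenate the byte strings of a full id sequence into one output string.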
std::string Tokenizer::decode(const std::vector<int>& ids) const {
    std::string out;
    for (int id : ids) out += decode(id);
    return out;
}
std::vector<int> Tokenizer::encode_via_python(const std::string& model_dir,
                                              const std::string& prompt,
                                              bool apply_chat_template) const {
    // Tokenize by calling a Python subprocess; the prompt is passed via stdin
    // (through a temp file) to avoid shell-escaping bugs.
    std::string cmd;
    // Set QWEN3_PYENV_INIT to override the Python env activation sequence
    // (e.g., "source /opt/my_env/activate && ").
    // The default assumes conda at ~/miniconda3 with an env named 'qwen3' and the Ascend toolkit installed.
    if (const char* init = std::getenv("QWEN3_PYENV_INIT")) {
        cmd += init;
    } else {
        cmd += "source ${HOME}/miniconda3/etc/profile.d/conda.sh 2>/dev/null && ";
        cmd += "conda activate qwen3 2>/dev/null || true; ";
        cmd += "source /usr/local/Ascend/ascend-toolkit/set_env.sh 2>/dev/null || true; ";
    }
    cmd += "python3 -c \"";
    cmd += "import sys;";
    cmd += "from transformers import AutoTokenizer;";
    cmd += "t = AutoTokenizer.from_pretrained('" + model_dir + "');";
    cmd += "p = sys.stdin.read();";
    if (apply_chat_template) {
        cmd += "msg = [{'role': 'user', 'content': p}];";
        cmd += "ids = t.apply_chat_template(msg, add_generation_prompt=True);";
    } else {
        cmd += "ids = t.encode(p);";
    }
    cmd += "print(' '.join(str(i) for i in ids));";
    cmd += "\"";
    // popen() gives only a one-way pipe, so feed the child's stdin by redirecting from a temp file.
    char tmpl[] = "/tmp/lca_prompt_XXXXXX";
    int fd = mkstemp(tmpl);
    if (fd < 0) { perror("mkstemp"); return {}; }
    if (write(fd, prompt.data(), prompt.size()) != (ssize_t)prompt.size()) {
        perror("write"); close(fd); unlink(tmpl); return {};
    }
    close(fd);
    std::string full = cmd + " < " + tmpl + " 2>/dev/null";
    FILE* pipe = popen(full.c_str(), "r");
    if (!pipe) { perror("popen"); unlink(tmpl); return {}; }
    std::string out;
    char buf[4096];
    while (size_t n = fread(buf, 1, sizeof(buf), pipe)) out.append(buf, n);
    pclose(pipe);
    unlink(tmpl);
    // The script prints the token ids as space-separated integers; parse them back.
    std::vector<int> ids;
    std::istringstream iss(out);
    int x;
    while (iss >> x) ids.push_back(x);
    return ids;
}
// Shell-quote a string for embedding in a JSON string (escape ", \, control chars).
static std::string json_escape(const std::string& s) {
std::string out;
out.reserve(s.size() + 8);
for (char c : s) {
switch (c) {
case '"': out += "\\\""; break;
case '\\': out += "\\\\"; break;
case '\n': out += "\\n"; break;
case '\r': out += "\\r"; break;
case '\t': out += "\\t"; break;
default:
if ((unsigned char)c < 0x20) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", (unsigned char)c);
out += buf;
} else {
out += c;
}
}
}
return out;
}
std::vector<int> Tokenizer::encode_conversation_via_python(
    const std::string& model_dir,
    const std::vector<std::pair<std::string, std::string>>& conversation,
    bool add_generation_prompt) const
{
    // Build a JSON array of {role, content} messages and pass it via stdin
    // to avoid shell-escaping issues.
    std::string json_msgs = "[";
    for (size_t i = 0; i < conversation.size(); i++) {
        if (i > 0) json_msgs += ",";
        json_msgs += "{\"role\":\"" + json_escape(conversation[i].first) + "\",";
        json_msgs += "\"content\":\"" + json_escape(conversation[i].second) + "\"}";
    }
    json_msgs += "]";
    std::string cmd;
    // Set QWEN3_PYENV_INIT to override the Python env activation sequence
    // (e.g., "source /opt/my_env/activate && ").
    // The default assumes conda at ~/miniconda3 with an env named 'qwen3' and the Ascend toolkit installed.
    if (const char* init = std::getenv("QWEN3_PYENV_INIT")) {
        cmd += init;
    } else {
        cmd += "source ${HOME}/miniconda3/etc/profile.d/conda.sh 2>/dev/null && ";
        cmd += "conda activate qwen3 2>/dev/null || true; ";
        cmd += "source /usr/local/Ascend/ascend-toolkit/set_env.sh 2>/dev/null || true; ";
    }
    cmd += "python3 -c \"";
    cmd += "import sys, json;";
    cmd += "from transformers import AutoTokenizer;";
    cmd += "t = AutoTokenizer.from_pretrained('" + model_dir + "');";
    cmd += "msgs = json.loads(sys.stdin.read());";
    cmd += "ids = t.apply_chat_template(msgs, add_generation_prompt=";
    cmd += add_generation_prompt ? "True" : "False";
    cmd += ");";
    cmd += "print(' '.join(str(i) for i in ids));";
    cmd += "\"";
    // As above: feed the child's stdin by redirecting from a temp file.
    char tmpl[] = "/tmp/lca_conv_XXXXXX";
    int fd = mkstemp(tmpl);
    if (fd < 0) { perror("mkstemp"); return {}; }
    if (write(fd, json_msgs.data(), json_msgs.size()) != (ssize_t)json_msgs.size()) {
        perror("write"); close(fd); unlink(tmpl); return {};
    }
    close(fd);
    std::string full = cmd + " < " + tmpl + " 2>/dev/null";
    FILE* pipe = popen(full.c_str(), "r");
    if (!pipe) { perror("popen"); unlink(tmpl); return {}; }
    std::string out;
    char buf[4096];
    while (size_t n = fread(buf, 1, sizeof(buf), pipe)) out.append(buf, n);
    pclose(pipe);
    unlink(tmpl);
    // Parse the space-separated token ids printed by the script.
    std::vector<int> ids;
    std::istringstream iss(out);
    int x;
    while (iss >> x) ids.push_back(x);
    return ids;
}
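// Illustrative call sequence (a sketch; the driver code and paths below are
// assumptions for documentation, not part of this repository):
//   Tokenizer tok;
//   tok.load("/path/to/vocab.bin");                        // hypothetical vocab path
//   std::vector<int> ids =
//       tok.encode_via_python("/path/to/model_dir", "Hello", /*apply_chat_template=*/true);
//   std::string text = tok.decode(ids);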