// llm_mutil_npu/src/tokenizer.cpp
// Tokenizer support for the C++ aclnn eager-mode inference of Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU.
#include "tokenizer.h"
#include <array>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <memory>
#include <sstream>
#include <unistd.h>
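// Expected vocab file layout (as read below): a uint32 token count, then for each
// token id one {uint32 byte_length, byte_length raw bytes} record, in host byte order.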
bool Tokenizer::load(const std::string& vocab_bin_path) {
    std::ifstream f(vocab_bin_path, std::ios::binary);
    if (!f) {
        fprintf(stderr, "Tokenizer: cannot open %s\n", vocab_bin_path.c_str());
        return false;
    }
    uint32_t num;
    f.read((char*)&num, sizeof(num));
    if (!f) return false;
    id_to_bytes_.resize(num);
    for (uint32_t i = 0; i < num; i++) {
        uint32_t len;
        f.read((char*)&len, sizeof(len));
        if (!f) return false;
        id_to_bytes_[i].resize(len);
        if (len > 0) {
            f.read(id_to_bytes_[i].data(), len);
            if (!f) return false;
        }
    }
    return true;
}
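// Map a single token id back to its raw byte string; out-of-range ids decode to "".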
std::string Tokenizer::decode(int id) const {
    if (id < 0 || (size_t)id >= id_to_bytes_.size()) return "";
    return id_to_bytes_[id];
}
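// Concatenate the byte strings of a full id sequence into one output string.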
std::string Tokenizer::decode(const std::vector<int>& ids) const {
    std::string out;
    for (int id : ids) out += decode(id);
    return out;
}
std::vector<int> Tokenizer::encode_via_python(const std::string& model_dir,
                                              const std::string& prompt,
                                              bool apply_chat_template) const {
    // Tokenize by calling a Python subprocess; the prompt is passed via stdin
    // (through a temp file) to avoid shell-escaping bugs.
    std::string cmd;
    // Set QWEN3_PYENV_INIT to override the Python env activation sequence
    // (e.g., "source /opt/my_env/activate && ").
    // The default assumes conda at ~/miniconda3 with an env named 'qwen3' and the Ascend toolkit installed.
    if (const char* init = std::getenv("QWEN3_PYENV_INIT")) {
        cmd += init;
    } else {
        cmd += "source ${HOME}/miniconda3/etc/profile.d/conda.sh 2>/dev/null && ";
        cmd += "conda activate qwen3 2>/dev/null || true; ";
        cmd += "source /usr/local/Ascend/ascend-toolkit/set_env.sh 2>/dev/null || true; ";
    }
    cmd += "python3 -c \"";
    cmd += "import sys;";
    cmd += "from transformers import AutoTokenizer;";
    cmd += "t = AutoTokenizer.from_pretrained('" + model_dir + "');";
    cmd += "p = sys.stdin.read();";
    if (apply_chat_template) {
        cmd += "msg = [{'role': 'user', 'content': p}];";
        cmd += "ids = t.apply_chat_template(msg, add_generation_prompt=True);";
    } else {
        cmd += "ids = t.encode(p);";
    }
    cmd += "print(' '.join(str(i) for i in ids));";
    cmd += "\"";
    // popen() gives only a one-way pipe, so feed the child's stdin by redirecting from a temp file.
    char tmpl[] = "/tmp/lca_prompt_XXXXXX";
    int fd = mkstemp(tmpl);
    if (fd < 0) { perror("mkstemp"); return {}; }
    if (write(fd, prompt.data(), prompt.size()) != (ssize_t)prompt.size()) {
        perror("write"); close(fd); unlink(tmpl); return {};
    }
    close(fd);
    std::string full = cmd + " < " + tmpl + " 2>/dev/null";
    FILE* pipe = popen(full.c_str(), "r");
    if (!pipe) { perror("popen"); unlink(tmpl); return {}; }
    std::string out;
    char buf[4096];
    while (size_t n = fread(buf, 1, sizeof(buf), pipe)) out.append(buf, n);
    pclose(pipe);
    unlink(tmpl);
    // The script prints the token ids as space-separated integers; parse them back.
    std::vector<int> ids;
    std::istringstream iss(out);
    int x;
    while (iss >> x) ids.push_back(x);
    return ids;
}
// Shell-quote a string for embedding in a JSON string (escape ", \, control chars).
static std::string json_escape(const std::string& s) {
std::string out;
out.reserve(s.size() + 8);
for (char c : s) {
switch (c) {
case '"': out += "\\\""; break;
case '\\': out += "\\\\"; break;
case '\n': out += "\\n"; break;
case '\r': out += "\\r"; break;
case '\t': out += "\\t"; break;
default:
if ((unsigned char)c < 0x20) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", (unsigned char)c);
out += buf;
} else {
out += c;
}
}
}
return out;
}
std::vector<int> Tokenizer::encode_conversation_via_python(
    const std::string& model_dir,
    const std::vector<std::pair<std::string, std::string>>& conversation,
    bool add_generation_prompt) const
{
    // Build a JSON array of {role, content} messages and pass it via stdin
    // to avoid shell-escaping issues.
    std::string json_msgs = "[";
    for (size_t i = 0; i < conversation.size(); i++) {
        if (i > 0) json_msgs += ",";
        json_msgs += "{\"role\":\"" + json_escape(conversation[i].first) + "\",";
        json_msgs += "\"content\":\"" + json_escape(conversation[i].second) + "\"}";
    }
    json_msgs += "]";
    std::string cmd;
    // Set QWEN3_PYENV_INIT to override the Python env activation sequence
    // (e.g., "source /opt/my_env/activate && ").
    // The default assumes conda at ~/miniconda3 with an env named 'qwen3' and the Ascend toolkit installed.
    if (const char* init = std::getenv("QWEN3_PYENV_INIT")) {
        cmd += init;
    } else {
        cmd += "source ${HOME}/miniconda3/etc/profile.d/conda.sh 2>/dev/null && ";
        cmd += "conda activate qwen3 2>/dev/null || true; ";
        cmd += "source /usr/local/Ascend/ascend-toolkit/set_env.sh 2>/dev/null || true; ";
    }
    cmd += "python3 -c \"";
    cmd += "import sys, json;";
    cmd += "from transformers import AutoTokenizer;";
    cmd += "t = AutoTokenizer.from_pretrained('" + model_dir + "');";
    cmd += "msgs = json.loads(sys.stdin.read());";
    cmd += "ids = t.apply_chat_template(msgs, add_generation_prompt=";
    cmd += add_generation_prompt ? "True" : "False";
    cmd += ");";
    cmd += "print(' '.join(str(i) for i in ids));";
    cmd += "\"";
    // As above: feed the child's stdin by redirecting from a temp file.
    char tmpl[] = "/tmp/lca_conv_XXXXXX";
    int fd = mkstemp(tmpl);
    if (fd < 0) { perror("mkstemp"); return {}; }
    if (write(fd, json_msgs.data(), json_msgs.size()) != (ssize_t)json_msgs.size()) {
        perror("write"); close(fd); unlink(tmpl); return {};
    }
    close(fd);
    std::string full = cmd + " < " + tmpl + " 2>/dev/null";
    FILE* pipe = popen(full.c_str(), "r");
    if (!pipe) { perror("popen"); unlink(tmpl); return {}; }
    std::string out;
    char buf[4096];
    while (size_t n = fread(buf, 1, sizeof(buf), pipe)) out.append(buf, n);
    pclose(pipe);
    unlink(tmpl);
    // Parse the space-separated token ids printed by the script.
    std::vector<int> ids;
    std::istringstream iss(out);
    int x;
    while (iss >> x) ids.push_back(x);
    return ids;
}
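// Illustrative call sequence (a sketch; the driver code and paths below are
// assumptions for documentation, not part of this repository):
//   Tokenizer tok;
//   tok.load("/path/to/vocab.bin");                        // hypothetical vocab path
//   std::vector<int> ids =
//       tok.encode_via_python("/path/to/model_dir", "Hello", /*apply_chat_template=*/true);
//   std::string text = tok.decode(ids);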