#include "tokenizer.h"

#include <unistd.h>

#include <array>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <memory>
#include <sstream>
|
|
| bool Tokenizer::load(const std::string& vocab_bin_path) { |
| std::ifstream f(vocab_bin_path, std::ios::binary); |
| if (!f) { |
| fprintf(stderr, "Tokenizer: cannot open %s\n", vocab_bin_path.c_str()); |
| return false; |
| } |
| uint32_t num; |
| f.read((char*)&num, 4); |
| if (!f) return false; |
| id_to_bytes_.resize(num); |
| for (uint32_t i = 0; i < num; i++) { |
| uint32_t len; |
| f.read((char*)&len, 4); |
| if (!f) return false; |
| id_to_bytes_[i].resize(len); |
| if (len > 0) f.read(id_to_bytes_[i].data(), len); |
| } |
| return true; |
| } |
|
|
| std::string Tokenizer::decode(int id) const { |
| if (id < 0 || (size_t)id >= id_to_bytes_.size()) return ""; |
| return id_to_bytes_[id]; |
| } |
|
|
| std::string Tokenizer::decode(const std::vector<int>& ids) const { |
| std::string out; |
| for (int id : ids) out += decode(id); |
| return out; |
| } |
|
|
| std::vector<int> Tokenizer::encode_via_python(const std::string& model_dir, |
| const std::string& prompt, |
| bool apply_chat_template) const { |
| |
| std::string cmd; |
| |
| |
| if (const char* init = std::getenv("QWEN3_PYENV_INIT")) { |
| cmd += init; |
| } else { |
| cmd += "source ${HOME}/miniconda3/etc/profile.d/conda.sh 2>/dev/null && "; |
| cmd += "conda activate qwen3 2>/dev/null || true; "; |
| cmd += "source /usr/local/Ascend/ascend-toolkit/set_env.sh 2>/dev/null || true; "; |
| } |
| cmd += "python3 -c \""; |
| cmd += "import sys, json;"; |
| cmd += "from transformers import AutoTokenizer;"; |
| cmd += "t = AutoTokenizer.from_pretrained('" + model_dir + "');"; |
| cmd += "p = sys.stdin.read();"; |
| if (apply_chat_template) { |
| cmd += "msg = [{'role': 'user', 'content': p}];"; |
| cmd += "ids = t.apply_chat_template(msg, add_generation_prompt=True);"; |
| } else { |
| cmd += "ids = t.encode(p);"; |
| } |
| cmd += "print(' '.join(str(i) for i in ids));"; |
| cmd += "\""; |
|
|
| |
| char tmpl[] = "/tmp/lca_prompt_XXXXXX"; |
| int fd = mkstemp(tmpl); |
| if (fd < 0) { perror("mkstemp"); return {}; } |
| write(fd, prompt.data(), prompt.size()); |
| close(fd); |
|
|
| std::string full = cmd + " < " + tmpl + " 2>/dev/null"; |
| FILE* pipe = popen(full.c_str(), "r"); |
| if (!pipe) { perror("popen"); unlink(tmpl); return {}; } |
|
|
| std::string out; |
| char buf[4096]; |
| while (size_t n = fread(buf, 1, sizeof(buf), pipe)) out.append(buf, n); |
| pclose(pipe); |
| unlink(tmpl); |
|
|
| std::vector<int> ids; |
| std::istringstream iss(out); |
| int x; |
| while (iss >> x) ids.push_back(x); |
| return ids; |
| } |
|
|
| |
// Escape a string for embedding inside a double-quoted JSON literal.
// Escapes quote, backslash, and the common whitespace controls; any other
// control byte below 0x20 is emitted as a \u00XX sequence. All other bytes
// (including UTF-8 continuation bytes) pass through untouched.
static std::string json_escape(const std::string& s) {
    std::string escaped;
    escaped.reserve(s.size() + 8);
    for (size_t i = 0; i < s.size(); ++i) {
        const unsigned char uc = (unsigned char)s[i];
        if (uc == '"') {
            escaped += "\\\"";
        } else if (uc == '\\') {
            escaped += "\\\\";
        } else if (uc == '\n') {
            escaped += "\\n";
        } else if (uc == '\r') {
            escaped += "\\r";
        } else if (uc == '\t') {
            escaped += "\\t";
        } else if (uc < 0x20) {
            // Remaining control characters need the generic \u form.
            char hexbuf[8];
            snprintf(hexbuf, sizeof(hexbuf), "\\u%04x", uc);
            escaped += hexbuf;
        } else {
            escaped += (char)uc;
        }
    }
    return escaped;
}
|
|
| std::vector<int> Tokenizer::encode_conversation_via_python( |
| const std::string& model_dir, |
| const std::vector<std::pair<std::string, std::string>>& conversation, |
| bool add_generation_prompt) const |
| { |
| |
| std::string json_msgs = "["; |
| for (size_t i = 0; i < conversation.size(); i++) { |
| if (i > 0) json_msgs += ","; |
| json_msgs += "{\"role\":\"" + json_escape(conversation[i].first) + "\","; |
| json_msgs += "\"content\":\"" + json_escape(conversation[i].second) + "\"}"; |
| } |
| json_msgs += "]"; |
|
|
| std::string cmd; |
| |
| |
| if (const char* init = std::getenv("QWEN3_PYENV_INIT")) { |
| cmd += init; |
| } else { |
| cmd += "source ${HOME}/miniconda3/etc/profile.d/conda.sh 2>/dev/null && "; |
| cmd += "conda activate qwen3 2>/dev/null || true; "; |
| cmd += "source /usr/local/Ascend/ascend-toolkit/set_env.sh 2>/dev/null || true; "; |
| } |
| cmd += "python3 -c \""; |
| cmd += "import sys, json;"; |
| cmd += "from transformers import AutoTokenizer;"; |
| cmd += "t = AutoTokenizer.from_pretrained('" + model_dir + "');"; |
| cmd += "msgs = json.loads(sys.stdin.read());"; |
| cmd += "ids = t.apply_chat_template(msgs, add_generation_prompt="; |
| cmd += add_generation_prompt ? "True" : "False"; |
| cmd += ");"; |
| cmd += "print(' '.join(str(i) for i in ids));"; |
| cmd += "\""; |
|
|
| char tmpl[] = "/tmp/lca_conv_XXXXXX"; |
| int fd = mkstemp(tmpl); |
| if (fd < 0) { perror("mkstemp"); return {}; } |
| write(fd, json_msgs.data(), json_msgs.size()); |
| close(fd); |
|
|
| std::string full = cmd + " < " + tmpl + " 2>/dev/null"; |
| FILE* pipe = popen(full.c_str(), "r"); |
| if (!pipe) { perror("popen"); unlink(tmpl); return {}; } |
|
|
| std::string out; |
| char buf[4096]; |
| while (size_t n = fread(buf, 1, sizeof(buf), pipe)) out.append(buf, n); |
| pclose(pipe); |
| unlink(tmpl); |
|
|
| std::vector<int> ids; |
| std::istringstream iss(out); |
| int x; |
| while (iss >> x) ids.push_back(x); |
| return ids; |
| } |
|
|