# Source: self-chat/models/cpp_qwen2.py
# Hugging Face file-viewer header: author "xu song", commit "update" (4e4c514), size 5.6 kB
"""
Qwen2 chat simulator backed by llama-cpp-python (GGUF model + Hugging Face tokenizer).

## related examples
- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
- https://github.com/awinml/llama-cpp-python-bindings

## converting a Hugging Face checkpoint to GGUF
python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat
python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/

## running the converted model with llama-cli
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv

## reference
- https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
"""
import json
import copy
import os
from models.base_model import Simulator
import llama_cpp
# import llama_cpp.llama_tokenizer
from transformers import AutoTokenizer
from utils.logging_util import logger
import config
class Qwen2Simulator(Simulator):
    """Chat simulator that runs a Qwen2-0.5B-Instruct GGUF model via llama-cpp-python.

    Depending on the role of the last message in the history, the model is
    prompted to speak next either as the assistant or as the user.
    """

    def __init__(self):
        """Load the tokenizer and the GGUF model and set the sampling defaults.

        A local copy of the model is used when it exists on disk; otherwise the
        tokenizer and the GGUF file are fetched from the Hugging Face Hub.
        """
        local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf"
        use_local = os.path.exists(local_path)

        if use_local:
            self.hf_tokenizer = AutoTokenizer.from_pretrained(
                "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
        else:
            self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

        # Keyword arguments shared by both loading paths.
        llama_kwargs = dict(
            # The default llama.cpp tokenizer is buggy here: it produces
            # different token ids, so the HF tokenizer is plugged in instead.
            tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
            n_ctx=config.MAX_SEQUENCE_LENGTH,
            # n_threads=None,  # by default n_threads is derived from the CPU count
            # use_mlock=True,
            verbose=True,
        )
        if use_local:
            self.llm = llama_cpp.Llama(model_path=local_path, **llama_kwargs)
        else:
            self.llm = llama_cpp.Llama.from_pretrained(
                repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                filename="*fp16.gguf",
                **llama_kwargs,
            )

        logger.info(f"llm has been initialized: {self.llm}, "
                    f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}, "
                    f"env[CACHE]={os.environ.get('CACHE', None)}")

        self.generation_kwargs = dict(
            temperature=config.DEFAULT_TEMPERATURE,
            top_p=config.DEFAULT_TOP_P,
            top_k=config.DEFAULT_TOP_K,
            max_tokens=config.DEFAULT_MAX_TOKENS,
            repeat_penalty=1.1,
            # qwen2-0.5b-chat sometimes finishes a turn without <|im_end|> and
            # goes straight on to <|im_start|>, so both are stop sequences.
            stop=[
                "<|im_end|>",
                "<|im_start|>",
                "<|endoftext|>",
            ],
        )

    def tokenize(self, text):
        """Return the token ids of ``text`` as produced by the loaded model's tokenizer."""
        return self.llm.tokenize(text.encode("utf-8"))

    def generate(self, history, stream=True):
        """Build the prompt for ``history`` and generate the next turn.

        Args:
            history: non-empty list of message dicts with ``"role"`` and
                ``"content"`` keys. A ``"tokens"`` entry (token ids of the
                content) is reused when present and otherwise computed and
                stored back into the message as a cache.
            stream: when true, return the generator from ``_stream_generate``;
                otherwise delegate to ``self._generate``.

        Returns:
            Whatever ``_stream_generate`` / ``_generate`` returns.

        Raises:
            ValueError: if ``history`` is empty or the last message has a role
                other than ``user``, ``assistant`` or ``system``.
        """
        if not history:
            raise ValueError("history must contain at least one message")

        # Pick who speaks next: the assistant answers a user turn, while the
        # simulated user follows an assistant or system turn.
        last_role = history[-1]["role"]
        if last_role == "user":
            next_role = "assistant"
        elif last_role in ("assistant", "system"):
            next_role = "user"
        else:
            # Previously this fell through and crashed later with an
            # UnboundLocalError on start_tokens.
            raise ValueError(f"unsupported role in last message: {last_role!r}")
        start_tokens = self.tokenize(f"<|im_start|>{next_role}\n")

        end_of_turn = self.tokenize("<|im_end|>\n")  # loop-invariant
        input_ids = []
        for message in history:
            if "tokens" not in message:
                message["tokens"] = self.tokenize(message["content"])
            input_ids.extend(self.tokenize(f"<|im_start|>{message['role']}\n"))
            input_ids.extend(message["tokens"])
            input_ids.extend(end_of_turn)
        input_ids.extend(start_tokens)

        if stream:
            return self._stream_generate(input_ids)
        # NOTE(review): _generate is not defined in this class; presumably it is
        # provided by Simulator -- confirm.
        return self._generate(input_ids)

    def _stream_generate(self, input_ids):
        """Stream a completion for ``input_ids``.

        Yields:
            ``(completion_text, completion_tokens)`` taken from the first choice
            of every chunk whose ``finish_reason`` is ``None``. The finishing
            chunk is printed instead of yielded.
        """
        logger.info(f"generation_kwargs {self.generation_kwargs}")

        # self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx
        output = self.llm.create_completion(
            input_ids,
            stream=True,
            **self.generation_kwargs
        )
        # TODO: check the finish reason; if it is "length", shift the context
        #       and continue generating.
        # TODO: return token ids.
        # NOTE(review): the "completion_text"/"completion_tokens" keys are
        # presumably provided by a patched llama-cpp-python build -- confirm.
        for out in output:
            # Work on a copy so the yielded values are detached from the chunk.
            choice = copy.deepcopy(out)["choices"][0]
            if choice["finish_reason"] is None:
                yield choice["completion_text"], choice["completion_tokens"]
            else:
                print(f'finish with text: {choice["completion_text"]}, tokens: {choice["completion_tokens"]}')
# Module-level simulator instance shared by importers of this module.
bot = Qwen2Simulator()

if __name__ == "__main__":
    # Smoke test: start from a system prompt and let the model alternate
    # between the user and assistant roles for a few turns.
    messages = [{"role": "system", "content": "你是一个导游。"}]
    generated_tokens = None

    print("######## requesting", messages)
    for generated_text, generated_tokens in bot.generate(messages, stream=True):
        print(generated_text, generated_tokens)

    speakers = ("user", "assistant")
    for turn in range(3):
        # The last pair yielded by the previous stream becomes the next message
        # (presumably the stream yields cumulative text/tokens -- confirm).
        messages.append(
            {
                "role": speakers[turn % 2],
                "content": generated_text,
                "tokens": generated_tokens,
            }
        )
        print("######## requesting", messages)
        for generated_text, generated_tokens in bot.generate(messages, stream=True):
            # Drain the stream; only the final values are needed.
            pass
            # print(generated_text, all_tokens)