Spaces:
Running
Running
File size: 6,333 Bytes
15f14f5 8dd4d48 15f14f5 8dd4d48 15f14f5 8dd4d48 15f14f5 8dd4d48 15f14f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
model_name = "deepseek-coder-6.7b-instruct"
cmd_to_install = "ๆช็ฅ" # "`pip install -r request_llms/requirements_qwen.txt`"
import os
from toolbox import ProxyNetworkActivate
from toolbox import get_conf
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
from threading import Thread
import torch
def download_huggingface_model(model_name, max_retry, local_dir):
from huggingface_hub import snapshot_download
for i in range(1, max_retry):
try:
snapshot_download(repo_id=model_name, local_dir=local_dir, resume_download=True)
break
except Exception as e:
print(f'\n\nไธ่ฝฝๅคฑ่ดฅ๏ผ้่ฏ็ฌฌ{i}ๆฌกไธญ...\n\n')
return local_dir
# ------------------------------------------------------------------------------------------------------------------------
# ๐๐ป Local Model
# ------------------------------------------------------------------------------------------------------------------------
class GetCoderLMHandle(LocalLLMHandle):
def load_model_info(self):
# ๐โโ๏ธ๐โโ๏ธ๐โโ๏ธ ๅญ่ฟ็จๆง่ก
self.model_name = model_name
self.cmd_to_install = cmd_to_install
def load_model_and_tokenizer(self):
# ๐โโ๏ธ๐โโ๏ธ๐โโ๏ธ ๅญ่ฟ็จๆง่ก
with ProxyNetworkActivate('Download_LLM'):
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"
# local_dir = f"~/.cache/{model_name}"
# if not os.path.exists(local_dir):
# tokenizer = download_huggingface_model(model_name, max_retry=128, local_dir=local_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
self._streamer = TextIteratorStreamer(tokenizer)
device_map = {
"transformer.word_embeddings": 0,
"transformer.word_embeddings_layernorm": 0,
"lm_head": 0,
"transformer.h": 0,
"transformer.ln_f": 0,
"model.embed_tokens": 0,
"model.layers": 0,
"model.norm": 0,
}
# ๆฃๆฅ้ๅ้
็ฝฎ
quantization_type = get_conf('LOCAL_MODEL_QUANT')
if get_conf('LOCAL_MODEL_DEVICE') != 'cpu':
if quantization_type == "INT8":
from transformers import BitsAndBytesConfig
# ไฝฟ็จ INT8 ้ๅ
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, load_in_8bit=True,
device_map=device_map)
elif quantization_type == "INT4":
from transformers import BitsAndBytesConfig
# ไฝฟ็จ INT4 ้ๅ
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,
quantization_config=bnb_config, device_map=device_map)
else:
# ไฝฟ็จ้ป่ฎค็ FP16
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,
torch_dtype=torch.bfloat16, device_map=device_map)
else:
# CPU ๆจกๅผ
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,
torch_dtype=torch.bfloat16)
return model, tokenizer
def llm_stream_generator(self, **kwargs):
# ๐โโ๏ธ๐โโ๏ธ๐โโ๏ธ ๅญ่ฟ็จๆง่ก
def adaptor(kwargs):
query = kwargs['query']
max_length = kwargs['max_length']
top_p = kwargs['top_p']
temperature = kwargs['temperature']
history = kwargs['history']
return query, max_length, top_p, temperature, history
query, max_length, top_p, temperature, history = adaptor(kwargs)
history.append({ 'role': 'user', 'content': query})
messages = history
inputs = self._tokenizer.apply_chat_template(messages, return_tensors="pt")
if inputs.shape[1] > max_length:
inputs = inputs[:, -max_length:]
inputs = inputs.to(self._model.device)
generation_kwargs = dict(
inputs=inputs,
max_new_tokens=max_length,
do_sample=False,
top_p=top_p,
streamer = self._streamer,
top_k=50,
temperature=temperature,
num_return_sequences=1,
eos_token_id=32021,
)
thread = Thread(target=self._model.generate, kwargs=generation_kwargs, daemon=True)
thread.start()
generated_text = ""
for new_text in self._streamer:
generated_text += new_text
# print(generated_text)
yield generated_text
def try_to_import_special_deps(self, **kwargs): pass
# import something that will raise error if the user does not install requirement_*.txt
# ๐โโ๏ธ๐โโ๏ธ๐โโ๏ธ ไธป่ฟ็จๆง่ก
# import importlib
# importlib.import_module('modelscope')
# ------------------------------------------------------------------------------------------------------------------------
# ๐๐ป GPT-Academic Interface
# ------------------------------------------------------------------------------------------------------------------------
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetCoderLMHandle, model_name, history_format='chatglm3') |