model_name = "deepseek-coder-6.7b-instruct"
cmd_to_install = "unknown"  # e.g. "`pip install -r request_llms/requirements_qwen.txt`"

import os
from toolbox import ProxyNetworkActivate
from toolbox import get_conf
from .local_llm_class import LocalLLMHandle, get_local_llm_predict_fns
from threading import Thread
import torch

def download_huggingface_model(model_name, max_retry, local_dir):
    from huggingface_hub import snapshot_download
    for i in range(1, max_retry + 1):
        try:
            snapshot_download(repo_id=model_name, local_dir=local_dir, resume_download=True)
            break
        except Exception as e:
            print(f'\n\nDownload failed ({e}), retrying ({i}/{max_retry})...\n\n')
    else:
        # every attempt failed; do not pretend the model is available locally
        raise RuntimeError(f'Failed to download {model_name} after {max_retry} attempts.')
    return local_dir
# ------------------------------------------------------------------------------------------------------------------------
# ๐Ÿ”Œ๐Ÿ’ป Local Model
# ------------------------------------------------------------------------------------------------------------------------
class GetCoderLMHandle(LocalLLMHandle):

    def load_model_info(self):
        # ๐Ÿƒโ€โ™‚๏ธ๐Ÿƒโ€โ™‚๏ธ๐Ÿƒโ€โ™‚๏ธ ๅญ่ฟ›็จ‹ๆ‰ง่กŒ
        self.model_name = model_name
        self.cmd_to_install = cmd_to_install

    def load_model_and_tokenizer(self):
        # ๐Ÿƒโ€โ™‚๏ธ๐Ÿƒโ€โ™‚๏ธ๐Ÿƒโ€โ™‚๏ธ ๅญ่ฟ›็จ‹ๆ‰ง่กŒ
        with ProxyNetworkActivate('Download_LLM'):
            from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
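            # use the full Hugging Face repo id here (the module-level model_name is the shorter display name)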
            model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"
            # local_dir = f"~/.cache/{model_name}"
            # if not os.path.exists(local_dir):
            #     tokenizer = download_huggingface_model(model_name, max_retry=128, local_dir=local_dir)
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            self._streamer = TextIteratorStreamer(tokenizer)
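            # place every sub-module on GPU 0 (keys cover both "transformer.*"- and "model.*"-style module names)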
            device_map = {
                "transformer.word_embeddings": 0,
                "transformer.word_embeddings_layernorm": 0,
                "lm_head": 0,
                "transformer.h": 0,
                "transformer.ln_f": 0,
                "model.embed_tokens": 0,
                "model.layers": 0,
                "model.norm": 0,
            }

            # ๆฃ€ๆŸฅ้‡ๅŒ–้…็ฝฎ
            quantization_type = get_conf('LOCAL_MODEL_QUANT')

            if get_conf('LOCAL_MODEL_DEVICE') != 'cpu':
                if quantization_type == "INT8":
                    from transformers import BitsAndBytesConfig
                    # ไฝฟ็”จ INT8 ้‡ๅŒ–
                    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, load_in_8bit=True,
                                                                 device_map=device_map)
                elif quantization_type == "INT4":
                    from transformers import BitsAndBytesConfig
                    # ไฝฟ็”จ INT4 ้‡ๅŒ–
                    bnb_config = BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type="nf4",
                        bnb_4bit_compute_dtype=torch.bfloat16
                    )
                    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,
                                                                 quantization_config=bnb_config, device_map=device_map)
                else:
                    # default path: load in bfloat16
                    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,
                                                                 torch_dtype=torch.bfloat16, device_map=device_map)
            else:
                # CPU mode
                model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True,
                                                             torch_dtype=torch.bfloat16)

        return model, tokenizer

    def llm_stream_generator(self, **kwargs):
        # ๐Ÿƒโ€โ™‚๏ธ๐Ÿƒโ€โ™‚๏ธ๐Ÿƒโ€โ™‚๏ธ ๅญ่ฟ›็จ‹ๆ‰ง่กŒ
        def adaptor(kwargs):
            query = kwargs['query']
            max_length = kwargs['max_length']
            top_p = kwargs['top_p']
            temperature = kwargs['temperature']
            history = kwargs['history']
            return query, max_length, top_p, temperature, history
        
        query, max_length, top_p, temperature, history = adaptor(kwargs)
        history.append({ 'role': 'user', 'content': query})
        messages = history
        inputs = self._tokenizer.apply_chat_template(messages, return_tensors="pt")
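        # left-truncate so only the most recent max_length tokens of the prompt are kept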
        if inputs.shape[1] > max_length:
            inputs = inputs[:, -max_length:]
        inputs = inputs.to(self._model.device)
        generation_kwargs = dict(
            inputs=inputs,
            max_new_tokens=max_length,
            do_sample=False,  # greedy decoding; transformers ignores top_p/top_k/temperature when sampling is off
            top_p=top_p,
            streamer=self._streamer,
            top_k=50,
            temperature=temperature,
            num_return_sequences=1,
            eos_token_id=32021,  # id of deepseek-coder's "<|EOT|>" end-of-turn token
        )
        thread = Thread(target=self._model.generate, kwargs=generation_kwargs, daemon=True)
        thread.start()
        generated_text = ""
        for new_text in self._streamer:
            generated_text += new_text
            # print(generated_text)
            yield generated_text


    def try_to_import_special_deps(self, **kwargs):
        # import something that will raise error if the user does not install requirement_*.txt
        # ๐Ÿƒโ€โ™‚๏ธ๐Ÿƒโ€โ™‚๏ธ๐Ÿƒโ€โ™‚๏ธ runs in the main process
        # import importlib
        # importlib.import_module('modelscope')
        pass


# ------------------------------------------------------------------------------------------------------------------------
# ๐Ÿ”Œ๐Ÿ’ป GPT-Academic Interface
# ------------------------------------------------------------------------------------------------------------------------
predict_no_ui_long_connection, predict = get_local_llm_predict_fns(GetCoderLMHandle, model_name, history_format='chatglm3')
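

# ------------------------------------------------------------------------------------------------------------------------
# ๐Ÿ”Œ๐Ÿ’ป Standalone streaming sketch (hypothetical, not part of the GPT-Academic interface)
# ------------------------------------------------------------------------------------------------------------------------
if __name__ == "__main__":
    # A minimal sketch of the same streaming pattern used in llm_stream_generator above:
    # a TextIteratorStreamer consumed from a background generation thread. "gpt2" is used here
    # only so the sketch can run without downloading the 6.7B checkpoint; model choice, prompt
    # and token budget are assumptions for illustration.
    from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
    demo_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    demo_model = AutoModelForCausalLM.from_pretrained("gpt2")
    demo_streamer = TextIteratorStreamer(demo_tokenizer, skip_prompt=True)
    demo_inputs = demo_tokenizer("def quicksort(arr):", return_tensors="pt")
    demo_thread = Thread(
        target=demo_model.generate,
        kwargs=dict(**demo_inputs, max_new_tokens=32, do_sample=False, streamer=demo_streamer),
        daemon=True,
    )
    demo_thread.start()
    # the streamer yields decoded text chunks as the background thread produces tokens
    for piece in demo_streamer:
        print(piece, end="", flush=True)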