# Gradio chat demo that serves DeepSeek-Coder-V2-Lite-Instruct on a Hugging Face Space.

import os
import subprocess
import sys

# `spaces` stays the first third-party import, exactly as in the original
# ordering. NOTE(review): ZeroGPU is understood to need it imported before any
# CUDA-touching package (torch / transformers) -- confirm before reordering.
import spaces
import gradio as gr
import torch
from huggingface_hub import InferenceClient
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

# Prompt substituted when the user submits an empty message; it matches the
# placeholder text shown in the chat textbox.
DEFAULT_PROMPT = "write a quick sort algorithm in python."

# Empty the ZeroGPU offload directory before loading the model. The shell is
# required here so that the `*` glob is expanded.
subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)

# Install flash-attn into the interpreter that is running this script.
# The override is merged into the current environment instead of replacing it,
# so PATH/HOME and friends survive, and an argument list is used so no shell
# parsing is involved.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
)

# Extra keyword arguments forwarded to `from_pretrained`; empty unless one of
# the quantization configs below is enabled.
kwargs = {}

# https://hugging-face.cn/docs/transformers/quantization/bitsandbytes

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     # llm_int8_enable_fp32_cpu_offload=True,
# )

# kwargs = { "quantization_config": quantization_config, "low_cpu_mem_usage": True }

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    **kwargs,
).cuda()


def _build_messages(message, history, system_message):
    """Return the role/content message list for the tokenizer's chat template.

    The system message (when non-empty) comes first, followed by the earlier
    turns from ``history`` and finally the new user ``message``.

    ``history`` is expected to be the list of ``(user, assistant)`` pairs that
    the ``respond`` signature declares. Role/content dict entries are also
    accepted defensively, in case the installed Gradio version passes history
    in that shape. Entries whose text is empty or not a string are skipped.
    """
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            if role in ("system", "user", "assistant") and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
            continue

        # Pair-style history: index 0 is the user text, index 1 the reply.
        for role, text in zip(("user", "assistant"), turn):
            if isinstance(text, str) and text:
                messages.append({"role": role, "content": text})

    messages.append({"role": "user", "content": message})
    return messages


@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate the assistant's reply for a Gradio ``ChatInterface`` turn.

    Args:
        message: Text typed by the user; an empty or whitespace-only message
            is replaced by ``DEFAULT_PROMPT``.
        history: Previous turns of the conversation, replayed to the model so
            that follow-up questions keep their context.
        system_message: Content of the "System message" input; sent as a
            ``system`` turn when non-empty.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The decoded completion, without the prompt and without special tokens.
    """
    if not message or not message.strip():
        message = DEFAULT_PROMPT

    messages = _build_messages(message, history, system_message)

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(
        inputs,
        # Sliders may deliver the value as a float; generate() wants an int.
        max_new_tokens=int(max_tokens),
        do_sample=True,
        temperature=temperature,
        top_k=50,
        top_p=top_p,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + completion; drop the prompt tokens.
    prompt_length = inputs.shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


# For more information on `huggingface_hub` Inference API support, please check the docs:
# https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference

# Alternative implementation that streams from the hosted Inference API
# (kept commented out for reference).
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# @spaces.GPU
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]

#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})

#     if len(message) < 1:
#         message = "write a quick sort algorithm in python."
#     messages.append({"role": "user", "content": message})

#     response = ""

#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content

#         response += token
#         yield response

# For information on how to customize the ChatInterface, peruse the gradio docs:
# https://www.gradio.app/main/docs/gradio/chatinterface

# Styles the message textbox, which is created with elem_id="msg_input".
css = """ #msg_input { flex-grow: 7; } """

# The components are created in the same order as the original inline
# construction: the textbox first, then the additional inputs.
_message_box = gr.Textbox(
    elem_id="msg_input",
    placeholder="write a quick sort algorithm in python.",
)
_system_message_box = gr.Textbox(
    value="You are a friendly Chatbot.",
    label="System message",
)
_max_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2048,
    value=512,
    step=1,
    label="Max new tokens",
)
_temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=4.0,
    value=0.7,
    step=0.1,
    label="Temperature",
)
_top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)

# The additional inputs are listed in the order of `respond`'s parameters
# after (message, history): system_message, max_tokens, temperature, top_p.
demo = gr.ChatInterface(
    fn=respond,
    textbox=_message_box,
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
    css=css,
)


if __name__ == "__main__":
    demo.launch()