import os

import torch
# Replace torch.jit.script with a no-op before other imports; a workaround some
# Spaces use when jit-scripted helpers in model repos fail at import time.
torch.jit.script = lambda f: f
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
from bitnet import replace_linears_in_hf  # only needed if the BitNet swap below is re-enabled

# Authenticate with the Hugging Face Hub using a token from the "key" environment variable.
key = os.environ.get("key")
login(key)
# os.system("mkdir c4ai-command-r-v01-exl2")
# os.system("huggingface-cli download bartowski/c4ai-command-r-v01-exl2 --revision 6_5 --local-dir c4ai-command-r-v01-exl2 --local-dir-use-symlinks False")
# os.system("pip install flash-attn --no-build-isolation")
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_id = "IEITYuan/Yuan2-M32-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
# load_in_8bit=True,
quantization_config=nf4_config,
# attn_implementation="flash_attention_2",
# torch_dtype = torch.bfloat16,
device_map="auto",
trust_remote_code=True
)
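# Note: device_map="auto" lets accelerate place the quantized shards across the
# available devices, and trust_remote_code=True is needed because Yuan2-M32
# ships its own modeling code on the Hub.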
# replace_linears_in_hf(model)
model.eval()
@spaces.GPU
def generate_response(user_input, max_new_tokens, temperature):
    os.system("nvidia-smi")  # log GPU memory before building the prompt (debug)
    messages = [{"role": "user", "content": user_input}]
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    )
    input_ids = input_ids.to(model.device)
    os.system("nvidia-smi")  # log GPU memory again before generation (debug)
    gen_tokens = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
    )
    gen_text = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
    # The decoded sequence contains the prompt; drop it when it leads the output.
    if gen_text.startswith(user_input):
        gen_text = gen_text[len(user_input):].lstrip()
    return gen_text
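# Minimal local sanity check (a sketch, not part of the Space UI), assuming the
# @spaces.GPU decorator is a no-op outside a ZeroGPU Space. Uncomment to try it
# when running this file directly.
# if __name__ == "__main__":
#     print(generate_response("Explain the concept of machine learning.", 256, 0.7))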
examples = [
    {"message": "What is the weather like today?", "max_new_tokens": 250, "temperature": 0.5},
    {"message": "Tell me a joke.", "max_new_tokens": 650, "temperature": 0.7},
    {"message": "Explain the concept of machine learning.", "max_new_tokens": 980, "temperature": 0.4},
]
example_choices = [f"Example {i+1}" for i in range(len(examples))]
def load_example(choice):
    # Map the dropdown choice back to its example and return values in the same
    # order as the click() outputs: message, max_new_tokens, temperature.
    index = example_choices.index(choice)
    example = examples[index]
    return example["message"], example["max_new_tokens"], example["temperature"]
with gr.Blocks() as demo:
    with gr.Row():
        max_new_tokens_slider = gr.Slider(minimum=100, maximum=4000, value=980, label="Max New Tokens")
        temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.3, label="Temperature")
    message_box = gr.Textbox(lines=2, label="Your Message")
    generate_button = gr.Button("Try🫡Yuan2-M32")
    output_box = gr.Textbox(label="🫡Yuan2-M32")
    generate_button.click(
        fn=generate_response,
        inputs=[message_box, max_new_tokens_slider, temperature_slider],
        outputs=output_box,
    )
    example_dropdown = gr.Dropdown(label="🫡Load Example", choices=example_choices)
    example_button = gr.Button("🫡Load")
    example_button.click(
        fn=load_example,
        inputs=example_dropdown,
        outputs=[message_box, max_new_tokens_slider, temperature_slider],
    )
demo.launch()
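# On Spaces the runtime serves the app as launched above; for a temporary public
# URL when running locally, demo.launch(share=True) can be used instead.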