import os

import torch
# Replace torch.jit.script with a no-op before other imports; a workaround some
# Spaces use when jit-scripted helpers in model repos fail at import time.
torch.jit.script = lambda f: f
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
from bitnet import replace_linears_in_hf  # only needed if the BitNet swap below is re-enabled

# Authenticate with the Hugging Face Hub using a token from the "key" environment variable.
key = os.environ.get("key")
login(key)
# os.system("mkdir c4ai-command-r-v01-exl2")
# os.system("huggingface-cli download bartowski/c4ai-command-r-v01-exl2 --revision 6_5 --local-dir c4ai-command-r-v01-exl2 --local-dir-use-symlinks False")
# os.system("pip install flash-attn --no-build-isolation")
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_id = "IEITYuan/Yuan2-M32-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
# load_in_8bit=True,
quantization_config=nf4_config,
# attn_implementation="flash_attention_2",
# torch_dtype = torch.bfloat16,
device_map="auto",
trust_remote_code=True
)
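# Note: device_map="auto" lets accelerate place the quantized shards across the
# available devices, and trust_remote_code=True is needed because Yuan2-M32
# ships its own modeling code on the Hub.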
# replace_linears_in_hf(model)
model.eval()
@spaces.GPU
def generate_response(user_input, max_new_tokens, temperature):
    os.system("nvidia-smi")  # log GPU memory before building the prompt (debug)
    messages = [{"role": "user", "content": user_input}]
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    )
    input_ids = input_ids.to(model.device)
    os.system("nvidia-smi")  # log GPU memory again before generation (debug)
    gen_tokens = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
    )
    gen_text = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
    # The decoded sequence contains the prompt; drop it when it leads the output.
    if gen_text.startswith(user_input):
        gen_text = gen_text[len(user_input):].lstrip()
    return gen_text
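# Minimal local sanity check (a sketch, not part of the Space UI), assuming the
# @spaces.GPU decorator is a no-op outside a ZeroGPU Space. Uncomment to try it
# when running this file directly.
# if __name__ == "__main__":
#     print(generate_response("Explain the concept of machine learning.", 256, 0.7))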
examples = [
    {"message": "What is the weather like today?", "max_new_tokens": 250, "temperature": 0.5},
    {"message": "Tell me a joke.", "max_new_tokens": 650, "temperature": 0.7},
    {"message": "Explain the concept of machine learning.", "max_new_tokens": 980, "temperature": 0.4},
]
example_choices = [f"Example {i+1}" for i in range(len(examples))]
def load_example(choice):
    # Map the dropdown choice back to its example and return values in the same
    # order as the click() outputs: message, max_new_tokens, temperature.
    index = example_choices.index(choice)
    example = examples[index]
    return example["message"], example["max_new_tokens"], example["temperature"]
with gr.Blocks() as demo:
    with gr.Row():
        max_new_tokens_slider = gr.Slider(minimum=100, maximum=4000, value=980, label="Max New Tokens")
        temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.3, label="Temperature")
    message_box = gr.Textbox(lines=2, label="Your Message")
    generate_button = gr.Button("Try🫡Yuan2-M32")
    output_box = gr.Textbox(label="🫡Yuan2-M32")
    generate_button.click(
        fn=generate_response,
        inputs=[message_box, max_new_tokens_slider, temperature_slider],
        outputs=output_box,
    )
    example_dropdown = gr.Dropdown(label="🫡Load Example", choices=example_choices)
    example_button = gr.Button("🫡Load")
    example_button.click(
        fn=load_example,
        inputs=example_dropdown,
        outputs=[message_box, max_new_tokens_slider, temperature_slider],
    )
demo.launch()
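# On Spaces the runtime serves the app as launched above; for a temporary public
# URL when running locally, demo.launch(share=True) can be used instead.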