import os

import torch
import gradio as gr
import huggingface_hub
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
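
# Authenticate with the Space's HF_TOKEN secret so the gated Gemma weights
# can be downloaded.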
huggingface_hub.login(token=os.environ["HF_TOKEN"])
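
# 4-bit NF4 quantization settings; currently unused because the
# quantization_config argument is commented out in the from_pretrained call below.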
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
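
# Load the Gemma-7B base model, letting accelerate shard it across the
# available devices and offload overflow weights to disk.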
base_model_id = "google/gemma-7b"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    # quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    token=True,
    offload_folder="offload/",
)
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id, add_bos_token=True, trust_remote_code=True
)
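
# Attach the fine-tuned LoRA adapter on top of the frozen base model.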
ft_model = PeftModel.from_pretrained(
    base_model,
    "msinghy/gemma-7b-ft-80row-alpaca-correcting-mistakes",
    offload_folder="offload/",
)


def respond(query):
    # Wrap the user query in the prompt template used during fine-tuning.
    eval_prompt = "###Input: " + query + "\n\n###Output: "
    # Place the tokenized inputs on the same device as the model.
    model_input = tokenizer(eval_prompt, return_tensors="pt").to(base_model.device)
    output = ft_model.generate(
        input_ids=model_input["input_ids"],
        attention_mask=model_input["attention_mask"],
        max_new_tokens=500,
    )
    # Decode the full sequence, then strip the echoed prompt from the front.
    result = tokenizer.decode(output[0], skip_special_tokens=True).replace(eval_prompt, "")
    return result
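

# Gradio chat wrapper: ChatInterface calls this with the new message and the
# running history; the history is ignored here, so each turn is stateless.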
def chat_response(message, history):
    return respond(message)
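

# Build and launch the chat UI; on Spaces, app.py is executed and launch()
# serves the interface.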
demo = gr.ChatInterface(chat_response)
demo.launch()