Spaces:
Runtime error
Runtime error
File size: 1,461 Bytes
9424db7 8a05110 4de2ced b9c2f05 b577899 0541147 fcddbd2 b9c2f05 4de2ced 9424db7 0541147 6358494 5c9bbd9 6358494 b577899 6358494 25b5932 5c9bbd9 02ebcf9 25b5932 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import os
import torch
import gradio as gr
import transformers
import accelerate
import huggingface_hub
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
# Log in to the Hugging Face Hub with the token stored in the HF_TOKEN
# environment variable (a missing variable raises KeyError here).
huggingface_hub.login(token=os.environ["HF_TOKEN"])

# 4-bit NF4 quantization settings with bfloat16 compute. NOTE: this config is
# built but not passed to from_pretrained below; the quantization_config
# argument is currently disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model_id = "google/gemma-7b"
adapter_id = "msinghy/gemma-7b-ft-80row-alpaca-correcting-mistakes"
offload_dir = "offload/"

# Keyword arguments for loading the base model; quantization_config=bnb_config
# is intentionally left out (see the note above).
model_load_options = dict(
    device_map="auto",
    trust_remote_code=True,
    token=True,
    offload_folder=offload_dir,
)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, **model_load_options)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)

# Wrap the base model with the fine-tuned PEFT adapter weights.
ft_model = PeftModel.from_pretrained(
    base_model,
    adapter_id,
    offload_folder=offload_dir,
)
def respond(query):
    """Generate the fine-tuned model's answer for a single query.

    The query is placed in the ``###Input: ... ###Output: `` prompt template,
    tokenized, and passed to ``ft_model.generate`` with up to 500 new tokens.

    Args:
        query: User text inserted into the ``###Input:`` slot of the prompt.

    Returns:
        The decoded text of the newly generated tokens only (prompt excluded),
        with special tokens skipped.
    """
    eval_prompt = "###Input: " + query + "\n\n###Output: "
    model_input = tokenizer(eval_prompt, return_tensors="pt")
    input_ids = model_input["input_ids"]

    output = ft_model.generate(
        input_ids=input_ids,
        attention_mask=model_input["attention_mask"],
        max_new_tokens=500,
    )

    # For a causal LM, generate() returns the prompt tokens followed by the
    # continuation. Cut the prompt off by token count rather than by
    # str.replace on the decoded text: the replace only works when decoding
    # reproduces the prompt byte-for-byte, and it would also strip any later
    # repetition of the prompt from the model's answer.
    prompt_length = input_ids.shape[-1]
    generated_ids = output[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)
def chat_response(message, history):
    """Chat callback: answer ``message`` by delegating to ``respond``.

    ``history`` is part of the signature because this function is handed to
    ``gr.ChatInterface``, but it is not used; each message is answered on
    its own.
    """
    reply = respond(message)
    return reply
# Build the Gradio chat UI around chat_response and start serving it.
demo = gr.ChatInterface(chat_response)
demo.launch()