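# Gradio Space demo: answers the user's message twice, once with the zLlamask
# adapter disabled and once with it enabled, so the two outputs can be compared.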
import os
import gradio as gr
import torch
from transformers import AutoTokenizer
from model.modeling_llamask import LlamaskForCausalLM
from model.tokenizer_utils import generate_custom_mask, prepare_tokenizer
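
# modeling_llamask and tokenizer_utils come from the Space's local `model/` package.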
# Hugging Face access token (read from the environment) for the gated Llama model
access_token = os.getenv("HF_TOKEN")

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
device = 'cuda'

# Load the Llamask model in bfloat16, move it to the GPU, and attach the zLlamask adapter
model = LlamaskForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, token=access_token)
model = model.to(device)
model.load_adapter('theostos/zLlamask', adapter_name="zzlamask")

# Tokenizer with left padding, plus the Space's custom tokenizer preparation
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
prepare_tokenizer(tokenizer)

def respond(
    message,
    history: list[tuple[str, str]],
    max_tokens,
    temperature,
):
    # history is ignored: each message is answered independently
    # Llama 3.1 chat prompt with a fixed system message
    prompt = f"""<|start_header_id|>system<|end_header_id|>

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

{message}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
    model_inputs = generate_custom_mask(tokenizer, [prompt], device)

    # Generate without the adapter (base model); do_sample is needed for temperature to take effect
    model.disable_adapters()
    outputs = model.generate(do_sample=True, temperature=temperature, max_new_tokens=max_tokens, **model_inputs)
    outputs = outputs[:, model_inputs['input_ids'].shape[1]:]  # keep only the newly generated tokens
    result_no_ft = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Generate again with the zLlamask adapter enabled
    model.enable_adapters()
    outputs = model.generate(do_sample=True, temperature=temperature, max_new_tokens=max_tokens, **model_inputs)
    outputs = outputs[:, model_inputs['input_ids'].shape[1]:]
    result_ft = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    return f"Without finetuning:\n{result_no_ft}\n\nWith finetuning:\n{result_ft}"
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=128, value=32, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
    ],
)
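
# The slider values are passed to respond() after message and history,
# i.e. as the max_tokens and temperature arguments.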
if __name__ == "__main__":
    demo.launch()