from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import torch
import os
from huggingface_hub import login
# Authenticate with the Hugging Face Hub (Meta-Llama-3 is a gated model)
login(os.getenv("HF_KEY"))

def init_model():
    system_prompt = "You are a pirate chatbot who always responds in pirate speak!"
    # Meta-Llama-3-8B-Instruct is the model currently served.
    model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    # Earlier experiments with other models, kept for reference:
    # system_prompt = "### System:\nYou are StableBeluga, an AI that follows instructions extremely well. Help as much as you can.\n\n"
    # model = AutoModelForCausalLM.from_pretrained(
    #     "stabilityai/StableBeluga2",
    #     torch_dtype=torch.float16,
    #     low_cpu_mem_usage=True,
    #     device_map="auto")
    # model = AutoModelForCausalLM.from_pretrained(
    #     "stabilityai/stablelm-2-12b",
    #     torch_dtype="auto",
    # )
    # model.cuda()
    # model = AutoModelForCausalLM.from_pretrained(
    #     "stabilityai/stablelm-2-12b-chat",
    #     device_map="auto",
    # )
    # tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-12b")
    # tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-12b-chat")
    # print(tokenizer.decode(output[0], skip_special_tokens=True))
    # tokenizer = AutoTokenizer.from_pretrained("stabilityai/StableBeluga-7B", use_fast=True)
    # model = AutoModelForCausalLM.from_pretrained("stabilityai/StableBeluga-7B", load_in_8bit=True, low_cpu_mem_usage=True, device_map=0)
    # model = AutoModelForCausalLM.from_pretrained("stabilityai/StableBeluga-7B", low_cpu_mem_usage=True, device_map=0)
    # model = AutoModelForCausalLM.from_pretrained("stabilityai/StableBeluga-7B", device_map=0)
    return system_prompt, tokenizer, model

system_prompt, tokenizer, model = init_model()
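
# Optional sanity check (illustrative; uncomment after loading to confirm the
# weights landed on the expected device and dtype):
# print(model.config.model_type, next(model.parameters()).device, next(model.parameters()).dtype)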
# Legacy prompt builder (superseded by tokenizer.apply_chat_template below):
# def make_prompt(user, syst=system_prompt):
#     # return f"{syst}### User: {user}\n\n### Assistant:\n"
#     return [{"role": "user", "content": user}]

def ask_assistant(prompt, tok=tokenizer, md=model, system_prompt=system_prompt):
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    # Render the conversation with the model's chat template
    input_ids = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(md.device)
    # Llama 3 ends its turns with <|eot_id|> in addition to the EOS token
    terminators = [
        tok.eos_token_id,
        tok.convert_tokens_to_ids("<|eot_id|>"),
    ]
    outputs = md.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    # Strip the prompt tokens and decode only the newly generated reply
    response = outputs[0][input_ids.shape[-1]:]
    return tok.decode(response, skip_special_tokens=True)
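
# Illustrative usage (not part of the app flow); the exact reply will vary
# because sampling is enabled:
# >>> ask_assistant("Where be the best spot to bury treasure?")
# 'Arrr, ...'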
def ask(prompt, model_name="LLaMa-3"):
    # Model selection is not wired up yet; only Llama 3 is loaded above.
    return ask_assistant(prompt)

# gr.Interface takes the callable as its first argument; the Radio belongs in
# `inputs`, not in the `fn` slot.
demo = gr.Interface(
    fn=ask,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Radio(["LLaMa-3", "StableBeluga-2-12b", "Falcon-11b"],
                 value="LLaMa-3", label="Model"),
    ],
    outputs=["text"],
)
demo.launch()