from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import torch
import os
from huggingface_hub import login

# Authenticate with the Hugging Face Hub (required for gated models such as Llama 3).
login(os.getenv('HF_KEY'))

def init_model():
    # System prompt used for every conversation; the commented-out variant below
    # is the StableBeluga prompt format from an earlier iteration.
    system_prompt = "You are a pirate chatbot who always responds in pirate speak!"
    # system_prompt = "### System:\nYou are StableBeluga, an AI that follows instructions extremely well. Help as much as you can.\n\n"

    # model = AutoModelForCausalLM.from_pretrained(
    #     "stabilityai/StableBeluga2",
    #     torch_dtype=torch.float16,
    #     low_cpu_mem_usage=True,
    #     device_map="auto")

    # Active configuration: Llama 3 8B Instruct in bfloat16, placed automatically
    # across the available devices.
    model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    # Other checkpoints tried during development, kept for reference:
    # model = AutoModelForCausalLM.from_pretrained(
    #     "stabilityai/stablelm-2-12b",
    #     torch_dtype="auto",
    #     )
    # model.cuda()

    # model = AutoModelForCausalLM.from_pretrained(
    #     'stabilityai/stablelm-2-12b-chat',
    #     device_map="auto",
    #     )

    # tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-12b")
    # tokenizer = AutoTokenizer.from_pretrained('stabilityai/stablelm-2-12b-chat')
    # print(tokenizer.decode(output[0], skip_special_tokens=True))
    # tokenizer = AutoTokenizer.from_pretrained("stabilityai/StableBeluga-7B", use_fast=True)
    # model = AutoModelForCausalLM.from_pretrained("stabilityai/StableBeluga-7B", load_in_8bit=True, low_cpu_mem_usage=True, device_map=0)
    # model = AutoModelForCausalLM.from_pretrained("stabilityai/StableBeluga-7B", low_cpu_mem_usage=True, device_map=0)
    # model = AutoModelForCausalLM.from_pretrained("stabilityai/StableBeluga-7B", device_map=0)
    return system_prompt, tokenizer, model

system_prompt, tokenizer, model = init_model()

# def make_prompt(user, syst=system_prompt):
#     # return f"{syst}### User: {user}\n\n### Assistant:\n"
#     return [{'role': 'user', 'content': user}]

def ask_assistant(prompt, token=tokenizer, md=model, system_prompt=system_prompt):
    # Build the conversation in chat-message form: system instruction + user turn.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # Render the messages with the model's chat template and move them to the
    # model's device; add_generation_prompt=True appends the assistant header.
    input_ids = token.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(md.device)
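
    # For reference, the rendered prompt for Llama 3 looks roughly like this
    # (illustrative sketch of the chat-template output, not produced by this code):
    #
    #   <|begin_of_text|><|start_header_id|>system<|end_header_id|>
    #
    #   You are a pirate chatbot ...<|eot_id|><|start_header_id|>user<|end_header_id|>
    #
    #   {prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>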

    # Stop on either the tokenizer's EOS token or Llama 3's end-of-turn token.
    terminators = [
        token.eos_token_id,
        token.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = md.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    # Drop the prompt tokens and decode only the newly generated reply.
    response = outputs[0][input_ids.shape[-1]:]

    return token.decode(response, skip_special_tokens=True)

def ask(prompt, model_name):
    # Only Llama 3 is loaded in init_model(), so the radio selection is accepted
    # but not yet used to switch models.
    return ask_assistant(prompt)

demo = gr.Interface(
    fn=ask,
    inputs=[
        "text",
        gr.Radio(["LLaMa-3", "StableBeluga-2-12b", "Falcon-11b"], value="LLaMa-3", label="Model"),
    ],
    outputs=["text"],
)

demo.launch()
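
# Sketch only (not part of the running app): one way the "Model" radio above could
# actually switch checkpoints. The non-Llama repo ids below are assumptions chosen
# for illustration, each model needs its own memory, and ask_assistant's
# "<|eot_id|>" terminator is Llama-3-specific, so other tokenizers would need
# their own stop tokens.
#
# MODEL_IDS = {
#     "LLaMa-3": "meta-llama/Meta-Llama-3-8B-Instruct",
#     "StableBeluga-2-12b": "stabilityai/stablelm-2-12b-chat",  # assumed mapping
#     "Falcon-11b": "tiiuae/falcon-11B",                        # assumed mapping
# }
# _loaded = {}
#
# def get_model(name):
#     # Lazily load and cache the selected checkpoint.
#     if name not in _loaded:
#         tok = AutoTokenizer.from_pretrained(MODEL_IDS[name])
#         mdl = AutoModelForCausalLM.from_pretrained(
#             MODEL_IDS[name], torch_dtype=torch.bfloat16, device_map="auto")
#         _loaded[name] = (tok, mdl)
#     return _loaded[name]
#
# def ask(prompt, model_name):
#     tok, mdl = get_model(model_name)
#     return ask_assistant(prompt, token=tok, md=mdl)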