import gradio as gr
import torch
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

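# Hub repo id of the AWQ-quantized chat model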
model_path = "bragour/Camel-7b-chat-awq"

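# Run on the GPU when available (AWQ inference is intended for CUDA)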
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

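# Load the AWQ-quantized model weights and the matching tokenizer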
model = AutoAWQForCausalLM.from_quantized(
    model_path,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True
).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=False)


def respond(message):
    # Wrap the message in the [INST] instruction template this chat model expects
    formatted_prompt = f"<s>[INST]{message}[/INST]"

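    # Tokenize the prompt and move the input ids to the model's device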
    tokens = tokenizer(formatted_prompt, return_tensors='pt').input_ids.to(device)

    # Generate the response locally with greedy decoding (do_sample=False)
    result = model.generate(
        tokens,
        do_sample=False,
        max_new_tokens=200
    )

    # Decode only the newly generated tokens; generate() returns the prompt tokens too
    response = tokenizer.decode(result[0][tokens.shape[1]:], skip_special_tokens=True)

    return response

# Define the Gradio interface; launch only once, under the __main__ guard below
demo = gr.Interface(
    fn=respond,
    inputs="text",
    outputs="text"
)

if __name__ == "__main__":
    demo.launch()