import os
import torch
import gradio as gr
import sentencepiece  # not used directly; imported so the Space fails fast if the tokenizer's SentencePiece backend is missing
from transformers import AutoModelForCausalLM, AutoTokenizer

title = "# Welcome to 🙋🏻‍♂️Tonic's 🧠🤌🏻Neural Chat (From Intel)!"
description = """Try out [Intel/neural-chat-7b-v3-1](https://huggingface.co/Intel/neural-chat-7b-v3-1), an instruction-tuned model fine-tuned from [mistralai/Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1). You can use [Intel/neural-chat-7b-v3-1](https://huggingface.co/Intel/neural-chat-7b-v3-1) here via the Gradio API by scrolling down and clicking 'Use via API', or privately by [cloning this space on Hugging Face](https://huggingface.co/spaces/TeamTonic/NeuralChat?duplicate=true). [Join my active builders' server on Discord](https://discord.gg/VqTxc76K3u). Let's build together!"""

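# The description above mentions calling this Space over the Gradio API; a minimal
# sketch using the gradio_client package (the Space name is taken from the
# description; the endpoint name and argument order are assumptions based on the
# `submit_button.click` wiring below):
#
#   from gradio_client import Client
#   client = Client("TeamTonic/NeuralChat")
#   print(client.predict(
#       "Hello!",            # user message
#       "",                  # optional system message
#       780, 0.3, 0.9, 1.1,  # max_new_tokens, temperature, top_p, repetition_penalty
#       False,               # do_sample
#       api_name="/gradio_predict",
#   ))
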
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'  # reduce CUDA memory fragmentation

model_name = "Intel/neural-chat-7b-v3-1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" places the model on the available GPU(s), falling back to CPU
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

class IntelChatBot:
    def __init__(self, model, tokenizer, system_message="You are 🧠🤌🏻Neuro, an AI language model created by Tonic-AI. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."):
        self.model = model
        self.tokenizer = tokenizer
        self.system_message = system_message

    def set_system_message(self, new_system_message):
        self.system_message = new_system_message

    def format_prompt(self, user_message):
        # neural-chat-7b-v3-1 uses the "### System / ### User / ### Assistant" prompt layout
        prompt = f"### System:\n{self.system_message}\n### User:\n{user_message}\n### Assistant:\n"
        return prompt
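
    # Example rendered prompt for user_message="Hello!" (system message abbreviated):
    #
    #   ### System:
    #   You are 🧠🤌🏻Neuro, an AI language model created by Tonic-AI. ...
    #   ### User:
    #   Hello!
    #   ### Assistant: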

    def predict(self, user_message, temperature, max_new_tokens, top_p, repetition_penalty, do_sample):
        prompt = self.format_prompt(user_message)
        inputs = self.tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
        input_ids = inputs["input_ids"].to(self.model.device)
        attention_mask = inputs["attention_mask"].to(self.model.device)

        output_ids = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=int(max_new_tokens),
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=do_sample,
            pad_token_id=self.tokenizer.eos_token_id,  # silences the missing-pad-token warning
        )

        # Decode only the newly generated tokens so the prompt is not echoed back
        response = self.tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
        return response

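# A minimal sketch of driving the bot directly, without the Gradio UI
# (illustrative values; uses the intel_bot instance created below):
#
#   print(intel_bot.predict("Explain bfloat16 in one sentence.",
#                           temperature=0.3, max_new_tokens=128,
#                           top_p=0.9, repetition_penalty=1.1, do_sample=True))
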
def gradio_predict(user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty, do_sample):
    intel_bot.set_system_message(system_message)
    if not do_sample:
        # "Advanced" unchecked: ignore the sliders and use fixed defaults.
        # Greedy decoding ignores temperature/top_p, and repetition_penalty must
        # stay >= 1.0 (values below 1.0 reward repetition instead of penalizing it).
        max_new_tokens = 780
        temperature = 0.9
        top_p = 0.9
        repetition_penalty = 1.1
    response = intel_bot.predict(user_message, temperature, max_new_tokens, top_p, repetition_penalty, do_sample)
    return response

intel_bot = IntelChatBot(model, tokenizer)


with gr.Blocks(theme="ParityError/Anime") as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        system_message = gr.Textbox(label="Optional 🧠🤌🏻NeuralChat Assistant Message", lines=2)
        user_message = gr.Textbox(label="Your Message", lines=3)
    with gr.Row():
        do_sample = gr.Checkbox(label="Advanced (enable sampling)", value=False)

    # gr.Accordion's `open` parameter expects a bool, not a callable
    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            max_new_tokens = gr.Slider(label="Max new tokens", value=780, minimum=550, maximum=3200, step=1)
            temperature = gr.Slider(label="Temperature", value=0.3, minimum=0.1, maximum=1.0, step=0.1)
            top_p = gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.01, maximum=0.99, step=0.05)
            repetition_penalty = gr.Slider(label="Repetition penalty", value=1.9, minimum=1.0, maximum=2.0, step=0.05)

    submit_button = gr.Button("Submit")
    output_text = gr.Textbox(label="🧠🤌🏻NeuralChat Response")

    submit_button.click(
        gradio_predict,
        inputs=[user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty, do_sample],
        outputs=output_text
    )
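
# If the Space sees concurrent users, Gradio's queue can serialize GPU-bound calls;
# a minimal sketch (the queue size is illustrative, not part of the original app):
#
#   demo.queue(max_size=16)
#   demo.launch()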
    
demo.launch()