import os

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

title = "# Welcome to 🙋🏻‍♂️Tonic's🧠🤌🏻Neural Chat (From Intel)!"
description = """Try out [Intel/neural-chat-7b-v3-1](https://huggingface.co/Intel/neural-chat-7b-v3-1), the instruction-tuned release of [Intel/neural-chat-7b-v3](https://huggingface.co/Intel/neural-chat-7b-v3), a fine-tune built on [mistralai/Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1). You can call it via API by scrolling down and clicking 'Use via API', or run it privately by [cloning this space on Hugging Face](https://huggingface.co/spaces/TeamTonic/NeuralChat?duplicate=true). [Join my active builders' server on Discord](https://discord.gg/VqTxc76K3u). Let's build together!"""
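# A minimal sketch (hypothetical) of calling this Space programmatically with
# the `gradio_client` package; fn_index=0 targets the single click handler
# registered at the bottom of this file:
#
#   from gradio_client import Client
#   client = Client("TeamTonic/NeuralChat")
#   reply = client.predict(
#       "Hello!",            # user_message
#       "",                  # system_message
#       256, 0.3, 0.9, 1.2,  # max_new_tokens, temperature, top_p, repetition_penalty
#       True,                # do_sample
#       fn_index=0,
#   )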

# Cap the CUDA allocator's split size to reduce memory fragmentation.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'
model_name = "Intel/neural-chat-7b-v3-1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
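# TextStreamer prints tokens to stdout as they are generated, so streaming is
# visible in the server logs rather than in the Gradio UI itself.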
streamer = TextStreamer(tokenizer)

class IntelChatBot:
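    """Thin wrapper around a causal LM and its tokenizer that keeps a mutable
    system message and formats prompts in the neural-chat
    ### System / ### User / ### Assistant template."""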
    def __init__(self, model, tokenizer, system_message="You are 🧠🤌🏻Neuro, an AI language model created by Tonic-AI. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."):
        self.model = model
        self.tokenizer = tokenizer
        self.system_message = system_message

    def set_system_message(self, new_system_message):
        self.system_message = new_system_message

    def format_prompt(self, user_message):
        # neural-chat's documented template; the trailing "### Assistant:"
        # header cues the model to produce the reply.
        prompt = f"### System:\n{self.system_message}\n### User:\n{user_message}\n### Assistant:\n"
        return prompt

    def neuro(self, user_message, temperature, max_new_tokens, top_p, repetition_penalty, do_sample):
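        """Generate a reply to `user_message` using the given decoding settings."""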
        prompt = self.format_prompt(user_message)
        inputs = self.tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
        input_ids = inputs["input_ids"].to(self.model.device)
        attention_mask = inputs["attention_mask"].to(self.model.device)

        # Sampling knobs (temperature/top_p/repetition_penalty) only take
        # effect when do_sample=True; greedy decoding ignores them.
        output_ids = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            pad_token_id=self.tokenizer.eos_token_id,
            streamer=streamer,
            do_sample=do_sample
        )

        # Decode only the newly generated tokens, skipping the echoed prompt.
        response = self.tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
        return response

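# A hypothetical direct-use sketch, bypassing the UI (names as defined above):
#
#   bot = IntelChatBot(model, tokenizer)
#   print(bot.neuro("What is bfloat16?", temperature=0.3, max_new_tokens=128,
#                   top_p=0.9, repetition_penalty=1.2, do_sample=True))
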
def gradio_predict(user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty, do_sample):
    Intel_bot.set_system_message(system_message)
    if not do_sample:
        # Greedy decoding ignores the sampling knobs entirely; pin them to
        # neutral values (a repetition penalty below 1.0 would reward repetition).
        temperature = 1.0
        top_p = 1.0
        repetition_penalty = 1.0
    response = Intel_bot.neuro(user_message, temperature, max_new_tokens, top_p, repetition_penalty, do_sample)
    return response

Intel_bot = IntelChatBot(model, tokenizer)


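# Build the Gradio UI: a system-message box, a user-message box, an "Advanced"
# toggle that decides whether the sampling sliders are honored, and one submit
# handler wired to gradio_predict.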
with gr.Blocks(theme="ParityError/Anime") as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        system_message = gr.Textbox(label="Optional 🧠🤌🏻NeuralChat Assistant Message", lines=2)
        user_message = gr.Textbox(label="Your Message", lines=3)
    with gr.Row():
        do_sample = gr.Checkbox(label="Advanced (enable sampling)", value=False)
    
    # Accordion's `open` expects a bool, not a callable; start closed.
    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            max_new_tokens = gr.Slider(label="Max new tokens", value=780, minimum=150, maximum=3200, step=1)
            temperature = gr.Slider(label="Temperature", value=0.3, minimum=0.1, maximum=1.0, step=0.1)
            top_p = gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.01, maximum=0.99, step=0.05)
            repetition_penalty = gr.Slider(label="Repetition penalty", value=1.2, minimum=1.0, maximum=2.0, step=0.05)

    submit_button = gr.Button("Submit")
    output_text = gr.Textbox(label="🧠🤌🏻NeuralChat Response")

    submit_button.click(
        gradio_predict,
        inputs=[user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty, do_sample],
        outputs=output_text
    )
    
demo.launch()