import gradio as gr
import plotly.express as px
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
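# Assumed dependencies: gradio, plotly, torch (CUDA build), and transformers, plus
# autoawq, which transformers uses to load the AWQ-quantized checkpoint below.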

# Configure CUDA memory allocation before the model is loaded
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.set_per_process_memory_fraction(0.8)  # Adjust the fraction as needed

# Device to load the model onto (device_map="auto" below handles actual placement)
device = "cuda" if torch.cuda.is_available() else "cpu"

# System message (placeholder, adjust as needed)
system_message = ""

# Load the model and tokenizer
def hermes_model():
    tokenizer = AutoTokenizer.from_pretrained("TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ")
    model = AutoModelForCausalLM.from_pretrained(
        "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ", low_cpu_mem_usage=True, device_map="auto"
    )
    return model, tokenizer

model, tokenizer = hermes_model()
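# The model and tokenizer are loaded once at startup and shared across requests;
# the AWQ-quantized weights are expected to run on a CUDA device.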

# Function to generate a response from the model
def chat_response(msg_prompt: str) -> str:
    """
    Generates a response from the model given a prompt.

    Args:
        msg_prompt (str): The user's message prompt.

    Returns:
        str: The model's response.
    """
    generation_params = {
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.95,
        "top_k": 40,
        "max_new_tokens": 512,
        "repetition_penalty": 1.1,
    }
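    # Wrap the preloaded model in a text-generation pipeline. Rebuilding it on
    # every call keeps the function self-contained; for lower per-request latency
    # it could instead be created once at module level.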
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_params)
    try:
        # ChatML prompt format used by CapybaraHermes-2.5
        prompt_template = (
            f"<|im_start|>system\n{system_message}<|im_end|>\n"
            f"<|im_start|>user\n{msg_prompt}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
        pipe_output = pipe(prompt_template)[0]['generated_text']

        # The pipeline output echoes the prompt, so keep only the text after the
        # final "assistant" marker and strip any leftover ChatML end token.
        response_lines = pipe_output.split('assistant')
        assistant_response = response_lines[-1].strip() if len(response_lines) > 1 else pipe_output.strip()
        return assistant_response.replace('<|im_end|>', '').strip()
    except Exception as e:
        return str(e)
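
# Optional alternative (sketch, not wired into the UI): if the tokenizer ships a
# ChatML chat template, the prompt can be built with apply_chat_template instead
# of the hand-written f-string in chat_response above.
def chatml_prompt(msg_prompt: str) -> str:
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": msg_prompt},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)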

# Function to generate a random plot
def random_plot():
    df = px.data.iris()
    fig = px.scatter(df, x="sepal_width", y="sepal_length", color="species",
                     size='petal_length', hover_data=['petal_width'])
    return fig

# Function to handle likes/dislikes (for demonstration purposes)
def print_like_dislike(x: gr.LikeData):
    print(x.index, x.value, x.liked)

# Function to add messages to the chat history
def add_message(history, message):
    for x in message["files"]:
        history.append(((x,), None))
    if message["text"] is not None:
        history.append((message["text"], None))
    # Clear the textbox and disable it while the bot is generating; it is
    # re-enabled after the bot response has been added (see bot_msg below).
    return history, gr.update(value=None, interactive=False)

# Function to generate the bot's reply to the latest user turn
def bot(history):
    user_message = history[-1][0]
    # Uploaded files appear in the history as (filepath,) entries; they are not
    # sent to the language model, which only works with text prompts.
    if not isinstance(user_message, str):
        history[-1][1] = "File received. Please send a text message to get a reply."
        return history
    bot_response = chat_response(user_message)
    history[-1][1] = bot_response
    return history

# Example Plotly figure (created at startup but not currently rendered in the UI)
fig = random_plot()

# Gradio interface setup
with gr.Blocks(fill_height=True) as demo:
    chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, scale=1)

    chat_input = gr.MultimodalTextbox(
        interactive=True,
        file_count="multiple",
        placeholder="Enter message or upload file...",
        show_label=False
    )

    # Event chain: on submit, add_message appends the user turn and disables the
    # textbox, bot generates the reply, then the textbox is re-enabled.
    chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
    bot_msg = chat_msg.then(bot, chatbot, chatbot)
    bot_msg.then(lambda: gr.update(interactive=True), None, [chat_input])

    chatbot.like(print_like_dislike, None, None)

demo.queue()
demo.launch()
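
# To run locally (assuming this script is saved as app.py):
#   python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default.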