File size: 4,237 Bytes
5416372
a2e6c05
 
d81ed7c
a2e6c05
d81ed7c
a2e6c05
 
 
 
a65de5c
fd701cd
 
a2e6c05
 
 
 
 
 
 
5fe2c9a
 
3b39700
a2e6c05
5fe2c9a
63ba25e
a2e6c05
5fe2c9a
63ba25e
a2e6c05
d81ed7c
 
1a382ff
d81ed7c
 
 
 
 
 
a2e6c05
d81ed7c
a2e6c05
 
 
 
 
 
0a910e6
 
 
 
 
 
 
a2e6c05
 
 
4993069
a2e6c05
 
d81ed7c
a2e6c05
 
776563f
a2e6c05
 
 
 
 
 
 
d81ed7c
a2e6c05
d81ed7c
a2e6c05
 
 
 
 
 
 
 
 
d81ed7c
a2e6c05
 
 
 
 
 
 
 
 
d81ed7c
 
 
 
a2e6c05
d81ed7c
 
 
 
 
 
 
 
fd701cd
d81ed7c
b23a519
 
 
18e5a55
72fd759
18e5a55
3b39700
014d21e
 
a2e6c05
 
 
 
 
 
b93337b
d81ed7c
 
 
a2e6c05
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import spaces
import json
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download

# Install llama-cpp-python (CUDA 12.4 wheel) and llama-cpp-agent at startup.
# NOTE(review): done at runtime because Spaces builds the env before GPU wheels
# can be resolved; these must run BEFORE the deferred imports inside respond().
# shell=True with a fixed literal command string — no untrusted input involved.
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)

# Pre-fetch the GGUF model weights into ./models so Llama() can load them by path.
# The filenames here must match the choices offered in the Model dropdown below.
hf_hub_download(repo_id="bartowski/dolphin-2.9.1-yi-1.5-34b-GGUF", filename="dolphin-2.9.1-yi-1.5-34b-Q6_K.gguf",  local_dir = "./models")
hf_hub_download(repo_id="crusoeai/dolphin-2.9.1-llama-3-70b-GGUF", filename="dolphin-2.9.1-llama-3-70b.Q3_K_M.gguf",  local_dir = "./models")
# hf_hub_download(repo_id="bartowski/dolphin-2.9.1-yi-1.5-9b-GGUF", filename="dolphin-2.9.1-yi-1.5-9b-f32.gguf",  local_dir = "./models")
# hf_hub_download(repo_id="crusoeai/dolphin-2.9.1-llama-3-8b-GGUF", filename="dolphin-2.9.1-llama-3-8b.Q6_K.gguf",  local_dir = "./models")

# Custom CSS injected into the Gradio app: rounds chat bubbles and restyles
# the dark theme (user bubble background, transparent assistant bubble).
# Passed verbatim to gr.ChatInterface(css=...) below.
css = """
.message-row {
    justify-content: space-evenly !important;
}
.message-bubble-border {
    border-radius: 6px !important;
}
.dark.message-bubble-border {
    border-color: #21293b !important;
}
.dark.user {
    background: #0a1120 !important;
}
.dark.assistant, .dark.pending {
    background: transparent !important;
}
"""

@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    max_tokens,
    temperature,
    top_p,
    model,
):
    """Stream a chat completion for *message* given the prior *history*.

    Generator used by gr.ChatInterface: yields the accumulated response text
    after each streamed token so the UI updates incrementally.

    Args:
        message: The new user message (plain text).
        history: Prior turns as (user, assistant) pairs.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        model: GGUF filename under ./models to load.
    """
    # Imported lazily: these packages are pip-installed at startup (see top of
    # file), after this module's top-level imports have already run.
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent
    from llama_cpp_agent import MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles
    print(message)
    print(history)
    print(max_tokens)
    print(temperature)
    print(top_p)
    print(model)

    # NOTE(review): the model is reloaded on every request; acceptable inside a
    # ZeroGPU span, but a module-level cache keyed by model name would avoid
    # repeated load time if requests reuse the same model.
    llm = Llama(
        model_path=f"models/{model}",
        n_gpu_layers=81,  # offload (up to) 81 layers to GPU
        n_ctx=8192,
    )
    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt="You are Dolphin an AI assistant that helps humanity.",
        predefined_messages_formatter_type=MessagesFormatterType.CHATML,
        debug_output=True
    )

    settings = provider.get_provider_default_settings()
    settings.max_tokens = max_tokens
    # FIX: previously the temperature/top_p sliders were read but never applied,
    # so generation always used the provider defaults.
    settings.temperature = temperature
    settings.top_p = top_p
    settings.stream = True

    # Rebuild the agent-side chat history from Gradio's (user, assistant) pairs.
    messages = BasicChatHistory()

    for msn in history:
        user = {
            'role': Roles.user,
            'content': msn[0]
        }
        messages.add_message(user)
        # Guard: Gradio may hold a pending turn whose assistant slot is None.
        if msn[1] is not None:
            assistant = {
                'role': Roles.assistant,
                'content': msn[1]
            }
            messages.add_message(assistant)

    stream = agent.get_chat_response(message, llm_sampling_settings=settings, chat_history=messages, returns_streaming_generator=True, print_output=False)

    # Accumulate streamed tokens and yield the growing text for live UI updates.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs

# Wire respond() into a streaming chat UI. The additional_inputs appear in
# the same order as respond()'s extra parameters: max_tokens, temperature,
# top_p, model.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=8192, value=8192, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
        # Choices must match the GGUF filenames downloaded at startup.
        gr.Dropdown(['dolphin-2.9.1-yi-1.5-34b-Q6_K.gguf', 'dolphin-2.9.1-llama-3-70b.Q3_K_M.gguf'], value="dolphin-2.9.1-yi-1.5-34b-Q6_K.gguf", label="Model"),
    ],
    # Dark-blue theme overrides layered on top of the Soft theme.
    theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue", neutral_hue="gray",font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"]).set(
        body_background_fill_dark="#0f172a",
        block_background_fill_dark="#0f172a",
        block_title_background_fill_dark="#070d1b",
        input_background_fill_dark="#0c1425",
        button_secondary_background_fill_dark="#070d1b",
        border_color_primary_dark="#21293b",
        background_fill_secondary_dark="#0f172a",
        color_accent_soft_dark="transparent"
    ),
    css=css,
    # NOTE(review): retry_btn/undo_btn/clear_btn kwargs exist in Gradio 4.x but
    # were removed in Gradio 5 — pin gradio<5 or drop these if upgrading.
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    description="Cognitive Computation: 🐬 Chat multi llm"
)

# Launch only when executed as a script (Spaces also runs this entry point).
if __name__ == "__main__":
    demo.launch()