Spaces:

DevQuasar
/

llama3_on_sbc

Sleeping

File size: 3,679 Bytes

import gradio as gr
import os
import requests
import json

# Endpoint of the self-hosted model server that chatty() POSTs its prompts to.
# Taken from the URL environment variable; if the variable is not set, this
# lookup raises KeyError while the module is being imported.
sbc_host_url = os.environ['URL']

# def get_completion(prompt:str, messages:str = '', n_predict=128):
#     system = "### System: You are a helpful assistant helps to brainstorm ideas.\n"
#     prompt_templated = f'{system} {messages}\n ### HUMAN:\n{prompt} \n ### ASSISTANT:'

#     headers = {
#         "Content-Type": "application/json"
#     }
#     data = {
#         "prompt": prompt_templated,
#         "n_predict": n_predict,
#         "stop": ["### HUMAN:", "### ASSISTANT:", "HUMAN"],
#         "stream": "True"
#     }
#     try:
#         response = requests.post(sbc_host_url, headers=headers, data=json.dumps(data))
        
#         if response.status_code == 200:
#             return response.json()['content']
#         else:
#             response.raise_for_status()
#     except:
#         raise gr.Warning("Apologies for the inconvenience! Our model is currently self-hosted and unavailable at the moment.")


# def chatty(prompt, messages):
#     # print(prompt)
#     # print(f'messages: {messages}')
#     past_messages = ''
#     if len(messages) > 0:
#         for idx, message in enumerate(messages):
#             print(f'idx: {idx}, message: {message}')
#             past_messages += f'\n### HUMAN: {message[0]}'
#             past_messages += f'\n### ASSISTANT: {message[1]}'
                
                
#         # past_messages = messages[0][0]
#     # print(f'past_messages: {past_messages}')
#     messages = get_completion(prompt, past_messages)
#     return messages.split('### ASSISTANT:')[-1]

def _history_to_prompt(messages):
    """Render earlier chat turns as alternating HUMAN/ASSISTANT prompt sections.

    Each entry is indexed as ``entry[0]`` (user text) and ``entry[1]``
    (assistant text). An empty or missing history yields an empty string.
    """
    return ''.join(
        f'\n### HUMAN: {turn[0]}\n### ASSISTANT: {turn[1]}'
        for turn in (messages or [])
    )


def _token_from_stream_line(line):
    """Return the ``content`` text carried by one streamed line, or '' if none.

    Lines may be prefixed with ``data: ``; only that leading prefix is
    stripped, so the same character sequence inside the generated text is
    left intact.
    """
    prefix = 'data: '
    try:
        text = line.decode('utf-8')
        if text.startswith(prefix):
            text = text[len(prefix):]
        content = json.loads(text)['content']
    except (ValueError, KeyError, TypeError):
        # Not a JSON object with a 'content' field (e.g. the empty-token
        # lines LMStudio sends, or an undecodable chunk): contributes nothing.
        return ''
    return content if isinstance(content, str) else ''


# stream
def chatty(prompt, messages, n_predict=128):
    """Stream the model's reply to ``prompt``, yielding the text accumulated so far.

    Args:
        prompt: The user's new message.
        messages: Prior conversation turns; each entry is read as
            ``entry[0]`` (user) and ``entry[1]`` (assistant).
        n_predict: Value sent as ``n_predict`` in the request payload.

    Yields:
        The reply text received so far, once per non-empty streamed line.

    Raises:
        gr.Error: If the request to the model server fails.
    """
    past_messages = _history_to_prompt(messages)

    system = "### System: You help to brainstorm ideas.\n"
    # The formatted history (not the raw history list) goes into the prompt.
    prompt_templated = f'{system} {past_messages}\n ### HUMAN:\n{prompt} \n ### ASSISTANT:'

    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "prompt": prompt_templated,
        "n_predict": n_predict,
        "stop": ["### HUMAN:", "### ASSISTANT:", "HUMAN"],
        "stream": True
    }

    result = ""
    try:
        # The context manager releases the connection even when the consumer
        # stops iterating this generator before the stream is exhausted.
        with requests.post(sbc_host_url, headers=headers, data=json.dumps(data), stream=True) as response:
            if response.status_code != 200:
                # Raises HTTPError for 4xx/5xx; any other status has no
                # token stream to read, so there is nothing to yield.
                response.raise_for_status()
                return
            for line in response.iter_lines():
                if not line:
                    continue
                result += _token_from_stream_line(line)
                yield result
    except requests.exceptions.RequestException as exc:
        # gr.Warning is a plain function and cannot be raised; gr.Error is the
        # exception Gradio displays to the user.
        raise gr.Error("Apologies for the inconvenience! Our model is currently self-hosted and unavailable at the moment.") from exc


# Page layout: the image loaded from sbc.jpg, followed by the chat widget
# whose replies are produced by chatty().
with gr.Blocks() as demo:
    gr.Image("sbc.jpg")
    gr.ChatInterface(
        fn=chatty,
        title="DevQuasar/llama3_8b_chat_brainstorm-GGUF on Orange Pi5 plus with llama.cpp",
        description="Brainstorm facilitates idea exploration through interaction with a Language Model (LLM). Rather than providing direct answers, the model engages in a dialogue with users, offering probing questions aimed at fostering deeper contemplation and consideration of various facets of their ideas.",
    )


# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()