Spaces:

artificialguybr
/

LLAMA-2-70B-FREE-DEMO

Running

File size: 2,805 Bytes

c551206
 
020a962
1df13e1
c551206
8cd9af7
 
c551206
1df13e1
c551206
 
 
 
 
 
 
 
9faed3d
fc26c64
 
14126e6
fc26c64
8cd9af7
c551206
a9af4d7
c551206
 
 
 
 
85dbf4a
c551206
 
0e16686
c551206
 
 
0e16686
cb4c132
0e16686
8c77830
cb4c132
9faed3d
a414401
9faed3d
8c77830
9faed3d
 
 
 
9809955
fc26c64
 
 
 
9faed3d
c551206
9faed3d
 
 
fc26c64
 
9faed3d
fc26c64
 
cb4c132
 
fc26c64

import gradio as gr
import requests
import os
import json

# API token read from the environment. os.getenv returns None when the
# variable is unset, in which case the Authorization header below is built
# as the literal string "Bearer None".
API_KEY = os.getenv('API_KEY')
# URL that call_nvidia_api POSTs the chat payload to.
INVOKE_URL = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/0e349b44-440a-44e1-93e9-abe8dcb27158"
# URL prefix used while a request is still pending (HTTP 202). Despite the
# "_FORMAT" suffix this is not a format string: the request id is appended
# to it by plain concatenation.
FETCH_URL_FORMAT = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/"

# HTTP headers sent with both the initial POST and every status poll.
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Accept": "application/json",
    "Content-Type": "application/json",
}

# Default value pre-filled into the "System Message" textbox.
BASE_SYSTEM_MESSAGE = "I carefully provide accurate, factual, thoughtful, nuanced answers and am brilliant at reasoning."

# Upper bound (seconds) for each individual HTTP call so a stalled connection
# cannot hang the worker forever.
# NOTE(review): assumed to be generous for this endpoint; raise it if the
# service legitimately holds a single request open for longer.
_REQUEST_TIMEOUT_SECONDS = 120


def _build_messages(message, history_api, system_message):
    """Return the chat transcript in chronological order for the API payload."""
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for turn in history_api or []:
        user_text, assistant_text = turn[0], turn[1]
        # Skip halves of a turn that carry no text (e.g. a reply that was
        # never produced) rather than sending a null "content".
        if user_text is not None:
            messages.append({"role": "user", "content": user_text})
        if assistant_text is not None:
            messages.append({"role": "assistant", "content": assistant_text})
    # The new user message goes last so the model answers it, not an old turn.
    messages.append({"role": "user", "content": message})
    return messages


def call_nvidia_api(message, history_api, system_message, max_tokens, temperature, top_p):
    """Send one chat turn to the NVCF endpoint and return the reply text.

    The prompt is assembled in conversational order: the optional system
    message, then every prior exchange from ``history_api``, and finally the
    new user ``message``.

    Args:
        message: The new user message to answer.
        history_api: Prior exchanges as ``[user_text, assistant_text]``
            pairs; ``None`` or empty means no history.
        system_message: System prompt; left out of the payload when empty.
        max_tokens: Forwarded unchanged as ``max_tokens``.
        temperature: Forwarded unchanged as ``temperature``.
        top_p: Forwarded unchanged as ``top_p``.

    Returns:
        The ``content`` of the first entry in the response's ``choices``, or
        a fixed apology string when the response has no ``choices``.

    Raises:
        requests.HTTPError: If the final response has an error status.
        requests.RequestException: On network failures or timeouts.
        RuntimeError: If a 202 response carries no ``NVCF-REQID`` header,
            which makes polling for the result impossible.
    """
    payload = {
        "messages": _build_messages(message, history_api, system_message),
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": False
    }

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        response = session.post(
            INVOKE_URL,
            headers=headers,
            json=payload,
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        # 202 means the result is not ready yet: keep asking the status
        # endpoint, identified by the request id from the response headers.
        while response.status_code == 202:
            request_id = response.headers.get("NVCF-REQID")
            if not request_id:
                raise RuntimeError(
                    "NVCF returned 202 without an NVCF-REQID header; cannot poll for the result."
                )
            response = session.get(
                FETCH_URL_FORMAT + request_id,
                headers=headers,
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )
        response.raise_for_status()
        response_body = response.json()

    choices = response_body.get("choices")
    if not choices:
        return "Desculpe, ocorreu um erro ao gerar a resposta."
    return choices[0]["message"]["content"]

def chatbot_function(message, history_api, system_message, max_tokens, temperature, top_p):
    """Answer ``message`` and record the exchange in ``history_api``.

    Delegates to ``call_nvidia_api`` with every argument passed through
    unchanged, then appends ``[message, reply]`` to the ``history_api`` list
    that was passed in. The caller's list is modified in place.

    Returns:
        A ``(reply, history_api)`` tuple, where ``history_api`` is the same
        list object that was received, now including the new exchange.
    """
    reply = call_nvidia_api(
        message=message,
        history_api=history_api,
        system_message=system_message,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    latest_turn = [message, reply]
    history_api.append(latest_turn)
    return reply, history_api

# Extra controls shown alongside the chat box. Their current values are passed
# to the chat callback after (message, history), in the order they are listed
# in ``additional_inputs`` below.
system_msg = gr.Textbox(value=BASE_SYSTEM_MESSAGE, label="System Message", placeholder="System prompt.", lines=5)
max_tokens = gr.Slider(minimum=20, maximum=1024, label="Max Tokens", step=20, value=1024)
temperature = gr.Slider(minimum=0.0, maximum=1.0, label="Temperature", step=0.1, value=0.2)
top_p = gr.Slider(minimum=0.0, maximum=1.0, label="Top P", step=0.05, value=0.7)


def _respond(message, history, system_message, max_tokens, temperature, top_p):
    """Adapt ``chatbot_function`` to the callback shape ChatInterface expects.

    ChatInterface keeps the conversation history itself and expects only the
    reply from its callback. ``chatbot_function`` instead returns a
    ``(reply, history)`` tuple and appends to the list it receives, so it is
    given a copy of the history and only the reply is returned.

    NOTE(review): assumes ``history`` arrives as ``[user, assistant]`` pairs,
    which is what ``call_nvidia_api`` indexes into; confirm against the
    installed Gradio version.
    """
    reply, _ = chatbot_function(
        message, list(history or []), system_message, max_tokens, temperature, top_p
    )
    return reply


with gr.Blocks() as demo:
    # ``inputs=`` / ``outputs=`` belong to gr.Interface, not gr.ChatInterface;
    # the extra controls are wired through ``additional_inputs`` only. No
    # separate gr.State is created because ChatInterface tracks history.
    chat_interface = gr.ChatInterface(
        fn=_respond,
        title="LLAMA 70B Free Demo",
        description="Explore the capabilities of LLAMA 2 70B",
        additional_inputs=[system_msg, max_tokens, temperature, top_p],
    )

demo.launch()