"""Gradio chat demo that streams replies from an OpenAI-style chat endpoint.

The endpoint base URL and bearer token are read from the ``API_URL`` and
``API_KEY`` environment variables. Each chat turn is sent to
``{API_URL}/v1/chat/completions`` with ``stream=True`` and the reply is
rendered incrementally in a ``gr.Chatbot``.
"""

import json
import os
import time

import gradio as gr
import requests

# NOTE: MODEL is defined but is not included in the request payload below.
MODEL = "gpt-4-0125-preview"
API_URL = os.getenv("API_URL")
API_KEY = os.getenv("API_KEY")

print(f"API_URL: {API_URL}")
# Never echo the bearer token itself; only report whether it is configured.
print(f"API_KEY: {'<set>' if API_KEY else '<not set>'}")

url = f"{API_URL}/v1/chat/completions"

# The headers for the HTTP request
headers = {
    "accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}",
}

DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."
SSE_DATA_PREFIX = "data:"
# (connect, read) timeouts in seconds. The read timeout applies between
# streamed chunks, so it is generous to tolerate a slow first token.
REQUEST_TIMEOUT = (10, 300)


def is_valid_json(data):
    """Try to parse *data* as JSON.

    Args:
        data: Text (or bytes) expected to contain one JSON document.

    Returns:
        ``(True, parsed_value)`` when parsing succeeds, otherwise
        ``(False, error_message)``.
    """
    try:
        return True, json.loads(data)
    except (TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError; TypeError covers non-text
        # input such as None.
        return False, str(e)


def _strip_sse_prefix(line):
    """Return *line* without a leading ``data:`` marker, if it has one."""
    # str.lstrip("data: ") would strip a *set of characters*, not a prefix,
    # so the prefix is removed explicitly.
    if line.startswith(SSE_DATA_PREFIX):
        return line[len(SSE_DATA_PREFIX):].lstrip()
    return line


def _build_messages(history, system_prompt):
    """Convert chatbot ``[user, assistant]`` pairs into role-tagged messages."""
    messages = [
        {"content": system_prompt or DEFAULT_SYSTEM_PROMPT, "role": "system"}
    ]
    for turn in history:
        if turn[0]:
            messages.append({"content": turn[0], "role": "user"})
        # The pending turn has no assistant text yet and is skipped here.
        if turn[1]:
            messages.append({"content": turn[1], "role": "assistant"})
    return messages


def _extract_delta(chunk):
    """Return ``choices[0].delta.content`` from a parsed chunk, or ``""``."""
    try:
        return chunk.get("choices", [{}])[0].get("delta", {}).get("content", "") or ""
    except (AttributeError, IndexError, KeyError, TypeError):
        # Chunk is not shaped like a chat-completion delta (e.g. an empty
        # "choices" list or a non-object JSON value); treat it as no content.
        return ""


def update_globals(
    system_prompt, temperature, max_new_tokens, top_p, top_k, repetition_penalty
):
    """Copy the given generation settings into module-level globals.

    This function is not wired to any event in this file.
    """
    global global_system_prompt, global_temperature, global_max_new_tokens
    global global_top_p, global_repetition_penalty, global_top_k
    global_system_prompt = system_prompt
    global_temperature = temperature
    global_max_new_tokens = max_new_tokens
    global_top_p = top_p
    global_top_k = top_k
    global_repetition_penalty = repetition_penalty


def user(user_message, history):
    """Append the submitted message as a new pending turn.

    Returns:
        ``("", new_history)`` - the empty string clears the textbox and the
        new history ends with ``[user_message, None]``.
    """
    return "", (history or []) + [[user_message, None]]


def bot(
    history,
    system_prompt,
    temperature,
    max_new_tokens,
    top_p,
    top_k,
    repetition_penalty,
):
    """Stream the assistant reply for the last turn in *history*.

    Args:
        history: List of ``[user, assistant]`` pairs; the last pair is the
            pending turn whose assistant slot is filled in here.
        system_prompt: System message; a default is used when empty.
        temperature, max_new_tokens, top_p, top_k, repetition_penalty:
            Sampling settings forwarded in the request payload.

    Yields:
        The updated *history* after each received content fragment.

    Raises:
        gr.Error: If the HTTP request fails or returns an error status.
    """
    print(f"History in bot: {history}")
    print(f"System Prompt: {system_prompt}")
    print(f"Temperature: {temperature}")
    print(f"Max New Tokens: {max_new_tokens}")
    print(f"Top P: {top_p}")
    print(f"Top K: {top_k}")
    print(f"Repetition Penalty: {repetition_penalty}")

    if not history:
        # Nothing to answer (e.g. the chat was cleared); avoid history[-1].
        yield history or []
        return

    history_messages = _build_messages(history, system_prompt)
    history[-1][1] = ""
    print(history_messages)

    payload = {
        "messages": history_messages,
        "stream": True,
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "seed": 42,
        "repeat_penalty": repetition_penalty,
        "chat_format": "mistral-instruct",
        "max_tokens": max_new_tokens,
        "response_format": {
            "type": "json_object",
        },
    }

    try:
        # The context manager releases the connection even if the client
        # disconnects and this generator is closed mid-stream.
        with requests.post(
            url,
            headers=headers,
            data=json.dumps(payload),
            stream=True,
            timeout=REQUEST_TIMEOUT,
        ) as response:
            response.raise_for_status()
            for raw_line in response.iter_lines():
                # Filter out keep-alive new lines
                if not raw_line:
                    continue
                text = _strip_sse_prefix(raw_line.decode("utf-8"))
                is_json, parsed = is_valid_json(text)
                if not is_json:
                    # Non-JSON lines (such as a stream terminator) are skipped.
                    continue
                delta_content = _extract_delta(parsed)
                if delta_content:
                    history[-1][1] += delta_content
                    # Small pause between UI updates.
                    time.sleep(0.05)
                    yield history
    except requests.RequestException as exc:
        raise gr.Error(f"Request to the model backend failed: {exc}") from exc


with gr.Blocks() as demo:
    markup = gr.Markdown(
        """
        # Mistral 7B Instruct v0.2

        This is a demo of the Mistral 7B Instruct quantized model in GGUF (Q2) hosted on K8s cluster.

        The original models can be found [MaziyarPanahi/Mistral-7B-Instruct-v0.2-GGUF](https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.2-GGUF)"""
    )
    chatbot = gr.Chatbot()
    msg = gr.Textbox(lines=1, label="User Message")
    clear = gr.Button("Clear")

    with gr.Row():
        with gr.Column(scale=2):
            # Define inputs for additional parameters
            system_prompt_input = gr.Textbox(
                label="System Prompt",
                placeholder="Type system prompt here...",
                value="You are a helpful assistant.",
            )
            temperature_input = gr.Slider(
                label="Temperature", minimum=0.0, maximum=1.0, value=0.9, step=0.01
            )
            max_new_tokens_input = gr.Slider(
                label="Max New Tokens", minimum=0, maximum=1024, value=256, step=1
            )
        with gr.Column(scale=2):
            top_p_input = gr.Slider(
                label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.01
            )
            top_k_input = gr.Slider(
                label="Top K", minimum=1, maximum=100, value=50, step=1
            )
            repetition_penalty_input = gr.Slider(
                label="Repetition Penalty",
                minimum=1.0,
                maximum=2.0,
                value=1.1,
                step=0.01,
            )

    # First record the user's turn, then stream the assistant's reply into it.
    msg.submit(
        user, [msg, chatbot], [msg, chatbot], queue=True, concurrency_limit=10
    ).then(
        bot,
        inputs=[
            chatbot,
            system_prompt_input,
            temperature_input,
            max_new_tokens_input,
            top_p_input,
            top_k_input,
            repetition_penalty_input,
        ],
        outputs=chatbot,
    )

    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue(default_concurrency_limit=20, max_size=20, api_open=False)

if __name__ == "__main__":
    demo.launch(show_api=False, share=False)