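"""Gradio chat app that streams replies from the OpenAI Chat Completions API.

The handler accumulates streamed chunks and re-yields the partial reply at
most once every 0.25 seconds to throttle UI updates, then appends first-chunk
and total latency figures to the finished message. The OpenAI API key is read
from the API_TOKEN environment variable.
"""
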
import gradio as gr
from openai import OpenAI
import os
import time

def predict(message, history, system_prompt, model, max_tokens, temperature, top_p):
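    """Stream a chat completion, yielding the partial reply as it grows.

    The extra parameters map to the additional_inputs of the ChatInterface
    defined at the bottom of the file.
    """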

    # Initialize the OpenAI client (the API key comes from the API_TOKEN env var)
    client = OpenAI(
        api_key=os.environ.get("API_TOKEN"),
    )

    # Start with the system prompt
    messages = [{"role": "system", "content": system_prompt}]

    # Add the conversation history: with type="messages" on the ChatInterface,
    # this is already a list of {"role": ..., "content": ...} dicts
    messages.extend(history or [])

    # Add the current user message
    messages.append({"role": "user", "content": message})

    # Record the start time
    start_time = time.time()

    # Streaming response
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=None,
        stream=True
    )

    full_message = ""
    first_chunk_time = None
    last_yield_time = None

    for chunk in response:
        if chunk.choices and chunk.choices[0].delta.content:
            if first_chunk_time is None:
                first_chunk_time = time.time() - start_time  # Record time for the first chunk

            full_message += chunk.choices[0].delta.content
            current_time = time.time()
            chunk_time = current_time - start_time  # calculate the time delay of the chunk
            print(f"Message received {chunk_time:.2f} seconds after request: {chunk.choices[0].delta.content}")  

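            # Throttle UI updates: re-yield the partial reply at most once every 0.25 s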
            if last_yield_time is None or (current_time - last_yield_time >= 0.25):
                yield full_message
                last_yield_time = current_time

    # Yield the final message, including any tail that arrived after the
    # last throttled yield
    if full_message:
        total_time = time.time() - start_time
        # Append timing information to the response message
        full_message += f" (First Chunk: {first_chunk_time:.2f}s, Total: {total_time:.2f}s)"
        yield full_message

gr.ChatInterface(
    fn=predict,
    type="messages",
    #save_history=True,
    #editable=True,
    additional_inputs=[
        gr.Textbox("You are a helpful AI assistant.", label="System Prompt"),
        gr.Dropdown(["gpt-4o", "gpt-4o-mini"], value="gpt-4o", label="Model"),
        gr.Slider(800, 4000, value=2000, label="Max Tokens"),
        gr.Slider(0, 1, value=0.7, label="Temperature"),
        gr.Slider(0, 1, value=0.95, label="Top P"),
    ],
    css="footer{display:none !important}"
).launch()
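
# Usage sketch (assumes this file is saved as app.py and that an OpenAI API
# key is exported as API_TOKEN):
#   export API_TOKEN=sk-...
#   python app.py
# Gradio serves the interface on http://127.0.0.1:7860 by default.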