# llm-api-latency / main.py
# (HF Space by pandada8; initial commit d6bbae3)
import pandas as pd
import gradio as gr
import plotly.express as px
import polars as pl
# Full benchmark dataset, loaded once at startup: one row per recorded trace
# with model/backend/prompt metadata, timing columns, and the raw streaming
# chunks (`raw_chunks` — a list of structs, see show_plot).
all_data = pl.read_parquet("data.parquet")
def show_plot(event: gr.SelectData, df):
    """Render latency plots and a per-chunk detail table for a clicked trace.

    Parameters
    ----------
    event : gr.SelectData
        Cell-select event from the traces DataFrame; ``event.index[0]`` is
        the clicked row number.
        NOTE(review): this indexes the *displayed* table — if the user has
        narrowed it via ``show_search='filter'`` the row number may not line
        up with ``all_data``; confirm against gradio's SelectData semantics.
    df :
        Current value of the DataFrame component (unused; kept because the
        component is wired as ``inputs=df``).

    Returns
    -------
    tuple
        (cumulative-latency scatter, inter-chunk-delta scatter,
        chunk-level detail table without the row-count column).
    """
    row = event.index[0]
    model = all_data[row, 'model']
    backend = all_data[row, 'backend']
    prompt = all_data[row, 'prompt']
    completion_tokens = all_data[row, 'completion_tokens']
    total_time_used = all_data[row, 'total_time_used']
    # raw_chunks is a list of anonymous structs: field '1' is an elapsed
    # time (/1e3 — presumably µs → ms, TODO confirm units), field '2' the
    # chunk payload. `diff` is the gap between consecutive chunks.
    chunks = (
        all_data[row].select(pl.col('raw_chunks').explode())
        .select(
            latency=pl.col('raw_chunks').struct.field('1') / 1e3,
            payload=pl.col('raw_chunks').struct.field('2'),
        )
        .with_columns(diff=pl.col('latency').diff())
        .with_row_count(name='n')
    )
    # tokens per second; *1e3 implies total_time_used is in ms (same
    # conversion as get_overview).
    tps = completion_tokens / total_time_used * 1e3
    return (
        px.scatter(chunks, x='latency', y='n',
                   title=f'{model} - {backend}<br><sup>{prompt}</sup>',
                   trendline='ols'),
        px.scatter(chunks, x='n', y='diff',
                   title=f'{model} - {backend} {tps:.2f} token/s <br><sup>{prompt}</sup>',
                   trendline='ols'),
        chunks.drop('n'),
    )
def get_overview(model):
    """Hourly throughput (tokens/s) line chart for *model*, one line per backend.

    Buckets each trace's timestamp to the hour and plots completion tokens
    divided by total time (×1e3 — total_time_used is presumably in ms).
    """
    subset = all_data.filter(pl.col('model') == model)
    subset = subset.with_columns(
        date=pl.col('datetime').dt.round('1h'),
        tps=pl.col('completion_tokens') / pl.col('total_time_used') * 1e3,
    )
    return px.line(subset, x='date', y='tps', color='backend', symbol='backend')
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    # Tab 1: pre-rendered per-model throughput-over-time charts.
    with gr.Tab("overview"):
        gr.Markdown("deepseek-v3 overview")
        gr.Plot(get_overview('deepseek-v3'))
        gr.Markdown('deepseek-r1 overview')
        gr.Plot(get_overview('deepseek-r1'))
    # Tab 2: trace browser — click a row to inspect its streaming chunks.
    with gr.Tab("Traces"):
        with gr.Row():
            # Read-only, searchable listing of all traces.
            df = gr.DataFrame(all_data.select('datetime', 'backend', 'model', 'prompt'), interactive=False, show_search='filter')
        with gr.Row():
            plot = gr.Plot()   # cumulative latency per chunk
            plot2 = gr.Plot()  # inter-chunk latency deltas
        with gr.Row():
            detail_df = gr.DataFrame()  # chunk-level detail table
        # Selecting a cell fills both plots and the detail table for that row.
        df.select(fn=show_plot, inputs=df, outputs=[plot, plot2, detail_df])
if __name__ == "__main__":
    demo.launch()