llm-arch / pages /040_Test_Reporter.py
alfraser's picture
Moved the trace reload behind the admin screen and login
f89cac3
raw
history blame
4.6 kB
import pandas as pd
import plotly.express as px
import streamlit as st
from src.architectures import *
from src.st_helpers import st_setup
from src.testing import TestGroup
def show_stats(for_test_group: str):
    """
    Render the full report for one test group.

    Writes a title (the group's comment, or a placeholder when there is none),
    a one-line summary of the test counts, four collapsible charts built from
    the per-architecture summary statistics, and finally the answer each
    architecture gave to every question.

    :param for_test_group: tag identifying the test group; it is resolved with
        TestGroup.for_test_group_tag.
    """

    def render_chart(title, rows, columns, make_figure):
        """
        Shared chart plumbing: load the rows into a DataFrame, build the
        figure with make_figure(df), rotate the x-axis tick labels and draw
        the result inside an expander labelled with title.
        """
        with st.expander(title):
            df = pd.DataFrame(rows, columns=columns)
            fig = make_figure(df)
            fig.update_xaxes(tickangle=-90)
            st.plotly_chart(fig, use_container_width=True)

    def show_elapsed_time_in_seconds_boxplot(stats):
        """Box plot of every recorded end-to-end elapsed time, per architecture."""
        # The chart is labelled in seconds, so the raw values are presumably
        # milliseconds - TODO confirm against the TestGroup statistics.
        rows = [
            [arch['arch_name'], elapsed / 1000]
            for arch in stats
            for elapsed in arch['elapsed']
        ]
        render_chart(
            "**Elapsed End to End Time (seconds)**",
            rows,
            ['Architecture', 'Elapsed time'],
            lambda df: px.box(df, x="Architecture", y="Elapsed time"),
        )

    def show_response_length_boxplot(stats):
        """Box plot of every recorded response length, per architecture."""
        rows = [
            [arch['arch_name'], response_len]
            for arch in stats
            for response_len in arch['response_len']
        ]
        render_chart(
            "**Response length (count of characters)**",
            rows,
            ['Architecture', 'Response length'],
            lambda df: px.box(df, x="Architecture", y="Response length"),
        )

    def show_elapsed_time_by_arch_step_stacked_bar(stats):
        """Stacked bar of the mean elapsed time of each step, per architecture."""
        rows = [
            [arch['arch_name'], step['step_name'], step['mean_elapsed'] / 1000]
            for arch in stats
            for step in arch['steps']
        ]
        render_chart(
            "**Mean elapsed time by architecture step (seconds)**",
            rows,
            ['Architecture', 'Step', 'Mean elapsed time'],
            lambda df: px.bar(df, x='Architecture', y='Mean elapsed time', color='Step', barmode='stack'),
        )

    def show_time_vs_response_length_scatter_plot(stats):
        """Scatter of elapsed time against response length, coloured by architecture."""
        # elapsed and response_len are paired positionally; zip stops at the
        # shorter of the two lists.
        rows = [
            [arch['arch_name'], elapsed / 1000, resp_len]
            for arch in stats
            for elapsed, resp_len in zip(arch['elapsed'], arch['response_len'])
        ]
        render_chart(
            "**Time by response length (seconds/char)**",
            rows,
            ['Architecture', 'Elapsed time', 'Response length'],
            lambda df: px.scatter(df, x='Elapsed time', y='Response length', color='Architecture'),
        )

    def show_q_and_a_detail(stats):
        """
        List every question with the answer each architecture gave to it.

        An architecture without an answer for a question gets a placeholder
        line instead. Nothing is listed when stats is empty.
        """
        # Collect the questions from every architecture (first-seen order,
        # no duplicates) rather than only the first one, so an empty stats
        # list or an incomplete first architecture cannot hide questions.
        questions = list(dict.fromkeys(q for arch in stats for q in arch['q_and_a']))
        num_archs = len(stats)
        # NOTE(review): the per-question expanders are nested inside this
        # outer expander; older Streamlit releases raise on nested expanders -
        # confirm the deployed version allows it.
        with st.expander("**Request/Response Details**"):
            print(f'Displaying {len(questions)} questions and {num_archs} architectures')
            for q in questions:
                with st.expander(f"**{q}**"):
                    for arch in stats:
                        st.divider()
                        answer = arch['q_and_a'].get(q, "No answer - test run possibly interrupted")
                        st.write(f"**{arch['arch_name']}**\n{answer}")

    test_group = TestGroup.for_test_group_tag(for_test_group)
    # Fall back to the placeholder for any empty/missing comment, not only "".
    title = test_group.comment or "No comment provided for group"
    st.write(f"### {title}")
    st.write(f"Total of {test_group.num_tests} tests over {test_group.num_archs} architectures ({test_group.num_tests_per_arch} per architecture).")

    stats = test_group.summary_stats_by_arch()
    st.write("#### Statistics")
    show_elapsed_time_in_seconds_boxplot(stats)
    show_response_length_boxplot(stats)
    show_elapsed_time_by_arch_step_stacked_bar(stats)
    show_time_vs_response_length_scatter_plot(stats)
    st.write("#### Question and answer details")
    show_q_and_a_detail(stats)
# Page body: rendered only when st_setup returns a truthy value.
if st_setup('LLM Arch'):
    summary = st.container()
    with summary:
        st.write("# Test Reporter")
        TestGroup.load_all()
        selector, display = st.columns([2, 3])

        with selector:
            # Largest start value first; reverse=True keeps the relative
            # order of groups whose start values are equal.
            test_groups = sorted(TestGroup.all.values(), key=lambda tg: tg.start, reverse=True)
            # Map each tag to its display label. The radio works on the tags
            # themselves, so the chosen tag never has to be parsed back out of
            # the label (which would truncate a tag containing ":").
            labels = {str(tg.test_group): f'{tg.test_group}: {tg.comment}' for tg in test_groups}
            selected = st.radio(
                '**Pick a test set to review**',
                options=list(labels),
                format_func=labels.get,
                index=None,  # nothing is pre-selected
            )

        # st.radio returns None until the user picks an option.
        if selected is not None:
            with display:
                show_stats(selected)