"""Streamlit page that lets the user pick a test group and review its results."""
import pandas as pd
import plotly.express as px
import streamlit as st

# Star import retained from the original file; it is not visible from here which
# names (or import side effects) the rest of the project relies on.
from src.architectures import *  # noqa: F401,F403
from src.st_helpers import st_setup
from src.testing import TestGroup


def show_stats(for_test_group: str) -> None:
    """
    Render the report for a single test group.

    The group is looked up with ``TestGroup.for_test_group_tag``; its comment and
    test counts are written first, followed by the statistics charts and the
    question/answer detail. Each nested function below renders one display element
    from ``stats``, the value returned by ``summary_stats_by_arch()``.

    :param for_test_group: tag of the test group to report on
    """

    def render_chart(fig):
        """Rotate the x axis tick labels and draw the figure at container width."""
        fig.update_xaxes(tickangle=-90)
        st.plotly_chart(fig, use_container_width=True)

    def show_elapsed_time_in_seconds_boxplot(stats):
        """Box plot of the per-test elapsed values, one box per architecture."""
        with st.expander("**Elapsed End to End Time (seconds)**"):
            # Raw values are divided by 1000 to match the "seconds" title
            # (presumably they are recorded in milliseconds - confirm in TestGroup).
            rows = [
                [arch['arch_name'], elapsed / 1000]
                for arch in stats
                for elapsed in arch['elapsed']
            ]
            df = pd.DataFrame(rows, columns=['Architecture', 'Elapsed time'])
            render_chart(px.box(df, x="Architecture", y="Elapsed time"))

    def show_response_length_boxplot(stats):
        """Box plot of the per-test response lengths, one box per architecture."""
        with st.expander("**Response length (count of characters)**"):
            rows = [
                [arch['arch_name'], response_len]
                for arch in stats
                for response_len in arch['response_len']
            ]
            df = pd.DataFrame(rows, columns=['Architecture', 'Response length'])
            render_chart(px.box(df, x="Architecture", y="Response length"))

    def show_elapsed_time_by_arch_step_stacked_bar(stats):
        """Stacked bar of each step's mean elapsed value, one bar per architecture."""
        with st.expander("**Mean elapsed time by architecture step (seconds)**"):
            rows = [
                [arch['arch_name'], step['step_name'], step['mean_elapsed'] / 1000]
                for arch in stats
                for step in arch['steps']
            ]
            df = pd.DataFrame(rows, columns=['Architecture', 'Step', 'Mean elapsed time'])
            render_chart(px.bar(df, x='Architecture', y='Mean elapsed time', color='Step', barmode='stack'))

    def show_time_vs_response_length_scatter_plot(stats):
        """Scatter of elapsed value against response length, coloured by architecture."""
        with st.expander("**Time by response length (seconds/char)**"):
            # zip pairs the n-th elapsed value with the n-th response length.
            rows = [
                [arch['arch_name'], elapsed / 1000, response_len]
                for arch in stats
                for elapsed, response_len in zip(arch['elapsed'], arch['response_len'])
            ]
            df = pd.DataFrame(rows, columns=['Architecture', 'Elapsed time', 'Response length'])

            # The container is created before the checkbox so that the chart is
            # drawn above the checkbox that controls it.
            chart_area = st.container()
            trendline_options = {}
            if st.checkbox("Show regression lines"):
                trendline_options = {'trendline': 'ols', 'trendline_color_override': 'red'}
            fig = px.scatter(df, x='Elapsed time', y='Response length', color='Architecture',
                             **trendline_options)
            with chart_area:
                render_chart(fig)

    def show_q_and_a_detail(stats):
        """Show, for every question, the answer recorded for each architecture."""
        # Collect the questions from every architecture (first-seen order), not just
        # the first one: this copes with an empty ``stats`` and still lists questions
        # that the first architecture never answered.
        questions = list(dict.fromkeys(q for arch in stats for q in arch['q_and_a']))
        num_archs = len(stats)
        # NOTE(review): the per-question expanders are deliberately siblings of this
        # expander rather than children - Streamlit has documented nested expanders
        # as unsupported. As written this expander therefore only wraps the console
        # print; confirm whether it should hold visible content.
        with st.expander("**Request/Response Details**"):
            print(f'Displaying {len(questions)} questions and {num_archs} architectures')
        for q in questions:
            with st.expander(f"**{q}**"):
                for arch in stats:
                    st.divider()
                    answer = arch['q_and_a'].get(q, "No answer - test run possibly interrupted")
                    st.write(f"**{arch['arch_name']}**\n{answer}")

    test_group = TestGroup.for_test_group_tag(for_test_group)
    title = "No comment provided for group" if test_group.comment == "" else test_group.comment
    st.write(f"### {title}")
    st.write(f"Total of {test_group.num_tests} tests over {test_group.num_archs} architectures ({test_group.num_tests_per_arch} per architecture).")
    stats = test_group.summary_stats_by_arch()

    st.write("#### Statistics")
    show_elapsed_time_in_seconds_boxplot(stats)
    show_response_length_boxplot(stats)
    show_elapsed_time_by_arch_step_stacked_bar(stats)
    show_time_vs_response_length_scatter_plot(stats)

    st.write("#### Question and answer details")
    show_q_and_a_detail(stats)


if st_setup('LLM Arch'):
    summary = st.container()
    with summary:
        st.write("# Test Reporter")
    TestGroup.load_all()
    selector, display = st.columns([2, 3])
    with selector:
        test_groups = list(TestGroup.all.values())
        # Negated key: largest ``start`` first.
        test_groups.sort(key=lambda x: -x.start)
        options = [f'{tg.test_group}: {tg.comment}' for tg in test_groups]
        # index=None means nothing is preselected, so the report is only drawn
        # once the user has picked a group.
        if selected := st.radio('**Pick a test set to review**', options=options, index=None):
            with display:
                # The option label is "<tag>: <comment>"; the tag is the text
                # before the first colon.
                show_stats(selected.partition(":")[0])