File size: 4,893 Bytes
82130cb
 
5044033
 
82130cb
5044033
82130cb
 
a9d1d49
82130cb
a9d1d49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
943d243
 
 
 
 
 
 
a9d1d49
943d243
 
a9d1d49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82130cb
 
 
 
 
 
 
9cec719
a9d1d49
 
 
 
5044033
9cec719
a9d1d49
9cec719
 
5044033
 
 
 
f89cac3
82130cb
5ea3cc9
82130cb
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import plotly.express as px
import streamlit as st

from src.architectures import *
from src.st_helpers import st_setup
from src.testing import TestGroup


def show_stats(for_test_group: str) -> None:
    """
    Render the report for one test group: a heading, a one-line summary, four
    charts and the per-question answers given by every architecture.

    The display elements are structured as a set of nested functions, each of
    which takes the list of per-architecture stats dicts returned by
    ``summary_stats_by_arch()``.

    :param for_test_group: tag identifying the test group, passed to
        ``TestGroup.for_test_group_tag``
    """
    def show_elapsed_time_in_seconds_boxplot(stats):
        """Box plot of each architecture's elapsed times."""
        with st.expander("**Elapsed End to End Time (seconds)**"):
            # Values are divided by 1000 for the "seconds" axis
            # (presumably recorded in milliseconds - confirm against TestGroup)
            data = [[arch['arch_name'], e / 1000] for arch in stats for e in arch['elapsed']]
            df = pd.DataFrame(data, columns=['Architecture', 'Elapsed time'])
            fig = px.box(df, x="Architecture", y="Elapsed time")
            fig.update_xaxes(tickangle=-90)
            st.plotly_chart(fig, use_container_width=True)

    def show_response_length_boxplot(stats):
        """Box plot of each architecture's response lengths."""
        with st.expander("**Response length (count of characters)**"):
            data = [[arch['arch_name'], rl] for arch in stats for rl in arch['response_len']]
            df = pd.DataFrame(data, columns=['Architecture', 'Response length'])
            fig = px.box(df, x="Architecture", y="Response length")
            fig.update_xaxes(tickangle=-90)
            st.plotly_chart(fig, use_container_width=True)

    def show_elapsed_time_by_arch_step_stacked_bar(stats):
        """Stacked bar of the mean elapsed time of every step, one bar per architecture."""
        with st.expander("**Mean elapsed time by architecture step (seconds)**"):
            data = [
                [arch['arch_name'], step['step_name'], step['mean_elapsed'] / 1000]
                for arch in stats
                for step in arch['steps']
            ]
            df = pd.DataFrame(data, columns=['Architecture', 'Step', 'Mean elapsed time'])
            fig = px.bar(df, x='Architecture', y='Mean elapsed time', color='Step', barmode='stack')
            fig.update_xaxes(tickangle=-90)
            st.plotly_chart(fig, use_container_width=True)

    def show_time_vs_response_length_scatter_plot(stats):
        """Scatter of elapsed time against response length, with optional regression lines."""
        with st.expander("**Time by response length (seconds/char)**"):
            # elapsed and response_len are paired positionally; zip stops at the shorter list
            data = [
                [arch['arch_name'], elapsed / 1000, resp_len]
                for arch in stats
                for elapsed, resp_len in zip(arch['elapsed'], arch['response_len'])
            ]
            df = pd.DataFrame(data, columns=['Architecture', 'Elapsed time', 'Response length'])

            # Reserve the chart's slot before creating the checkbox so the
            # chart is drawn above the control that configures it
            chart_area = st.container()
            trend_options = {}
            if st.checkbox("Show regression lines"):
                trend_options = {'trendline': 'ols', 'trendline_color_override': 'red'}
            fig = px.scatter(df, x='Elapsed time', y='Response length', color='Architecture', **trend_options)
            fig.update_xaxes(tickangle=-90)
            with chart_area:
                st.plotly_chart(fig, use_container_width=True)

    def show_q_and_a_detail(stats):
        """One expander per question, listing the answer from every architecture."""
        # Union of the questions seen by any architecture, in first-seen order,
        # so a question is still listed when the first architecture lacks it
        questions = list(dict.fromkeys(q for arch in stats for q in arch['q_and_a']))
        num_archs = len(stats)
        with st.expander("**Request/Response Details**"):
            # Written to the page (not stdout) so the expander is not empty
            st.write(f'Displaying {len(questions)} questions and {num_archs} architectures')
        for q in questions:
            with st.expander(f"**{q}**"):
                for arch in stats:
                    st.divider()
                    if q not in arch['q_and_a']:
                        answer = "No answer - test run possibly interrupted"
                    else:
                        answer = arch['q_and_a'][q]
                    st.write(f"**{arch['arch_name']}**\n{answer}")

    test_group = TestGroup.for_test_group_tag(for_test_group)
    title = "No comment provided for group" if test_group.comment == "" else test_group.comment
    st.write(f"### {title}")
    st.write(f"Total of {test_group.num_tests} tests over {test_group.num_archs} architectures ({test_group.num_tests_per_arch} per architecture).")

    stats = test_group.summary_stats_by_arch()
    if not stats:
        # Nothing to chart or list; avoid building empty figures
        st.info("No results are available for this test group.")
        return

    st.write("#### Statistics")
    show_elapsed_time_in_seconds_boxplot(stats)
    show_response_length_boxplot(stats)
    show_elapsed_time_by_arch_step_stacked_bar(stats)
    show_time_vs_response_length_scatter_plot(stats)

    st.write("#### Question and answer details")
    show_q_and_a_detail(stats)


# Page body: runs only when st_setup returns a truthy value
# (NOTE(review): st_setup's exact contract is defined in src.st_helpers - confirm)
if st_setup('LLM Arch'):
    summary = st.container()
    with summary:
        st.write("# Test Reporter")

        TestGroup.load_all()

        # Left column holds the test-group picker, right column the report
        selector, display = st.columns([2, 3])

        with selector:
            # Descending by start (presumably most recent first)
            test_groups = sorted(TestGroup.all.values(), key=lambda tg: -tg.start)
            # Keep an explicit label -> tag mapping rather than parsing the tag
            # back out of the label, which breaks if a tag contains ":"
            tag_by_label = {
                f'{tg.test_group}: {tg.comment}': str(tg.test_group)
                for tg in test_groups
            }
            options = [f'{tg.test_group}: {tg.comment}' for tg in test_groups]
            if selected := st.radio('**Pick a test set to review**', options=options, index=None):
                with display:
                    show_stats(tag_by_label[selected])