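"""Streamlit demo page for visualizing model evaluation results and analysis."""
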
import streamlit as st
from my_model.results.demo import ResultDemonstrator
from my_model.config import evaluation_config as config


def run_demo() -> None:
    """
    Run the interactive Streamlit demo for visualizing model evaluation results and analysis.

    This function initializes the ResultDemonstrator class and sets up an interactive interface
    where users can choose to view either evaluation results & analysis or evaluation samples.
    Based on the user's selection, different aspects of the evaluation are displayed, such as
    main & ablation results, results per question category, or the impact of prompt length on performance.
    
    Returns:
        None
    """

    demo = ResultDemonstrator()  # Instantiate the ResultDemonstrator class
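    # Two-column layout: a narrow controls column (col1) beside a wide display column (col2)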
    col1, col2 = st.columns([1, 4])
    with col1:
        # User selects the evaluation analysis aspect
        section_type = st.radio("Select Evaluation Aspect", ["Evaluation Results & Analysis", "Evaluation Samples"])
        # Only show analysis type if the section type is "Evaluation Results & Analysis"
        if section_type == "Evaluation Results & Analysis":
            analysis_type = st.radio("Select Type", ["Main & Ablation Results", "Results per Question Category",
                                                     "Prompt Length (token count) Impact on Performance"], index=2)
            if analysis_type == "Prompt Length (token count) Impact on Performance":
                # Based on the selection, other options appear
                model_name = st.radio("Select Model Size", config.MODEL_NAMES)
                score_name = st.radio("Select Score Type", ["VQA Score", "Exact Match"])
        elif section_type == "Evaluation Samples":
            samples_button = st.button("Generate Random Samples")
    with col2:
        if section_type == "Evaluation Results & Analysis":
            if analysis_type == "Prompt Length (token count) Impact on Performance":
                for conf in config.MODEL_CONFIGURATIONS:
                    with st.expander(conf):
                        demo.plot_token_count_vs_scores(conf, model_name, score_name)
            elif analysis_type == "Main & Ablation Results":
                demo.display_main_results()
            elif analysis_type == "Results per Question Category":
                demo.display_ablation_results_per_question_category()
        elif section_type == "Evaluation Samples":
            if samples_button:
                demo.show_samples(3)
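

# Minimal entry-point sketch, assuming this module is launched directly with
# `streamlit run`; in the full project it may instead be imported and invoked
# from a parent application.
if __name__ == "__main__":
    run_demo()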