File size: 9,206 Bytes
d1eeaf0
7a6b1cb
efb5c9e
 
7a6b1cb
efb5c9e
 
 
 
 
 
 
7a6b1cb
efb5c9e
 
8ca1ec6
 
7a6b1cb
 
 
efb5c9e
 
 
 
7a6b1cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efb5c9e
7a6b1cb
487b08a
7a6b1cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487b08a
7a6b1cb
 
 
 
 
efb5c9e
 
487b08a
7a6b1cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ca1ec6
 
efb5c9e
7a6b1cb
 
 
efb5c9e
7a6b1cb
efb5c9e
 
 
 
 
7a6b1cb
d1eeaf0
efb5c9e
 
 
 
 
7a6b1cb
d1eeaf0
efb5c9e
 
 
 
 
7a6b1cb
efb5c9e
 
d1eeaf0
efb5c9e
 
 
7a6b1cb
efb5c9e
 
d1eeaf0
efb5c9e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
from scripts.get_scores import load_counterfactual_robustness_scores, load_negative_rejection_scores, load_scores_common
from scripts.evaluate_information_integration import evaluate_information_integration
from scripts.evaluate_negative_rejection import evaluate_negative_rejection
from scripts.helper import initialize_logging, update_config, update_logs_periodically
from scripts.evaluate_noise_robustness import evaluate_noise_robustness
from scripts.evaluate_factual_robustness import evaluate_factual_robustness

Noise_Robustness_DIR = "results/Noise Robustness/"
Negative_Rejection_DIR = "results/Negative Rejection/"
Counterfactual_Robustness_DIR = "results/Counterfactual Robustness/"
Infomration_Integration_DIR = "results/Information Integration/"
 
# Gradio UI
def launch_gradio_app(config):
    initialize_logging()
    
    def toggle_switch(value):
        config['UsePreCalculatedValue'] = value

    with gr.Blocks() as app:
        app.title = "RAG System Evaluation"
        gr.Markdown("# RAG System Evaluation on RGB Dataset")

        # Add the description here
        gr.Markdown("""

        Welcome to the **RAG System Evaluation on RGB Dataset**! This tool is designed to evaluate and compare the performance of various **Large Language Models (LLMs)** using Retrieval-Augmented Generation (RAG) on the [**RGB dataset**](https://github.com/chen700564/RGB). The evaluation focuses on key metrics such as **Noise Robustness**, **Negative Rejection**, **Counterfactual Robustness**, and **Information Integration**. These metrics help assess how well different models handle noisy inputs, reject invalid queries, manage counterfactual scenarios, and integrate information effectively.



        #### Key Features:

        - **Compare Multiple LLMs**: Evaluate and compare the performance of different LLMs side by side.

        - **Pre-calculated Metrics**: View results from pre-computed evaluations for quick insights.

        - **Recalculate Metrics**: Option to recalculate metrics for custom configurations.

        - **Interactive Controls**: Adjust model parameters, noise rates, and query counts to explore model behavior under different conditions.

        - **Detailed Reports**: Visualize results in clear, interactive tables for each evaluation metric.



        #### How to Use:

        1. **Select a Model**: Choose from the available LLMs to evaluate.

        2. **Configure Model Settings**: Adjust the noise rate and set the number of queries.

        3. **Choose Evaluation Mode**: Use pre-calculated values for quick results or recalculate metrics for custom analysis.

        4. **Compare Results**: Review and compare the evaluation metrics across different models in the tables below.

        5. **Logs**: View live logs to monitor what's happening behind the scenes in real-time.

                    

        """)

        # Top Section - Inputs and Controls
        with gr.Accordion("Model Settings", open=False):
            with gr.Row():
                with gr.Column():
                    model_name_input = gr.Dropdown(
                        label="Model Name",
                        choices=config['models'],
                        value=config['models'][0],
                        interactive=True
                    )
                    noise_rate_input = gr.Slider(
                        label="Noise Rate",
                        minimum=0,
                        maximum=1.0,
                        step=0.2,
                        value=config['noise_rate'],
                        interactive=True
                    )
                    num_queries_input = gr.Number(
                        label="Number of Queries",
                        value=config['num_queries'],
                        interactive=True
                    )
                with gr.Column():
                    toggle = gr.Checkbox(
                        label="Use pre-calculated values?",
                        value=True,
                        info="If checked, the report(s) will use pre-calculated metrics from saved output files. If any report has N/A value, Click on respective report generation button to generate value based on configuration. Uncheck to recalculate the metrics again."
                    )
                    refresh_btn = gr.Button("Refresh", variant="primary", scale= 0)
        # Next Section - Action Buttons
        with gr.Accordion("Evaluation Actions", open=False):
            with gr.Row():
                recalculate_noise_btn = gr.Button("Evaluate Noise Robustness")
                recalculate_negative_btn = gr.Button("Evaluate Negative Rejection")
                recalculate_counterfactual_btn = gr.Button("Evaluate Counterfactual Robustness")
                recalculate_integration_btn = gr.Button("Evaluate Integration Information")

        # Middle Section - Data Tables
        with gr.Accordion("Evaluation Results", open=True):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### πŸ“Š Noise Robustness\n**Description:** The experimental result of noise robustness measured by accuracy (%) under different noise ratios.")
                    noise_table = gr.Dataframe(value=load_scores_common(Noise_Robustness_DIR, config), interactive=False)
                with gr.Column():
                    gr.Markdown("### 🚫 Negative Rejection\n**Description:** This measures the model's ability to reject invalid or nonsensical queries.")
                    rejection_table = gr.Dataframe(value=load_negative_rejection_scores(config), interactive=False)
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### πŸ”„ Counterfactual Robustness\n**Description:** Evaluates a model's ability to handle errors in external knowledge.")
                    counter_factual_table = gr.Dataframe(value=load_counterfactual_robustness_scores(config), interactive=False)
                with gr.Column():
                    gr.Markdown("### 🧠 Information Integration\n**Description:** The experimental result of information integration measured by accuracy (%) under different noise ratios.")
                    integration_table = gr.Dataframe(value=load_scores_common(Infomration_Integration_DIR, config), interactive=False)

        # Logs Section
        with gr.Accordion("View Live Logs", open=False):
            log_section = gr.Textbox(label="Logs", interactive=False, lines=10, every=2)

        # Event Handling
        toggle.change(toggle_switch, inputs=toggle)
        app.queue() 
        app.load(update_logs_periodically, outputs=log_section)
        # Refresh Scores Function
        def refresh_scores(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            return load_scores_common(Noise_Robustness_DIR, config), load_negative_rejection_scores(config), load_counterfactual_robustness_scores(config), load_scores_common(Infomration_Integration_DIR, config)

        refresh_btn.click(refresh_scores, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[noise_table, rejection_table, counter_factual_table, integration_table])

        # Button Functions
        def recalculate_noise_robustness(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_noise_robustness(config)
            return load_scores_common(Noise_Robustness_DIR, config)
        
        recalculate_noise_btn.click(recalculate_noise_robustness, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[noise_table])

        def recalculate_counterfactual_robustness(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_factual_robustness(config)
            return load_counterfactual_robustness_scores(config)
        
        recalculate_counterfactual_btn.click(recalculate_counterfactual_robustness, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[counter_factual_table])

        def recalculate_negative_rejection(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_negative_rejection(config)
            return load_negative_rejection_scores(config)
        
        recalculate_negative_btn.click(recalculate_negative_rejection, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[rejection_table])

        def recalculate_integration_info(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_information_integration(config)
            return load_scores_common(Infomration_Integration_DIR, config)
        
        recalculate_integration_btn.click(recalculate_integration_info , inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[integration_table])

    app.launch()