import gradio as gr

from scripts.get_scores import load_counterfactual_robustness_scores, load_negative_rejection_scores, load_scores_common
from scripts.evaluate_information_integration import evaluate_information_integration
from scripts.evaluate_negative_rejection import evaluate_negative_rejection
from scripts.evaluate_noise_robustness import evaluate_noise_robustness
from scripts.evaluate_factual_robustness import evaluate_factual_robustness
from scripts.helper import initialize_logging, update_config, update_logs_periodically

# Directories holding the saved result files that the score loaders read.
Noise_Robustness_DIR = "results/Noise Robustness/"
Negative_Rejection_DIR = "results/Negative Rejection/"
Counterfactual_Robustness_DIR = "results/Counterfactual Robustness/"
Information_Integration_DIR = "results/Information Integration/"
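
# The `config` dict passed to launch_gradio_app is assumed to provide at least
# the keys read below: 'models' (list of model names for the dropdown),
# 'noise_rate' (initial slider value), 'num_queries' (initial query count),
# and 'UsePreCalculatedValue' (written by the checkbox toggle). See the
# illustrative entry point at the bottom of this file.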


def launch_gradio_app(config):
    """Build and launch the Gradio app for RAG evaluation on the RGB dataset."""
    initialize_logging()

    def toggle_switch(value):
        # Remember whether the loaders should reuse pre-calculated scores
        # from the results directories or recompute them.
        config['UsePreCalculatedValue'] = value

    with gr.Blocks(title="RAG System Evaluation") as app:
        gr.Markdown("# RAG System Evaluation on RGB Dataset")

        gr.Markdown("""
Welcome to the **RAG System Evaluation on RGB Dataset**! This tool evaluates and compares the performance of various **Large Language Models (LLMs)** using Retrieval-Augmented Generation (RAG) on the [**RGB dataset**](https://github.com/chen700564/RGB). The evaluation focuses on four metrics: **Noise Robustness**, **Negative Rejection**, **Counterfactual Robustness**, and **Information Integration**. Together they assess how well a model handles noisy retrieved documents, declines to answer when retrieval provides no relevant evidence, copes with factual errors in retrieved knowledge, and integrates information from multiple documents.

#### Key Features:
- **Compare Multiple LLMs**: Evaluate and compare the performance of different LLMs side by side.
- **Pre-calculated Metrics**: View results from pre-computed evaluations for quick insights.
- **Recalculate Metrics**: Optionally recalculate metrics for custom configurations.
- **Interactive Controls**: Adjust the model, noise rate, and query count to explore behavior under different conditions.
- **Detailed Reports**: Visualize results in clear, interactive tables for each evaluation metric.

#### How to Use:
1. **Select a Model**: Choose from the available LLMs to evaluate.
2. **Configure Model Settings**: Adjust the noise rate and set the number of queries.
3. **Choose Evaluation Mode**: Use pre-calculated values for quick results or recalculate metrics for custom analysis.
4. **Compare Results**: Review and compare the evaluation metrics across models in the tables below.
5. **Logs**: Watch the live logs to monitor what is happening behind the scenes in real time.
""")
with gr.Accordion("Model Settings", open=False):
|
|
with gr.Row():
|
|
with gr.Column():
|
|
model_name_input = gr.Dropdown(
|
|
label="Model Name",
|
|
choices=config['models'],
|
|
value=config['models'][0],
|
|
interactive=True
|
|
)
|
|
noise_rate_input = gr.Slider(
|
|
label="Noise Rate",
|
|
minimum=0,
|
|
maximum=1.0,
|
|
step=0.2,
|
|
value=config['noise_rate'],
|
|
interactive=True
|
|
)
|
|
num_queries_input = gr.Number(
|
|
label="Number of Queries",
|
|
value=config['num_queries'],
|
|
interactive=True
|
|
)
|
|
with gr.Column():
|
|
toggle = gr.Checkbox(
|
|
label="Use pre-calculated values?",
|
|
value=True,
|
|
info="If checked, the report(s) will use pre-calculated metrics from saved output files. If any report has N/A value, Click on respective report generation button to generate value based on configuration. Uncheck to recalculate the metrics again."
|
|
)
|
|
refresh_btn = gr.Button("Refresh", variant="primary", scale= 0)
|
|
|
|
with gr.Accordion("Evaluation Actions", open=False):
|
|
with gr.Row():
|
|
recalculate_noise_btn = gr.Button("Evaluate Noise Robustness")
|
|
recalculate_negative_btn = gr.Button("Evaluate Negative Rejection")
|
|
recalculate_counterfactual_btn = gr.Button("Evaluate Counterfactual Robustness")
|
|
recalculate_integration_btn = gr.Button("Evaluate Integration Information")
|
|
|
|
|
|
with gr.Accordion("Evaluation Results", open=True):
|
|
with gr.Row():
|
|
with gr.Column():
|
|
gr.Markdown("### π Noise Robustness\n**Description:** The experimental result of noise robustness measured by accuracy (%) under different noise ratios.")
|
|
noise_table = gr.Dataframe(value=load_scores_common(Noise_Robustness_DIR, config), interactive=False)
|
|
with gr.Column():
|
|
gr.Markdown("### π« Negative Rejection\n**Description:** This measures the model's ability to reject invalid or nonsensical queries.")
|
|
rejection_table = gr.Dataframe(value=load_negative_rejection_scores(config), interactive=False)
|
|
with gr.Row():
|
|
with gr.Column():
|
|
gr.Markdown("### π Counterfactual Robustness\n**Description:** Evaluates a model's ability to handle errors in external knowledge.")
|
|
counter_factual_table = gr.Dataframe(value=load_counterfactual_robustness_scores(config), interactive=False)
|
|
with gr.Column():
|
|
gr.Markdown("### π§ Information Integration\n**Description:** The experimental result of information integration measured by accuracy (%) under different noise ratios.")
|
|
integration_table = gr.Dataframe(value=load_scores_common(Infomration_Integration_DIR, config), interactive=False)
|
|
|
|
|
|
with gr.Accordion("View Live Logs", open=False):
|
|
log_section = gr.Textbox(label="Logs", interactive=False, lines=10, every=2)
|
|
|
|
|
|
toggle.change(toggle_switch, inputs=toggle)
|
|
app.queue()
|
|
app.load(update_logs_periodically, outputs=log_section)
|
|
|
|
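
        # `update_logs_periodically` is imported from scripts.helper; its
        # implementation is not shown here. Given the polling wire-up above,
        # it is assumed to return the current log text on each call — roughly
        # along these lines (a sketch; the log path is hypothetical):
        #
        #     def update_logs_periodically():
        #         with open("logs/app.log", encoding="utf-8") as f:
        #             return f.read()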

        def refresh_scores(model_name, noise_rate, num_queries):
            # Reload all four score tables for the currently selected settings.
            update_config(config, model_name, noise_rate, num_queries)
            return (
                load_scores_common(Noise_Robustness_DIR, config),
                load_negative_rejection_scores(config),
                load_counterfactual_robustness_scores(config),
                load_scores_common(Information_Integration_DIR, config),
            )

        refresh_btn.click(
            refresh_scores,
            inputs=[model_name_input, noise_rate_input, num_queries_input],
            outputs=[noise_table, rejection_table, counter_factual_table, integration_table],
        )

        def recalculate_noise_robustness(model_name, noise_rate, num_queries):
            # Each "Evaluate ..." button re-runs one metric for the selected
            # settings and refreshes only its own table.
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_noise_robustness(config)
            return load_scores_common(Noise_Robustness_DIR, config)

        recalculate_noise_btn.click(
            recalculate_noise_robustness,
            inputs=[model_name_input, noise_rate_input, num_queries_input],
            outputs=[noise_table],
        )

        def recalculate_counterfactual_robustness(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            # evaluate_factual_robustness implements the counterfactual-robustness run.
            evaluate_factual_robustness(config)
            return load_counterfactual_robustness_scores(config)

        recalculate_counterfactual_btn.click(
            recalculate_counterfactual_robustness,
            inputs=[model_name_input, noise_rate_input, num_queries_input],
            outputs=[counter_factual_table],
        )

        def recalculate_negative_rejection(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_negative_rejection(config)
            return load_negative_rejection_scores(config)

        recalculate_negative_btn.click(
            recalculate_negative_rejection,
            inputs=[model_name_input, noise_rate_input, num_queries_input],
            outputs=[rejection_table],
        )

        def recalculate_integration_info(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_information_integration(config)
            return load_scores_common(Information_Integration_DIR, config)

        recalculate_integration_btn.click(
            recalculate_integration_info,
            inputs=[model_name_input, noise_rate_input, num_queries_input],
            outputs=[integration_table],
        )

    app.launch()
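

if __name__ == "__main__":
    # Illustrative entry point (hypothetical): the real caller constructs
    # `config` elsewhere, e.g. from a config file. The model names below are
    # placeholders, not the project's actual model list.
    example_config = {
        "models": ["model-a", "model-b"],
        "noise_rate": 0.2,
        "num_queries": 10,
        "UsePreCalculatedValue": True,
    }
    launch_gradio_app(example_config)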