# (removed non-Python extraction residue: file-size / commit-hash / line-number header)
import gradio as gr
from scripts.get_scores import load_counterfactual_robustness_scores, load_negative_rejection_scores, load_scores_common
from scripts.evaluate_information_integration import evaluate_information_integration
from scripts.evaluate_negative_rejection import evaluate_negative_rejection
from scripts.helper import initialize_logging, update_config, update_logs_periodically
from scripts.evaluate_noise_robustness import evaluate_noise_robustness
from scripts.evaluate_factual_robustness import evaluate_factual_robustness
# Directories holding pre-computed score files for each evaluation metric.
# The trailing slash is significant: these are concatenated as path prefixes
# by the score loaders.
Noise_Robustness_DIR = "results/Noise Robustness/"
Negative_Rejection_DIR = "results/Negative Rejection/"
Counterfactual_Robustness_DIR = "results/Counterfactual Robustness/"
# NOTE: "Infomration" is a long-standing typo kept for backward compatibility
# (this module-level name may be imported elsewhere). Prefer the correctly
# spelled alias below in new code.
Infomration_Integration_DIR = "results/Information Integration/"
Information_Integration_DIR = Infomration_Integration_DIR  # corrected-spelling alias
# Gradio UI
def launch_gradio_app(config):
    """Build and launch the RAG-evaluation Gradio dashboard.

    Wires up model-settings controls, four metric-recalculation buttons,
    four result tables, and a live log viewer. Blocks in ``app.launch()``
    until the server is stopped.

    Args:
        config: Mutable configuration dict. Must contain at least
            ``'models'`` (non-empty list), ``'noise_rate'``,
            ``'num_queries'``; this function also writes
            ``'UsePreCalculatedValue'`` and whatever ``update_config``
            stores. The dict is mutated in place so evaluators and score
            loaders observe the latest UI selections.
    """
    initialize_logging()

    def toggle_switch(value):
        # Persist the checkbox state so score loaders know whether to use
        # cached results or recompute.
        config['UsePreCalculatedValue'] = value

    def _make_recalc_handler(evaluate_fn, load_fn):
        # Factory for the four "Evaluate ..." button callbacks, which all
        # share the same shape: sync the config from the UI controls, run
        # the evaluator, then reload the metric's score table.
        def handler(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_fn(config)
            return load_fn(config)
        return handler

    with gr.Blocks() as app:
        app.title = "RAG System Evaluation"
        gr.Markdown("# RAG System Evaluation on RGB Dataset")
        # Intro / usage description.
        gr.Markdown("""
Welcome to the **RAG System Evaluation on RGB Dataset**! This tool is designed to evaluate and compare the performance of various **Large Language Models (LLMs)** using Retrieval-Augmented Generation (RAG) on the [**RGB dataset**](https://github.com/chen700564/RGB). The evaluation focuses on key metrics such as **Noise Robustness**, **Negative Rejection**, **Counterfactual Robustness**, and **Information Integration**. These metrics help assess how well different models handle noisy inputs, reject invalid queries, manage counterfactual scenarios, and integrate information effectively.
#### Key Features:
- **Compare Multiple LLMs**: Evaluate and compare the performance of different LLMs side by side.
- **Pre-calculated Metrics**: View results from pre-computed evaluations for quick insights.
- **Recalculate Metrics**: Option to recalculate metrics for custom configurations.
- **Interactive Controls**: Adjust model parameters, noise rates, and query counts to explore model behavior under different conditions.
- **Detailed Reports**: Visualize results in clear, interactive tables for each evaluation metric.
#### How to Use:
1. **Select a Model**: Choose from the available LLMs to evaluate.
2. **Configure Model Settings**: Adjust the noise rate and set the number of queries.
3. **Choose Evaluation Mode**: Use pre-calculated values for quick results or recalculate metrics for custom analysis.
4. **Compare Results**: Review and compare the evaluation metrics across different models in the tables below.
5. **Logs**: View live logs to monitor what's happening behind the scenes in real-time.
""")
        # Top Section - Inputs and Controls
        with gr.Accordion("Model Settings", open=False):
            with gr.Row():
                with gr.Column():
                    model_name_input = gr.Dropdown(
                        label="Model Name",
                        choices=config['models'],
                        value=config['models'][0],
                        interactive=True
                    )
                    noise_rate_input = gr.Slider(
                        label="Noise Rate",
                        minimum=0,
                        maximum=1.0,
                        step=0.2,
                        value=config['noise_rate'],
                        interactive=True
                    )
                    num_queries_input = gr.Number(
                        label="Number of Queries",
                        value=config['num_queries'],
                        interactive=True
                    )
                with gr.Column():
                    toggle = gr.Checkbox(
                        label="Use pre-calculated values?",
                        value=True,
                        info="If checked, the report(s) will use pre-calculated metrics from saved output files. If any report has N/A value, Click on respective report generation button to generate value based on configuration. Uncheck to recalculate the metrics again."
                    )
                    refresh_btn = gr.Button("Refresh", variant="primary", scale=0)
        # Next Section - Action Buttons
        with gr.Accordion("Evaluation Actions", open=False):
            with gr.Row():
                recalculate_noise_btn = gr.Button("Evaluate Noise Robustness")
                recalculate_negative_btn = gr.Button("Evaluate Negative Rejection")
                recalculate_counterfactual_btn = gr.Button("Evaluate Counterfactual Robustness")
                recalculate_integration_btn = gr.Button("Evaluate Integration Information")
        # Middle Section - Data Tables
        with gr.Accordion("Evaluation Results", open=True):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### π Noise Robustness\n**Description:** The experimental result of noise robustness measured by accuracy (%) under different noise ratios.")
                    noise_table = gr.Dataframe(value=load_scores_common(Noise_Robustness_DIR, config), interactive=False)
                with gr.Column():
                    gr.Markdown("### π« Negative Rejection\n**Description:** This measures the model's ability to reject invalid or nonsensical queries.")
                    rejection_table = gr.Dataframe(value=load_negative_rejection_scores(config), interactive=False)
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### π Counterfactual Robustness\n**Description:** Evaluates a model's ability to handle errors in external knowledge.")
                    counter_factual_table = gr.Dataframe(value=load_counterfactual_robustness_scores(config), interactive=False)
                with gr.Column():
                    gr.Markdown("### π§ Information Integration\n**Description:** The experimental result of information integration measured by accuracy (%) under different noise ratios.")
                    integration_table = gr.Dataframe(value=load_scores_common(Infomration_Integration_DIR, config), interactive=False)
        # Logs Section
        with gr.Accordion("View Live Logs", open=False):
            # NOTE(review): `every=2` on a Textbox only has effect when `value`
            # is callable; the live refresh here actually comes from
            # `app.load(update_logs_periodically, ...)` below — confirm whether
            # `every` is needed at all.
            log_section = gr.Textbox(label="Logs", interactive=False, lines=10, every=2)

        # Event Handling
        toggle.change(toggle_switch, inputs=toggle)
        app.queue()
        # Stream log updates into the textbox for as long as the page is open.
        app.load(update_logs_periodically, outputs=log_section)

        # Refresh Scores Function: re-reads all four tables without re-running
        # any evaluator.
        def refresh_scores(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            return load_scores_common(Noise_Robustness_DIR, config), load_negative_rejection_scores(config), load_counterfactual_robustness_scores(config), load_scores_common(Infomration_Integration_DIR, config)

        recalc_inputs = [model_name_input, noise_rate_input, num_queries_input]
        refresh_btn.click(refresh_scores, inputs=recalc_inputs, outputs=[noise_table, rejection_table, counter_factual_table, integration_table])

        # Button wiring: each "Evaluate ..." button runs its evaluator and
        # refreshes only its own table. The handlers were identical except for
        # the evaluator and loader, so they are generated from one factory.
        for button, evaluate_fn, load_fn, table in (
            (recalculate_noise_btn, evaluate_noise_robustness,
             lambda cfg: load_scores_common(Noise_Robustness_DIR, cfg), noise_table),
            (recalculate_negative_btn, evaluate_negative_rejection,
             load_negative_rejection_scores, rejection_table),
            (recalculate_counterfactual_btn, evaluate_factual_robustness,
             load_counterfactual_robustness_scores, counter_factual_table),
            (recalculate_integration_btn, evaluate_information_integration,
             lambda cfg: load_scores_common(Infomration_Integration_DIR, cfg), integration_table),
        ):
            button.click(_make_recalc_handler(evaluate_fn, load_fn), inputs=recalc_inputs, outputs=[table])

    app.launch()