	Update app.py
app.py CHANGED
@@ -14,13 +14,31 @@ from src.bin.PROBE import run_probe
 
     global data_component, filter_component
 
+
+
+def benchmark_plot(benchmark_type, methods_selected, x_metric, y_metric):
+    if benchmark_type == 'Flexible':
+        # Use general visualizer logic
+        return general_visualizer_plot(methods_selected, x_metric=x_metric, y_metric=y_metric)
+    elif benchmark_type == 'Benchmark 1':
+        return benchmark_1_plot(x_metric, y_metric)
+    elif benchmark_type == 'Benchmark 2':
+        return benchmark_2_plot(x_metric, y_metric)
+    elif benchmark_type == 'Benchmark 3':
+        return benchmark_3_plot(x_metric, y_metric)
+    elif benchmark_type == 'Benchmark 4':
+        return benchmark_4_plot(x_metric, y_metric)
+    else:
+        return "Invalid benchmark type selected."
+
+
 def get_baseline_df(selected_methods, selected_metrics):
     df = pd.read_csv(CSV_RESULT_PATH)
     present_columns = ["method_name"] + selected_metrics
     df = df[df['method_name'].isin(selected_methods)][present_columns]
     return df
 
-def …
+def general_visualizer(methods_selected, x_metric, y_metric):
     df = pd.read_csv(CSV_RESULT_PATH)
     filtered_df = df[df['method_name'].isin(methods_selected)]
 
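The added benchmark_plot function dispatches on the benchmark label chosen in the UI and delegates to the matching plotting helper (general_visualizer_plot, benchmark_1_plot, and so on, which are assumed to be defined or imported elsewhere in app.py). For illustration only, the same dispatch can be sketched table-driven; the callables below are hypothetical placeholders, not the app's real plotting functions:

    # Sketch only: table-driven equivalent of the if/elif dispatch above.
    # The lambdas stand in for the real benchmark_*_plot functions.
    PLOTTERS = {
        'Benchmark 1': lambda x, y: f"benchmark 1: {y} vs {x}",
        'Benchmark 2': lambda x, y: f"benchmark 2: {y} vs {x}",
    }

    def benchmark_plot_sketch(benchmark_type, methods_selected, x_metric, y_metric):
        if benchmark_type == 'Flexible':
            # Only the flexible path uses the selected methods.
            return f"flexible: {y_metric} vs {x_metric} for {methods_selected}"
        plotter = PLOTTERS.get(benchmark_type)
        return plotter(x_metric, y_metric) if plotter else "Invalid benchmark type selected."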
@@ -73,7 +91,7 @@ with block:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # table jmmmu bench
-        with gr.TabItem("🏅 PROBE …
+        with gr.TabItem("🏅 PROBE Leaderboard", elem_id="probe-benchmark-tab-table", id=1):
 
             method_names = pd.read_csv(CSV_RESULT_PATH)['method_name'].unique().tolist()
             metric_names = pd.read_csv(CSV_RESULT_PATH).columns.tolist()
@@ -116,23 +134,45 @@ with block:
                 outputs=data_component
             )
 
-
-
+        with gr.TabItem("Visualizer"):
+
+            # Dropdown for benchmark type
+            benchmark_types = TASK_INFO + ['flexible']
+            benchmark_type_selector = gr.Dropdown(choices=benchmark_types, label="Select Benchmark Type for Visualization", value="flexible")
+
+            # Dynamic metric selectors (will be updated based on benchmark type)
+            x_metric_selector = gr.Dropdown(choices=[], label="Select X-axis Metric")
+            y_metric_selector = gr.Dropdown(choices=[], label="Select Y-axis Metric")
+            method_selector = gr.CheckboxGroup(choices=method_names, label="Select methods to visualize", interactive=True, value=method_names)
+
+            # Button to draw the plot for the selected benchmark
+            plot_button = gr.Button("Plot Visualization")
+            plot_output = gr.Image(label="Plot")
 
-            [removed lines not captured in this view]
-        plot_button.click(create_plot, inputs=[method_selector, x_metric_selector, y_metric_selector], outputs=output_plot)
+            # Update metric selectors when benchmark type is chosen
+            def update_metric_choices(benchmark_type):
+                if benchmark_type == 'flexible':
+                    # Show all metrics for the flexible visualizer
+                    metric_names = df.columns.tolist()
+                    return gr.update(choices=metric_names, value=metric_names[0]), gr.update(choices=metric_names, value=metric_names[1])
+                elif benchmark_type in benchmark_specific_metrics:
+                    metrics = benchmark_specific_metrics[benchmark_type]
+                    return gr.update(choices=metrics, value=metrics[0]), gr.update(choices=metrics, value=metrics[1])
+                return gr.update(choices=[]), gr.update(choices=[])
 
-
+            benchmark_type_selector.change(
+                update_metric_choices,
+                inputs=[benchmark_type_selector],
+                outputs=[x_metric_selector, y_metric_selector]
+            )
+
+            # Generate the plot based on user input
+            plot_button.click(
+                benchmark_plot,
+                inputs=[benchmark_type_selector, method_selector, x_metric_selector, y_metric_selector],
+                outputs=plot_output
+            )
+
         with gr.TabItem("📝 About", elem_id="probe-benchmark-tab-table", id=2):
            with gr.Row():
                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
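The new Visualizer tab follows a common Gradio pattern: a dropdown whose .change event repopulates two other dropdowns via gr.update, and a button whose .click event renders the plot. A self-contained sketch of that wiring, using placeholder names (METRICS_BY_TYPE, make_plot) rather than the app's real data and plotting code, looks roughly like this:

    import gradio as gr

    # Hypothetical metric table standing in for benchmark_specific_metrics.
    METRICS_BY_TYPE = {
        "Benchmark 1": ["precision", "recall", "f1"],
        "Benchmark 2": ["auc", "mcc"],
    }

    def update_metrics(benchmark_type):
        # One gr.update per output dropdown, mirroring update_metric_choices.
        metrics = METRICS_BY_TYPE.get(benchmark_type, [])
        first = metrics[0] if metrics else None
        second = metrics[1] if len(metrics) > 1 else None
        return gr.update(choices=metrics, value=first), gr.update(choices=metrics, value=second)

    def make_plot(benchmark_type, x_metric, y_metric):
        # Placeholder for the real plotting call; returns text instead of an image.
        return f"{benchmark_type}: {y_metric} vs {x_metric}"

    with gr.Blocks() as demo:
        type_dd = gr.Dropdown(choices=list(METRICS_BY_TYPE), label="Benchmark type")
        x_dd = gr.Dropdown(choices=[], label="X-axis metric")
        y_dd = gr.Dropdown(choices=[], label="Y-axis metric")
        out = gr.Textbox(label="Plot (placeholder)")
        type_dd.change(update_metrics, inputs=[type_dd], outputs=[x_dd, y_dd])
        gr.Button("Plot").click(make_plot, inputs=[type_dd, x_dd, y_dd], outputs=out)

    # demo.launch()  # uncomment to run the sketch locally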