rzanoli committed on
Commit
9835969
·
1 Parent(s): 9cdb678

Place charts on the main page immediately before the leaderboard table

Browse files
Files changed (2) hide show
  1. app.py +169 -36
  2. src/display/css_html_js.py +1 -0
app.py CHANGED
@@ -160,11 +160,12 @@ def boxplot_per_task(dataframe=None, baselines=None):
160
  if task in baselines and baselines[task] is not None:
161
  fig.add_shape(
162
  type="line",
163
- x0=i-0.3, x1=i+0.3,
164
  y0=baselines[task], y1=baselines[task],
165
- line=dict(color="black", width=2, dash="dash"),
166
  xref="x", yref="y"
167
  )
 
168
  fig.add_annotation(
169
  x=i, y=baselines[task],
170
  text=f"{baselines[task]}%",
@@ -172,22 +173,23 @@ def boxplot_per_task(dataframe=None, baselines=None):
172
  yshift=10,
173
  font=dict(size=10, color="black")
174
  )
 
175
 
176
  fig.update_layout(
177
  title="Distribution of Model Accuracy by Task",
178
- #xaxis_title="Task",
179
  yaxis_title="Combined Performance",
180
  template="plotly_white",
181
  boxmode="group",
182
  dragmode=False,
183
- font=dict(family="Arial", size=13),
184
  margin=dict(b=140),
185
  )
186
 
187
  fig.add_annotation(
188
  text=(
189
- "In zero/few-shot settings, models are getting closer to the supervised EVALITA baselines <br>"
190
- "(black dashed line), except for NER and REL."
191
  ),
192
  xref="paper", yref="paper",
193
  x=0.5, y=-0.30,
@@ -211,6 +213,12 @@ def boxplot_prompts_per_task(dataframe, tasks=None):
211
  if tasks is None:
212
  tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
213
 
 
 
 
 
 
 
214
  fig = go.Figure()
215
 
216
  # Liste per creare una sola voce in legenda per Average e Best
@@ -264,12 +272,12 @@ def boxplot_prompts_per_task(dataframe, tasks=None):
264
  )
265
 
266
  fig.update_layout(
267
- title= "Average Prompt Accuracy vs Best Prompt Accuracy per Task",
268
- xaxis_title="",
269
  yaxis_title="Combined Performance",
270
  barmode='group',
271
  template="plotly_white",
272
- font=dict(family="Arial", size=13),
273
  yaxis=dict(range=[0, 100], fixedrange=True),
274
  )
275
 
@@ -286,29 +294,28 @@ def boxplot_prompts_per_task(dataframe, tasks=None):
286
 
287
  return fig
288
 
 
289
 
 
 
 
 
 
 
 
290
 
291
- def line_chart(dataframe):
292
  # Separiamo i dati in base a IS_FS
293
  df_true = dataframe[dataframe['IS_FS'] == True]
294
  df_false = dataframe[dataframe['IS_FS'] == False]
295
 
296
- # Estrai valori x, y e labels per True e False
297
  x_true = df_true['#Params (B)'].tolist()
298
  y_true = df_true['Avg. Comb. Perf. ⬆️'].tolist()
299
- labels_true = [
300
- #re.search(r'>([^<>/]+/[^<>]+)<', m).group(1).split('/')[-1]
301
- re.search(r'>([^<]+)<', m).group(1)
302
- for m in df_true['Model'].tolist()
303
- ]
304
 
305
  x_false = df_false['#Params (B)'].tolist()
306
  y_false = df_false['Avg. Comb. Perf. ⬆️'].tolist()
307
- labels_false = [
308
- #re.search(r'>([^<>/]+/[^<>]+)<', m).group(1).split('/')[-1]
309
- re.search(r'>([^<]+)<', m).group(1)
310
- for m in df_false['Model'].tolist()
311
- ]
312
 
313
  fig = go.Figure()
314
 
@@ -316,11 +323,14 @@ def line_chart(dataframe):
316
  fig.add_trace(go.Scatter(
317
  x=x_true,
318
  y=y_true,
319
- mode='markers', # solo marker, niente testo
320
  name='5-Shot',
321
- marker=dict(color='red', size=10),
 
 
 
322
  hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
323
- customdata=labels_true # tutte le informazioni sul hover
324
  ))
325
 
326
  # Punti IS_FS=False
@@ -329,7 +339,10 @@ def line_chart(dataframe):
329
  y=y_false,
330
  mode='markers',
331
  name='0-Shot',
332
- marker=dict(color='blue', size=10),
 
 
 
333
  hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
334
  customdata=labels_false
335
  ))
@@ -340,13 +353,18 @@ def line_chart(dataframe):
340
  yaxis_title="Avg. Combined Performance",
341
  template="plotly_white",
342
  hovermode="closest",
343
- dragmode=False
 
 
 
 
 
344
  )
345
 
346
- # Aggiungi la caption come annotazione separata
347
  fig.add_annotation(
348
- text="Models with more parameters generally perform better than smaller ones. However, few-shot learning <br>"
349
- "can sometimes enable smaller models to outperform larger models evaluated in zero-shot settings.",
350
  xref="paper", yref="paper",
351
  x=0, y=-0.3,
352
  showarrow=False,
@@ -354,15 +372,124 @@ def line_chart(dataframe):
354
  align="left"
355
  )
356
 
357
- # Disabilita lo zoom e altri controlli
358
  fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
359
  fig.update_yaxes(fixedrange=True)
360
- #fig.update_yaxes(range=[0, 100], fixedrange=True)
361
 
362
  return fig
363
 
364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
 
 
366
 
367
 
368
  # Define task metadata (icons, names, descriptions)
@@ -441,13 +568,11 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
441
  else:
442
  new_model_column.append(row["Model"])
443
 
444
-
445
  # Lista delle colonne da aggiornare
446
- cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
447
  # Applichiamo la trasformazione
448
- for col in cols_to_update:
449
- dataframe[col] = dataframe[col].replace({1: 7, 2: 8})
450
-
451
 
452
  # Aggiorna la colonna Model
453
  sorted_dataframe["Model"] = new_model_column
@@ -641,6 +766,12 @@ with demo:
641
  )
642
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
643
 
 
 
 
 
 
 
644
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
645
 
646
  # Main leaderboard tab
@@ -668,6 +799,7 @@ with demo:
668
  """
669
  )
670
 
 
671
  with gr.TabItem("📈 Charts"):
672
  #gr.Plot(value=line_chart(LEADERBOARD_DF), label="Andamento di esempio")
673
  #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo")
@@ -675,6 +807,7 @@ with demo:
675
  gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
676
  gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
677
  gr.Plot(value=barplot_mean_few_minus_zero_shot(LEADERBOARD_DF))
 
678
 
679
  # About tab
680
  with gr.TabItem("📝 About"):
 
160
  if task in baselines and baselines[task] is not None:
161
  fig.add_shape(
162
  type="line",
163
+ x0=i - 0.3, x1=i + 0.3,
164
  y0=baselines[task], y1=baselines[task],
165
+ line=dict(color="black", width=2, dash="dot"), # più visibile
166
  xref="x", yref="y"
167
  )
168
+ '''
169
  fig.add_annotation(
170
  x=i, y=baselines[task],
171
  text=f"{baselines[task]}%",
 
173
  yshift=10,
174
  font=dict(size=10, color="black")
175
  )
176
+ '''
177
 
178
  fig.update_layout(
179
  title="Distribution of Model Accuracy by Task",
180
+ xaxis_title="Task",
181
  yaxis_title="Combined Performance",
182
  template="plotly_white",
183
  boxmode="group",
184
  dragmode=False,
185
+ font=dict(family="Arial", size=10),
186
  margin=dict(b=140),
187
  )
188
 
189
  fig.add_annotation(
190
  text=(
191
+ " In tasks like TE and SA, zero/few-shot models reach accuracy close to supervised <br> "
192
+ "methods at EVALITA (dashed line); in NER and REL they remain much lower. "
193
  ),
194
  xref="paper", yref="paper",
195
  x=0.5, y=-0.30,
 
213
  if tasks is None:
214
  tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
215
 
216
+ # Lista delle colonne da aggiornare
217
+ cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
218
+ # Applichiamo la trasformazione
219
+ for col in cols_to_update:
220
+ dataframe[col] = dataframe[col].replace({1: 7, 2: 8})
221
+
222
  fig = go.Figure()
223
 
224
  # Liste per creare una sola voce in legenda per Average e Best
 
272
  )
273
 
274
  fig.update_layout(
275
+ title= "Prompt Accuracy: Avg vs Best",
276
+ xaxis_title="Task",
277
  yaxis_title="Combined Performance",
278
  barmode='group',
279
  template="plotly_white",
280
+ font=dict(family="Arial", size=10),
281
  yaxis=dict(range=[0, 100], fixedrange=True),
282
  )
283
 
 
294
 
295
  return fig
296
 
297
+ def line_chart2(dataframe):
298
 
299
+ # Normalizziamo le dimensioni per avere marker non troppo piccoli né enormi
300
+ def scale_sizes(values, min_size=8, max_size=30):
301
+ vmin, vmax = min(values), max(values)
302
+ return [
303
+ min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin else (min_size + max_size) / 2
304
+ for val in values
305
+ ]
306
 
 
307
  # Separiamo i dati in base a IS_FS
308
  df_true = dataframe[dataframe['IS_FS'] == True]
309
  df_false = dataframe[dataframe['IS_FS'] == False]
310
 
311
+ # Estrai valori x, y e labels
312
  x_true = df_true['#Params (B)'].tolist()
313
  y_true = df_true['Avg. Comb. Perf. ⬆️'].tolist()
314
+ labels_true = [re.search(r'>([^<]+)<', m).group(1) for m in df_true['Model'].tolist()]
 
 
 
 
315
 
316
  x_false = df_false['#Params (B)'].tolist()
317
  y_false = df_false['Avg. Comb. Perf. ⬆️'].tolist()
318
+ labels_false = [re.search(r'>([^<]+)<', m).group(1) for m in df_false['Model'].tolist()]
 
 
 
 
319
 
320
  fig = go.Figure()
321
 
 
323
  fig.add_trace(go.Scatter(
324
  x=x_true,
325
  y=y_true,
326
+ mode='markers',
327
  name='5-Shot',
328
+ marker=dict(
329
+ color='blue',
330
+ size=scale_sizes(x_true) # marker più grandi se #Params è grande
331
+ ),
332
  hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
333
+ customdata=labels_true
334
  ))
335
 
336
  # Punti IS_FS=False
 
339
  y=y_false,
340
  mode='markers',
341
  name='0-Shot',
342
+ marker=dict(
343
+ color='red',
344
+ size=scale_sizes(x_false)
345
+ ),
346
  hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
347
  customdata=labels_false
348
  ))
 
353
  yaxis_title="Avg. Combined Performance",
354
  template="plotly_white",
355
  hovermode="closest",
356
+ font=dict(family="Arial", size=10),
357
+ dragmode=False,
358
+ xaxis=dict(
359
+ tickvals=[0, 25, 50, 75, 100, 125], # valori che vuoi mostrare
360
+ ticktext=["0", "25", "50", "75", "100"]
361
+ )
362
  )
363
 
364
+ # Caption
365
  fig.add_annotation(
366
+ text="Accuracy generally rises with #Params, but smaller models with 5-shot <br> "
367
+ "can outperform larger zero-shot models.",
368
  xref="paper", yref="paper",
369
  x=0, y=-0.3,
370
  showarrow=False,
 
372
  align="left"
373
  )
374
 
 
375
  fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
376
  fig.update_yaxes(fixedrange=True)
 
377
 
378
  return fig
379
 
380
 
381
+ def line_chart(dataframe):
382
+
383
+ import re
384
+ import plotly.graph_objects as go
385
+
386
+ # Normalizziamo le dimensioni per avere marker non troppo piccoli né enormi
387
+ def scale_sizes(values, min_size=8, max_size=30):
388
+ vmin, vmax = min(values), max(values)
389
+ return [
390
+ min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin else (min_size + max_size) / 2
391
+ for val in values
392
+ ]
393
+
394
+ # Separiamo i dati in base a IS_FS
395
+ df_true = dataframe[dataframe['IS_FS'] == True]
396
+ df_false = dataframe[dataframe['IS_FS'] == False]
397
+
398
+ # Estrai valori x, y e labels
399
+ x_true = df_true['#Params (B)'].tolist()
400
+ y_true = df_true['Avg. Comb. Perf. ⬆️'].tolist()
401
+ labels_true = [re.search(r'>([^<]+)<', m).group(1) for m in df_true['Model'].tolist()]
402
+
403
+ x_false = df_false['#Params (B)'].tolist()
404
+ y_false = df_false['Avg. Comb. Perf. ⬆️'].tolist()
405
+ labels_false = [re.search(r'>([^<]+)<', m).group(1) for m in df_false['Model'].tolist()]
406
+
407
+ fig = go.Figure()
408
+
409
+ # Punti IS_FS=True
410
+ fig.add_trace(go.Scatter(
411
+ x=x_true,
412
+ y=y_true,
413
+ mode='markers',
414
+ name='5-Shot',
415
+ marker=dict(
416
+ color='blue',
417
+ size=scale_sizes(x_true)
418
+ ),
419
+ hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
420
+ customdata=labels_true
421
+ ))
422
+
423
+ # Punti IS_FS=False
424
+ fig.add_trace(go.Scatter(
425
+ x=x_false,
426
+ y=y_false,
427
+ mode='markers',
428
+ name='0-Shot',
429
+ marker=dict(
430
+ color='red',
431
+ size=scale_sizes(x_false)
432
+ ),
433
+ hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
434
+ customdata=labels_false
435
+ ))
436
+
437
+ # Trova il massimo tra tutti i modelli
438
+ all_y = y_true + y_false
439
+ all_x = x_true + x_false
440
+ all_labels = labels_true + labels_false
441
+ max_idx = all_y.index(max(all_y))
442
+ max_x = all_x[max_idx]
443
+ max_y = all_y[max_idx]
444
+ max_label = all_labels[max_idx]
445
+
446
+ # Aggiungi annotazione visibile per il modello migliore
447
+ fig.add_annotation(
448
+ x=max_x,
449
+ y=max_y,
450
+ #text=f"Top: {max_label} ({max_y:.1f}%)",
451
+ text=f"{max_label}",
452
+ showarrow=True,
453
+ arrowhead=2,
454
+ arrowsize=1,
455
+ arrowwidth=2,
456
+ arrowcolor="black",
457
+ font=dict(size=11, color="black"),
458
+ xshift=10,
459
+ yshift=10,
460
+ ax = -30, ay = -20, # sposta la label a sinistra e sopra il punto
461
+ xanchor = "right" # allinea la label a destra rispetto al punto
462
+ )
463
+
464
+ fig.update_layout(
465
+ title="Avg. Combined Performance vs #Params",
466
+ xaxis_title="#Params (B)",
467
+ yaxis_title="Avg. Combined Performance",
468
+ template="plotly_white",
469
+ hovermode="closest",
470
+ font=dict(family="Arial", size=10),
471
+ dragmode=False,
472
+ xaxis=dict(
473
+ tickvals=[0, 25, 50, 75, 100, 125],
474
+ ticktext=["0", "25", "50", "75", "100"]
475
+ )
476
+ )
477
+
478
+ # Caption
479
+ fig.add_annotation(
480
+ text="Accuracy generally rises with #Params, but smaller models with 5-shot <br>"
481
+ "can outperform larger zero-shot models.",
482
+ xref="paper", yref="paper",
483
+ x=0, y=-0.3,
484
+ showarrow=False,
485
+ font=dict(size=11, color="gray"),
486
+ align="left"
487
+ )
488
+
489
+ fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
490
+ fig.update_yaxes(fixedrange=True)
491
 
492
+ return fig
493
 
494
 
495
  # Define task metadata (icons, names, descriptions)
 
568
  else:
569
  new_model_column.append(row["Model"])
570
 
 
571
  # Lista delle colonne da aggiornare
572
+ #cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
573
  # Applichiamo la trasformazione
574
+ #for col in cols_to_update:
575
+ # dataframe[col] = dataframe[col].replace({1: 7, 2: 8})
 
576
 
577
  # Aggiorna la colonna Model
578
  sorted_dataframe["Model"] = new_model_column
 
766
  )
767
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
768
 
769
+ # ⬇️ QUI aggiungiamo i grafici subito sotto la barra del titolo e sopra le tabs
770
+ with gr.Row():
771
+ gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart")
772
+ gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES), elem_id="boxplot-task")
773
+ gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF), elem_id="boxplot-prompt-task")
774
+
775
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
776
 
777
  # Main leaderboard tab
 
799
  """
800
  )
801
 
802
+ '''
803
  with gr.TabItem("📈 Charts"):
804
  #gr.Plot(value=line_chart(LEADERBOARD_DF), label="Andamento di esempio")
805
  #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo")
 
807
  gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
808
  gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
809
  gr.Plot(value=barplot_mean_few_minus_zero_shot(LEADERBOARD_DF))
810
+ '''
811
 
812
  # About tab
813
  with gr.TabItem("📝 About"):
src/display/css_html_js.py CHANGED
@@ -104,3 +104,4 @@ get_window_url_params = """
104
  return url_params;
105
  }
106
  """
 
 
104
  return url_params;
105
  }
106
  """
107
+